In [1]:
# Import necessary libraries
import pandas as pd  # Import pandas library for data manipulation
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting the dataset
from sklearn.preprocessing import StandardScaler  # Import StandardScaler for feature scaling
from imblearn.over_sampling import SMOTE  # Import SMOTE for dataset balancing
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for logistic regression model
from xgboost import XGBClassifier  # Import XGBClassifier for XGBoost model
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score, f1_score, roc_auc_score  # Import evaluation metrics


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the dataset with headers from the first row
data = pd.read_csv('/content/dataset of Water.txt', header=0)

In [None]:
data.head()

Unnamed: 0,Pipe ID,Material,Diameter,Length,Age,QCE,BP,RC,PE,Failure
0,1001,PE,400,600,8,1,0,0,0,0
1,1002,PCCP,1000,1500,20,4,1,3,1,1
2,1003,PE,500,700,12,2,0,1,0,0
3,1004,PCCP,900,1400,18,3,1,2,1,1
4,1005,PE,300,500,6,1,0,0,0,0


In [None]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pipe ID,366.0,1183.5,105.799338,1001.0,1092.25,1183.5,1274.75,1366.0
Diameter,366.0,740.983607,255.730198,300.0,500.0,750.0,950.0,1200.0
Length,366.0,1090.983607,389.855069,500.0,700.0,1075.0,1450.0,1700.0
Age,366.0,14.598361,5.280089,6.0,10.0,15.0,19.0,24.0
QCE,366.0,3.131148,1.590817,1.0,2.0,3.0,4.0,6.0
BP,366.0,0.5,0.500684,0.0,0.0,0.5,1.0,1.0
RC,366.0,1.745902,1.627982,0.0,0.0,1.5,3.0,5.0
PE,366.0,0.5,0.500684,0.0,0.0,0.5,1.0,1.0
Failure,366.0,0.5,0.500684,0.0,0.0,0.5,1.0,1.0


In [None]:
# Extract the feature names from the dataset
csv_feature_names = data.columns.tolist()  # Convert the column names to a list

In [None]:
csv_feature_names

['Pipe ID',
 'Material',
 'Diameter',
 'Length',
 'Age',
 'QCE',
 'BP',
 'RC',
 'PE',
 'Failure']

In [None]:
# Now you can proceed with the rest of your code using the features

# Data Preprocessing
X = data.drop(columns=['Failure'])  # Features: Drop the target variable 'Failure'
y = data['Failure']  # Target variable: 'Failure'


In [None]:
X.head()

Unnamed: 0,Pipe ID,Material,Diameter,Length,Age,QCE,BP,RC,PE
0,1001,PE,400,600,8,1,0,0,0
1,1002,PCCP,1000,1500,20,4,1,3,1
2,1003,PE,500,700,12,2,0,1,0
3,1004,PCCP,900,1400,18,3,1,2,1
4,1005,PE,300,500,6,1,0,0,0


In [None]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: Failure, dtype: int64

In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% testing

In [None]:
# Identify non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns  # Identify non-numeric columns

In [None]:
# Drop non-numeric columns before feature scaling
X_train_numeric = X_train.drop(columns=non_numeric_columns)  # Exclude non-numeric columns from training data
X_test_numeric = X_test.drop(columns=non_numeric_columns)  # Exclude non-numeric columns from test data

In [None]:
# Feature Scaling
scaler = StandardScaler()  # Initialize StandardScaler object
X_train_scaled = scaler.fit_transform(X_train_numeric)  # Fit and transform training data
X_test_scaled = scaler.transform(X_test_numeric)  # Transform test data using fitted scaler

In [None]:
# Dataset Balancing using SMOTE
smote = SMOTE(random_state=42)  # Initialize SMOTE object with a random state for reproducibility
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)  # Apply SMOTE to balance the training set

In [None]:
# Model Training and Evaluation
# Logistic Regression
logit_model = LogisticRegression()  # Initialize logistic regression model
logit_model.fit(X_train_balanced, y_train_balanced)  # Fit logistic regression model to balanced training data
logit_predictions = logit_model.predict(X_test_scaled)  # Make predictions on the test set using the logistic regression model

In [None]:
# XGBoost
xgb_model = XGBClassifier()  # Initialize XGBoost classifier
xgb_model.fit(X_train_balanced, y_train_balanced)  # Fit XGBoost model to balanced training data
xgb_predictions = xgb_model.predict(X_test_scaled)  # Make predictions on the test set using the XGBoost model

In [None]:
# Evaluation Metrics
# Calculate evaluation metrics for logistic regression predictions
logit_mcc = matthews_corrcoef(y_test, logit_predictions)
logit_precision = precision_score(y_test, logit_predictions)
logit_recall = recall_score(y_test, logit_predictions)
logit_accuracy = accuracy_score(y_test, logit_predictions)
logit_f1 = f1_score(y_test, logit_predictions)
logit_auc = roc_auc_score(y_test, logit_predictions)

In [None]:
# Calculate evaluation metrics for XGBoost predictions
xgb_mcc = matthews_corrcoef(y_test, xgb_predictions)
xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_f1 = f1_score(y_test, xgb_predictions)
xgb_auc = roc_auc_score(y_test, xgb_predictions)

In [None]:
# Print evaluation metrics for logistic regression
print("Logistic Regression Metrics:")
print("MCC:", logit_mcc)
print("Precision:", logit_precision)
print("Recall:", logit_recall)
print("Accuracy:", logit_accuracy)
print("F1 Score:", logit_f1)
print("AUC:", logit_auc)

Logistic Regression Metrics:
MCC: 1.0
Precision: 1.0
Recall: 1.0
Accuracy: 1.0
F1 Score: 1.0
AUC: 1.0


In [None]:
# Print evaluation metrics for XGBoost
print("\nXGBoost Metrics:")
print("MCC:", xgb_mcc)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)
print("Accuracy:", xgb_accuracy)
print("F1 Score:", xgb_f1)
print("AUC:", xgb_auc)


XGBoost Metrics:
MCC: 1.0
Precision: 1.0
Recall: 1.0
Accuracy: 1.0
F1 Score: 1.0
AUC: 1.0
