In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
import miceforest as mf
from imblearn.combine import SMOTETomek
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Load the training set from the remote CSV; cells holding the literal string "na" are parsed as missing (NaN).
df = pd.read_csv(r'https://raw.githubusercontent.com/avnyadav/sensor-fault-detection/main/aps_failure_training_set1.csv', na_values="na")

In [None]:
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df.shape

In [None]:
# Number of rows per label in the 'class' (target) column.
df['class'].value_counts()

In [None]:
# Partition the columns by dtype: object-typed columns are treated as categorical,
# every other column as numeric.
numeric_features = []
categorical_features = []
for column in df.columns:
    if df[column].dtype == 'O':
        categorical_features.append(column)
    else:
        numeric_features.append(column)

print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))

print('We have {} categorical features : {}'.format(len(categorical_features),categorical_features))

In [None]:
# Percentage of missing values per column, sorted from most to least missing.
missing = (
    df.isna()
    .sum()
    .div(df.shape[0])
    .mul(100)
    .to_frame()
    .sort_values(by=0, ascending=False)
)

# One bar per column; the x tick labels are hidden.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(missing.index, missing[0].to_numpy())
ax.set_xticks([])
ax.set_ylabel("Percentage Missing")
plt.show()

In [None]:
# Select the columns whose missing-value percentage exceeds 70; these get dropped next.
dropCols = missing.loc[missing[0].gt(70)]
dropCols

In [None]:
# Remove the columns listed in dropCols. errors="ignore" keeps this cell re-runnable:
# on a second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=list(dropCols.index), errors="ignore")

In [None]:
# (rows, columns) after the drop above, to confirm how many columns remain.
df.shape

In [None]:
# Overall share of missing cells in the remaining columns.
missing_values_count = df.isnull().sum()
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()

print("Percentage of total missing cells in the data {}%".format((total_missing/total_cells)*100))

In [None]:
# Class balance: count each label, print the totals, and draw them as a count plot.
pos = int((df['class'] == 'pos').sum())
neg = int((df['class'] == 'neg').sum())
print("Postitve: " + str(pos) + ", Negative: " + str(neg))
sns.catplot(x="class", data=df, kind="count", alpha=0.6)
plt.show()

### The classes are imbalanced, as the plot above shows; SMOTE-based resampling will be used to oversample the minority class

In [None]:
def evaluate_scores(true,predicted):
  """Compute the standard classification metrics for one set of predictions.

  Args:
    true: ground-truth labels.
    predicted: predicted labels; the ROC AUC is computed from these labels too.

  Returns:
    Tuple ``(accuracy, f1, precision, recall, roc_auc)``.
  """
  return (
    accuracy_score(true, predicted),
    f1_score(true, predicted),
    precision_score(true, predicted),
    recall_score(true, predicted),
    roc_auc_score(true, predicted),
  )

In [None]:
def total_cost(y_true, y_pred, fp_cost=10, fn_cost=500, labels=None):
  """Return the misclassification cost of a set of binary predictions.

  The cost is ``fp_cost * false_positives + fn_cost * false_negatives``.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels.
    fp_cost: cost charged per false positive. The default of 10 is the value
      that was previously hard-coded (presumably the challenge's cost metric
      -- TODO confirm).
    fn_cost: cost charged per false negative. The default of 500 is the value
      that was previously hard-coded.
    labels: optional ``[negative, positive]`` label pair forwarded to
      ``confusion_matrix``. Pass it when one of the classes may be absent from
      both ``y_true`` and ``y_pred``; otherwise the matrix is not 2x2.

  Returns:
    The total cost.

  Raises:
    ValueError: if the confusion matrix is not 2x2 (not a binary problem, or
      only one class is present and ``labels`` was not given).
  """
  cm = confusion_matrix(y_true, y_pred, labels=labels)
  if cm.shape != (2, 2):
    raise ValueError(
      "total_cost expects a binary problem with a 2x2 confusion matrix, got shape "
      f"{cm.shape}; pass labels=[negative, positive] if a class may be absent."
    )
  # Row-major order of a binary confusion matrix: tn, fp, fn, tp.
  _, fp, fn, _ = cm.ravel()
  return fp_cost*fp + fn_cost*fn

In [None]:
def _print_scores(title, accuracy, f1, precision, recall, rocauc, cost):
  """Print one block of metrics (training or test) in the report's text format."""
  print(title)
  print('- Accuracy: {:.4f}'.format(accuracy))
  print('- F1 score: {:.4f}'.format(f1))
  print('- Precision: {:.4f}'.format(precision))
  print('- Recall: {:.4f}'.format(recall))
  print('- Roc Auc Score: {:.4f}'.format(rocauc))
  print(f'- COST: {cost}.')


def evaluate_models(X, y, models, test_size=0.2, random_state=42):
  """Fit every model on a train split and report its cost on the test split.

  The data is split once with ``train_test_split``. Each model is fitted in
  place, its train and test metrics are printed, and its test cost is collected.

  Note: the split is applied to whatever ``X`` and ``y`` are passed in. If the
  caller has already imputed, scaled or resampled the full dataset, the test
  rows were seen during that preprocessing and the test scores are optimistic.

  Args:
    X: feature matrix.
    y: target labels.
    models: mapping of display name -> unfitted estimator exposing
      ``fit`` and ``predict``.
    test_size: fraction of rows held out for testing (default 0.2).
    random_state: seed for the split (default 42).

  Returns:
    DataFrame with columns ``'Model Name'`` and ``'Cost'`` (test cost),
    sorted by ascending cost.
  """
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
  )

  results = []  # (model name, test cost) pairs, in the order the models were given

  for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_scores = evaluate_scores(y_train, y_train_pred)
    train_cost = total_cost(y_train, y_train_pred)

    test_scores = evaluate_scores(y_test, y_test_pred)
    test_cost = total_cost(y_test, y_test_pred)

    print(name)
    _print_scores('Model performance for Training set', *train_scores, train_cost)
    _print_scores('Model performance for Test set', *test_scores, test_cost)
    print('='*35)

    results.append((name, test_cost))

  report = pd.DataFrame(results, columns=['Model Name', 'Cost']).sort_values(by=["Cost"])
  return report
    

In [None]:
## Optional: plot the distribution of every numeric feature, one subplot per column.
## Per the original note this draws about 170 plots and takes at least 5 minutes,
## so the plotting code below is left commented out.
## sns.distplot is deprecated; switch to sns.histplot / sns.displot before re-enabling it.

numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']

# plt.figure(figsize=(15,100))
# for i , col in enumerate(numeric_features):
#   plt.subplot(60,3,i+1)
#   sns.distplot(x=df[col], color = 'indianred')
#   plt.xlabel(col, weight='bold')
#   plt.tight_layout()

In [None]:
# Separate the target column from the feature columns.
y = df['class']
X = df.drop(columns=['class'])

In [None]:
# Encode the target as integers (pos -> 1, neg -> 0). Deriving it from df['class']
# rather than from y keeps the cell idempotent, and map + astype avoids relying on
# the implicit downcasting of Series.replace (deprecated in pandas 2.2).
# Any unexpected label becomes NaN and makes astype fail loudly.
y = df['class'].map({'pos': 1, 'neg': 0}).astype('int64')

## Strategy 1: KNN imputation

In [None]:
# Scale the raw features with RobustScaler as input for the KNN K-selection experiment.
# NOTE(review): in the visible notebook X1 is only referenced by the commented-out
# K-selection cell, so this fit may be unnecessary for the strategies below -- confirm before removing.

robust_scaler = RobustScaler()
X1 = robust_scaler.fit_transform(X)

This step fills in the missing values based on the values of neighbouring samples.
The code takes a long time to run, but it is necessary for fixing the missing values.
Run it only once: it can take hours even on a good machine.

In [None]:
# results = []

# imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
# strategies = [str(i) for i in [1,3,5,7]]
# for s in strategies:
#   pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', LogisticRegression())])
#   scores = cross_val_score(pipeline, X1, y, scoring='accuracy', cv=2, n_jobs=1)
#   results.append(scores)
#   print("n_neighbours: {} || accuracy_score: {:.4f}".format(int(s), mean(scores)))

In [None]:
# KNN imputation followed by robust scaling.
# n_neighbors=3 is the neighbour count that gave the maximum accuracy in the K-selection run.
num_features = X.select_dtypes(exclude="object").columns

knn_pipeline = Pipeline([
  ('imputer', KNNImputer(n_neighbors=3)),
  ('robustScaler', RobustScaler()),
])

In [None]:
# Impute and scale the full feature matrix with the KNN pipeline.
# NOTE(review): the pipeline is fitted on all rows, before evaluate_models makes its
# train/test split, so the held-out rows influence the imputer and scaler -- confirm this is intended.
X_knn = knn_pipeline.fit_transform(X)

#### SMOTETomek is used to rebalance the data: it creates synthetic samples for the minority class and removes overlapping data points (including excess majority-class samples)

In [None]:
# Rebalance the KNN-imputed data with SMOTETomek, resampling the minority class.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
    n_jobs=1,
)
X_res, y_res = smt.fit_resample(X_knn, y)

#### Define the candidate models (mostly default settings) to compare

In [None]:
# Candidate classifiers keyed by display name. The tree and boosting estimators are
# seeded with 42 (the seed already used for the split and the resampler) so that
# repeated runs produce the same comparison.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(random_state=42),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False, random_seed=42),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
}

In [None]:
# Fit and score every model on the KNN-imputed, resampled data; keeps the per-model test cost.
report_knn = evaluate_models(X_res, y_res, models)

In [None]:
# Test cost per model for the KNN-imputation strategy, lowest cost first.
report_knn

### Strategy 2: median imputation

In [None]:
# Median imputation followed by robust scaling.
num_features = X.select_dtypes(exclude="object").columns
num_features

median_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='median')),
  ('robustScaler', RobustScaler()),
])

In [None]:
# Impute (median) and scale the full feature matrix; fitted on all rows, before any train/test split.
X_median = median_pipeline.fit_transform(X)

In [None]:
# Rebalance the median-imputed data with SMOTETomek, resampling the minority class.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
)
X_res, y_res = smt.fit_resample(X_median, y)

In [None]:
# Fit and score every model on the median-imputed, resampled data; keeps the per-model test cost.
report_median = evaluate_models(X_res, y_res, models)

In [None]:
# Test cost per model for the median-imputation strategy, lowest cost first.
report_median

### Strategy 3: MICE imputation (miceforest)

In [None]:
# Build a miceforest imputation kernel on a copy of the feature frame.
X_mice = X.copy()
kernel = mf.ImputationKernel(
  X_mice,
  save_all_iterations=True,  # NOTE(review): newer miceforest releases may name this save_all_iterations_data -- confirm against the installed version
  random_state=1999
)
# Run 3 MICE iterations.
kernel.mice(3)

In [None]:
# Replace X_mice with the completed data returned by the kernel.
X_mice = kernel.complete_data()

In [None]:
# Scaling only: the MICE kernel has already filled in the missing values.
mice_pipeline = Pipeline([('robustScaler', RobustScaler())])

In [None]:
# Scale the MICE-completed data. The input is re-derived from the kernel on every run
# instead of being read back from X_mice, so re-executing this cell never scales
# already-scaled data.
X_mice = mice_pipeline.fit_transform(kernel.complete_data())

In [None]:
# Rebalance the MICE-imputed data with SMOTETomek, resampling the minority class.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
    n_jobs=-1,
)
X_res, y_res = smt.fit_resample(X_mice, y)

In [None]:
# Fit and score every model on the MICE-imputed, resampled data; keeps the per-model test cost.
report_mice = evaluate_models(X_res, y_res, models)

In [None]:
# Test cost per model for the MICE-imputation strategy, lowest cost first.
report_mice

### Strategy 4: constant (zero) imputation

In [None]:
# Fill missing values with the constant 0, then apply robust scaling.
constant_pipeline = Pipeline([
    ('Imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('RobustScaler', RobustScaler()),
])

In [None]:
# Impute (constant 0) and scale the full feature matrix; fitted on all rows, before any train/test split.
X_const =constant_pipeline.fit_transform(X)

In [None]:
# Rebalance the zero-imputed data with SMOTETomek, resampling the minority class.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
    n_jobs=-1,
)
X_res, y_res = smt.fit_resample(X_const, y)

In [None]:
# Fit and score every model on the zero-imputed, resampled data; keeps the per-model test cost.
report_const = evaluate_models(X_res, y_res, models)

In [None]:
# Test cost per model for the constant-imputation strategy, lowest cost first.
report_const

### Strategy 5: mean imputation

In [None]:
# Mean imputation followed by robust scaling.
mean_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')),
    ('RobustScaler', RobustScaler()),
])

In [None]:
# Impute (mean) and scale the full feature matrix; fitted on all rows, before any train/test split.
X_mean = mean_pipeline.fit_transform(X)

In [None]:
# Rebalance the mean-imputed data with SMOTETomek, resampling the minority class.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
    n_jobs=-1,
)
X_res, y_res = smt.fit_resample(X_mean, y)

In [None]:
# Fit and score every model on the mean-imputed, resampled data; keeps the per-model test cost.
report_mean = evaluate_models(X_res, y_res, models)

In [None]:
# Test cost per model for the mean-imputation strategy, lowest cost first.
report_mean