In [2]:
import pandas as pd 

In [3]:
df = pd.DataFrame(pd.read_csv("../data/metro.csv"))
df.head()

Unnamed: 0,id,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses
0,6165210,2020-04-24 19:37:47,-0.012,9.148,9.136,-0.022,9.148,57.9,0.0425,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1,12700550,2020-07-28 11:06:17,-0.016,9.852,9.84,-0.016,9.85,73.375,3.7,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
2,3067700,2020-03-13 11:29:05,-0.016,8.398,8.386,-0.022,8.398,56.425,0.0425,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
3,4048250,2020-03-26 06:23:04,-0.016,9.808,9.798,-0.024,9.804,68.35,3.8475,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,7413540,2020-05-15 04:54:54,-0.014,9.382,9.37,-0.024,9.38,57.775,0.045,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0


droping id and timestamp

In [3]:
drop_col=['id','timestamp']
df.drop(columns=drop_col, axis = 1, inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216945 entries, 0 to 216944
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               216945 non-null  int64  
 1   timestamp        216945 non-null  object 
 2   TP2              216945 non-null  float64
 3   TP3              216945 non-null  float64
 4   H1               216945 non-null  float64
 5   DV_pressure      216945 non-null  float64
 6   Reservoirs       216945 non-null  float64
 7   Oil_temperature  216945 non-null  float64
 8   Motor_current    216945 non-null  float64
 9   COMP             216945 non-null  float64
 10  DV_eletric       216945 non-null  float64
 11  Towers           216945 non-null  float64
 12  MPG              216945 non-null  float64
 13  LPS              216945 non-null  float64
 14  Pressure_switch  216945 non-null  float64
 15  Oil_level        216945 non-null  float64
 16  Caudal_impulses  216945 non-null  floa

Segregate the independent and dependent features

In [5]:
X = df.drop(labels='Caudal_impulses', axis = 1)
Y = df['Caudal_impulses']
print(X) 
print(Y)

          TP2     TP3      H1  DV_pressure  Reservoirs  Oil_temperature  \
0      -0.012   9.148   9.136       -0.022       9.148           57.900   
1      -0.016   9.852   9.840       -0.016       9.850           73.375   
2      -0.016   8.398   8.386       -0.022       8.398           56.425   
3      -0.016   9.808   9.798       -0.024       9.804           68.350   
4      -0.014   9.382   9.370       -0.024       9.380           57.775   
...       ...     ...     ...          ...         ...              ...   
216940  9.316   9.476  -0.012        0.232       9.472           74.175   
216941 -0.010   9.842   9.832       -0.016       9.842           72.400   
216942 -0.022  10.026  10.020       -0.018      10.028           60.650   
216943 -0.010   9.624   9.610       -0.022       9.622           69.150   
216944  6.836   9.084  -0.014        1.186       9.084           71.150   

        Motor_current  COMP  DV_eletric  Towers  MPG  LPS  Pressure_switch  \
0              0.0425

# Segregating numerical and categorical columns

In [6]:
df.columns

Index(['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs', 'Oil_temperature',
       'Motor_current', 'COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS',
       'Pressure_switch', 'Oil_level', 'Caudal_impulses'],
      dtype='object')

In [1]:
df.info()

NameError: name 'df' is not defined

In [7]:
num_col =['TP2', 'TP3', 'H1','DV_pressure', 'Reservoirs','Oil_temperature','Motor_current']
cat_col =['COMP', 'DV_eletric', 'Towers', 'MPG','LPS','Pressure_switch', 'Oil_level',]

We need simple imputer to fill the missing values and standard scaler to do feature scaling of numerical values

In [8]:
from sklearn.impute import SimpleImputer # Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling 
from sklearn.preprocessing import OneHotEncoder # to encode categorical features which are not in rank
# Pipeline
from sklearn.pipeline import Pipeline #To add everything together 
from sklearn.compose import ColumnTransformer # Combine everything together

Creating Numerical Pipeline

In [9]:
num_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy= 'median')), 
        ('scaler', StandardScaler())
    ]
)

cat_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')), 
        ('encoder', OneHotEncoder(handle_unknown='ignore'))  # to encode categorical features which are not in rank
        
    ]

)

Combine both the pipelines

In [10]:
preprocessor = ColumnTransformer([
    ('num_pipe', num_pipe, num_col), 
    ('cat_pipe', cat_pipe, cat_col)
])

Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.25, random_state= 42)

In [12]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out()) 
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [17]:
X_train.head()

Unnamed: 0,num_pipe__TP2,num_pipe__TP3,num_pipe__H1,num_pipe__DV_pressure,num_pipe__Reservoirs,num_pipe__Oil_temperature,num_pipe__Motor_current,cat_pipe__COMP_0.0,cat_pipe__COMP_1.0,cat_pipe__DV_eletric_0.0,...,cat_pipe__Towers_0.0,cat_pipe__Towers_1.0,cat_pipe__MPG_0.0,cat_pipe__MPG_1.0,cat_pipe__LPS_0.0,cat_pipe__LPS_1.0,cat_pipe__Pressure_switch_0.0,cat_pipe__Pressure_switch_1.0,cat_pipe__Oil_level_0.0,cat_pipe__Oil_level_1.0
0,-0.424246,-0.85283,0.257628,-0.192918,-0.851721,0.150567,-0.872213,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
1,2.160667,-0.749542,-2.275938,-0.182424,-0.75144,0.384577,1.531249,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.424246,0.477386,0.512125,-0.198165,0.480132,-1.426124,-0.873299,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-0.424246,0.089276,0.437697,-0.198165,0.088411,0.154403,-0.873299,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
4,-0.424246,0.186303,0.455704,-0.198165,0.182424,-0.179349,-0.871127,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [18]:
X_test.head()

Unnamed: 0,num_pipe__TP2,num_pipe__TP3,num_pipe__H1,num_pipe__DV_pressure,num_pipe__Reservoirs,num_pipe__Oil_temperature,num_pipe__Motor_current,cat_pipe__COMP_0.0,cat_pipe__COMP_1.0,cat_pipe__DV_eletric_0.0,...,cat_pipe__Towers_0.0,cat_pipe__Towers_1.0,cat_pipe__MPG_0.0,cat_pipe__MPG_1.0,cat_pipe__LPS_0.0,cat_pipe__LPS_1.0,cat_pipe__Pressure_switch_0.0,cat_pipe__Pressure_switch_1.0,cat_pipe__Oil_level_0.0,cat_pipe__Oil_level_1.0
0,-0.424246,0.255162,0.470109,-0.203413,0.254501,-0.72793,-0.872213,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
1,-0.425476,1.356893,0.68139,-0.187671,1.354455,1.646697,0.7167,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.425476,-0.918558,0.245024,-0.203413,-0.920664,-0.954268,-0.872213,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-0.425476,1.288035,0.668785,-0.20866,1.282378,0.875614,0.780778,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
4,-0.424861,0.621362,0.540336,-0.20866,0.618018,-0.747111,-0.871127,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [33]:
import numpy as np
def model_evaluation(true, predicted):
    
   

    accuracy = accuracy_score(y_test, y_pred) 

    precision = precision_score(y_test, y_pred) 

    recall = recall_score(y_test, y_pred) 
 
    f1 = f1_score(y_test, y_pred) 
 


    return  accuracy, precision, recall, f1

Training multiple models 


In [34]:
models={
    'naive_bayes':GaussianNB(),
    'KNN':KNeighborsClassifier(n_neighbors=5),
    # 'svm':SVC(),
    'LogisticRegression':LogisticRegression(),
}
model_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy, precision, recall, f1 = model_evaluation(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    

    print('Model Training Performance')
    # print("Rconfusion matrix:",cm)
    print("accuracy:",accuracy)
    print("pricison",precision)
    print("recall",recall)
    print("f1 score :",f1)

   
    
    print('*'*35)
    print('\n')

naive_bayes
Model Training Performance
accuracy: 0.09819864668031049
pricison 0.9979736575481256
recall 0.03872083652731097
f1 score : 0.074549204366994
***********************************


KNN
Model Training Performance
accuracy: 0.9472684698637461
pricison 0.9511264773858961
recall 0.9949092910352418
f1 score : 0.9725253612050415
***********************************


LogisticRegression
Model Training Performance
accuracy: 0.9481165993694342
pricison 0.948039599530175
recall 0.9994693083318592
f1 score : 0.9730753774613927
***********************************




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.17      0.29      3360
         1.0       0.95      1.00      0.97     50877

    accuracy                           0.95     54237
   macro avg       0.95      0.59      0.63     54237
weighted avg       0.95      0.95      0.93     54237



In [25]:
from sklearn.svm import SVC
svm_model = SVC()
svm_model = svm_model.fit(X_train,y_train)

In [27]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.17      0.29      3360
         1.0       0.95      1.00      0.97     50877

    accuracy                           0.95     54237
   macro avg       0.97      0.58      0.63     54237
weighted avg       0.95      0.95      0.93     54237

