In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import mutual_info_classif

from sklearn.metrics import accuracy_score, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/finaldataframe.csv')
data.drop(index=0, inplace=True)
data.drop(inplace=True, columns = 'Unnamed: 0')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
data.head()

Unnamed: 0,Employee Code,Machine,Production,Order Operation No,Date,Time,Current,Humidity,Temperature,Flow,Job Temp,Voltage,Defect
1,382617,TWLD23,E15002965,240,2022-09-10 00:00:00,7:32:28:527,1.13,74,23,0.01,29.3,15.2,No Defect
2,382617,TWLD23,E15002965,240,2022-09-10 00:00:00,7:32:29:40,1.81999999999999,74,23,0.0,29.3,0.0,Tungsten Inclusion
3,382617,TWLD23,E15002965,240,2022-09-10 00:00:00,7:32:29:677,2.91,74,23,0.0,29.3,0.0,No Defect
4,382617,TWLD23,E15002965,240,2022-09-10 00:00:00,7:32:30:166,0.04,74,23,0.0,29.3,0.0,No Defect
5,382617,TWLD23,E15002965,240,2022-09-10 00:00:00,7:32:30:696,2.51,74,23,0.0,27.8,0.0,No Defect


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 827534 entries, 1 to 827534
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Employee Code       827534 non-null  object
 1   Machine             827534 non-null  object
 2   Production          549354 non-null  object
 3   Order Operation No  827534 non-null  object
 4   Date                827534 non-null  object
 5   Time                827534 non-null  object
 6   Current             827531 non-null  object
 7   Humidity            827534 non-null  object
 8   Temperature         827534 non-null  object
 9   Flow                827534 non-null  object
 10  Job Temp            827534 non-null  object
 11  Voltage             827534 non-null  object
 12  Defect              827534 non-null  object
dtypes: object(13)
memory usage: 88.4+ MB


In [None]:
data.isnull().sum()

Employee Code              0
Machine                    0
Production            278180
Order Operation No         0
Date                       0
Time                       0
Current                    3
Humidity                   0
Temperature                0
Flow                       0
Job Temp                   0
Voltage                    0
Defect                     0
dtype: int64

In [None]:
cols = ['Employee Code', 'Production', 'Order Operation No', 'Defect', 'Machine']

for col in cols:
  print(col, data[col].nunique())


Employee Code 14
Production 20
Order Operation No 12
Defect 52
Machine 1


## **Data Cleaning**

**Defect**

In [None]:
def conv(x):
  if 'Defect' in x:
    return 'No Defect'
  elif 'Inclusion' in x:
    return 'Tungsten Inclusion'
  elif 'Porosity' in x:
    return 'Porosity'

data['Defect'] = data['Defect'].apply(conv)

**Employee Code**

In [None]:
data['Employee Code'] = data['Employee Code'].apply(lambda x : str(x))
data['Employee Code'].unique()

array(['382617', '391342', '97346', '0', '394965', '387268', '394365',
       '383138', '394269'], dtype=object)

In [None]:
data[data['Defect'] == 'Porosity']['Employee Code'].value_counts()

391342    350
394965    269
97346     187
394269    139
0          76
383138     46
382617     39
394365     29
387268      8
Name: Employee Code, dtype: int64

In [None]:
data[data['Defect'] == 'Tungsten Inclusion']['Employee Code'].value_counts()

391342    1503
394965    1250
97346      674
394269     674
382617     326
394365     153
387268      16
383138      10
0            4
Name: Employee Code, dtype: int64

In [None]:
data[data['Defect'] == 'No Defect']['Employee Code'].value_counts()

391342    255031
394965    173466
97346     116565
394269    114008
0          66740
382617     36400
383138     27239
394365     23740
387268      8592
Name: Employee Code, dtype: int64

**Dropping 'Machine' column as it is single valued**

In [None]:
data.drop(columns=['Machine'], inplace=True)

**Production**

In [None]:
data['Production'].unique()

array(['E15002965', 'E15002966', nan, '-', '15003062', 'E15003220',
       'E15002881', 'E10002436', 'E50006711', 'E15003200', 'E15003202',
       'E15003056', 'E15002963', 'E10002437', 'E15003219', 'E15003201',
       'Test coopan', '-15003217', -15003217, -150033027, 15003062],
      dtype=object)

In [None]:
data[data['Production'] == '-']['Defect'].unique()

array(['No Defect'], dtype=object)

Production '-' contributes to 'No Defect' only


In [None]:
d1 = data[data['Production'] == '-'].index
data.drop(index = d1, inplace=True)

In [None]:
def econv(x):
  x = str(x)
  if 'E' == x[0]:
    return x
  elif x == 'Test coopan' or x == '0':
    return x 
  else:
    if x[0] == '-':
      s = 'E'+x[1:]
      return str(s)
    else:
      s = 'E'+x
      return str(s)

data["Production"] = data['Production'].apply(econv)

In [None]:
data['Production'].isnull().sum()

0

**Order Operation No**

In [None]:
def conv(x):
  if x == '-240':
    return str(240)
  elif x == '180-':
    return str(180)
  else:
    return str(x)
data['Order'] = data['Order Operation No'].apply(conv)

In [None]:
data['Order'].nunique()

6

In [None]:
data.drop(columns='Order Operation No', inplace=True)
data.columns
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 826794 entries, 1 to 827534
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Employee Code  826794 non-null  object
 1   Production     826794 non-null  object
 2   Date           826794 non-null  object
 3   Time           826794 non-null  object
 4   Current        826791 non-null  object
 5   Humidity       826794 non-null  object
 6   Temperature    826794 non-null  object
 7   Flow           826794 non-null  object
 8   Job Temp       826794 non-null  object
 9   Voltage        826794 non-null  object
 10  Defect         826794 non-null  object
 11  Order          826794 non-null  object
dtypes: object(12)
memory usage: 82.0+ MB


**change dtype**

In [None]:
cols = ['Current', 'Humidity', 'Temperature', 'Flow', 'Job Temp', 'Voltage']

for col in cols:
  data[col] = data[col].apply(lambda x : float(x))


In [None]:
data['Current'].fillna(data['Current'].mean(), inplace=True)

**Date**

In [None]:
def dconv(x):
  x = str(x).split(":")
  return x[0]

data['Time'] = data['Time'].apply(dconv)

In [None]:
def convd(x):
  return x[5:7]

data['Month'] = data['Date'].apply(convd)


In [None]:

def dconv(x):
  return x[8:10]

data["Day"] = data['Date'].apply(dconv)

In [None]:
def m(x):
  return x[5:7]
data['Month'] = data['Date'].apply(m)

In [None]:
data.head()

Unnamed: 0,Employee Code,Production,Date,Time,Current,Humidity,Temperature,Flow,Job Temp,Voltage,Defect,Order,Month,Day
1,382617,E15002965,2022-09-10 00:00:00,7,1.13,74.0,23.0,0.01,29.3,15.2,No Defect,240,9,10
2,382617,E15002965,2022-09-10 00:00:00,7,1.82,74.0,23.0,0.0,29.3,0.0,Tungsten Inclusion,240,9,10
3,382617,E15002965,2022-09-10 00:00:00,7,2.91,74.0,23.0,0.0,29.3,0.0,No Defect,240,9,10
4,382617,E15002965,2022-09-10 00:00:00,7,0.04,74.0,23.0,0.0,29.3,0.0,No Defect,240,9,10
5,382617,E15002965,2022-09-10 00:00:00,7,2.51,74.0,23.0,0.0,27.8,0.0,No Defect,240,9,10


In [None]:
l = ['Employee Code', 'Day','Month','Production', 'Time', 'Order', 'Current', 'Humidity','Temperature', 'Flow', 'Job Temp', 'Voltage', 'Defect']
data = data[l]

In [None]:
cols = ['Employee Code', 'Day', 'Month', 'Production', 'Order']

for col in cols:
  print(col)
  print(data[col].unique())
  print(data[col].nunique())

Employee Code
['382617' '391342' '97346' '0' '394965' '387268' '394365' '383138'
 '394269']
9
Day
['10' '12' '13' '14' '15' '16' '17' '19' '30' '01' '02' '03' '05' '06'
 '07' '08' '09' '22' '23' '24' '25' '26' '27' '29']
24
Month
['09' '08']
2
Production
['E15002965' 'E15002966' 'Enan' 'E15003062' 'E15003220' 'E15002881'
 'E10002436' 'E50006711' 'E15003200' 'E15003202' 'E15003056' 'E15002963'
 'E10002437' 'E15003219' 'E15003201' 'Test coopan' 'E15003217'
 'E150033027']
18
Order
['240' '130' '180' '40' '30' '1']
6


In [None]:

data.to_csv('finalcleaned.csv')

**Model**

In [None]:
df = data.copy()
df.head()

Unnamed: 0,Employee Code,Day,Month,Production,Time,Order,Current,Humidity,Temperature,Flow,Job Temp,Voltage,Defect
1,382617,10,9,E15002965,7,240,1.13,74.0,23.0,0.01,29.3,15.2,No Defect
2,382617,10,9,E15002965,7,240,1.82,74.0,23.0,0.0,29.3,0.0,Tungsten Inclusion
3,382617,10,9,E15002965,7,240,2.91,74.0,23.0,0.0,29.3,0.0,No Defect
4,382617,10,9,E15002965,7,240,0.04,74.0,23.0,0.0,29.3,0.0,No Defect
5,382617,10,9,E15002965,7,240,2.51,74.0,23.0,0.0,27.8,0.0,No Defect


In [None]:
def time(x):
  if int(x) < 7 or int(x) ==23:
    return 0
  else:
    return 1

df['Time'] = df['Time'].apply(time)

In [None]:
cat_col = ['Employee Code', 'Day', 'Month', 'Production', 'Order']

for col in cat_col:
  df[col] = df[col].astype('category').cat.codes



In [None]:
LE=LabelEncoder()
LE.fit_transform(df['Defect'])

array([0, 2, 0, ..., 0, 0, 0])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 826794 entries, 1 to 827534
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Employee Code  826794 non-null  int8   
 1   Day            826794 non-null  int8   
 2   Month          826794 non-null  int8   
 3   Production     826794 non-null  int8   
 4   Time           826794 non-null  int64  
 5   Order          826794 non-null  int8   
 6   Current        826794 non-null  float64
 7   Humidity       826794 non-null  float64
 8   Temperature    826794 non-null  float64
 9   Flow           826794 non-null  float64
 10  Job Temp       826794 non-null  float64
 11  Voltage        826794 non-null  float64
 12  Defect         826794 non-null  object 
dtypes: float64(6), int64(1), int8(5), object(1)
memory usage: 60.7+ MB


In [None]:
scaler = StandardScaler()
df[['Current', 'Humidity', 'Temperature', 'Flow', 'Job Temp', 'Voltage']] = scaler.fit_transform(df[['Current', 'Humidity', 'Temperature', 'Flow', 'Job Temp', 'Voltage']])


In [None]:
df['Defect'] = df['Defect'].astype('category').cat.codes

In [None]:
def get_features(threshold,X,Y):
  high_score_features = []
  feature_scores = mutual_info_classif(X, Y, random_state=0)
  for score, f_name in sorted(zip(feature_scores, df.columns), reverse=True)[:threshold]:
          print(f_name, score)
          high_score_features.append(f_name)
  df_norm_mic = df[high_score_features]
  return df_norm_mic.columns

In [None]:
X = df.drop(columns='Defect', axis=1)
Y = df['Defect']

**7 Features**

In [None]:
features = get_features(7, X, Y)

Time 0.1657713820486576
Month 0.10447297148833401
Order 0.08282620346482006
Voltage 0.07729444434297972
Production 0.038352677883428665
Employee Code 0.03810009811269088
Flow 0.027001835760245174


In [None]:
sel_features = list(features)

sel_X = X[sel_features]

In [None]:
sampler = RandomOverSampler(random_state=42)
x_ros, y_ros = sampler.fit_resample(sel_X, Y)


In [None]:
ros_df = pd.concat([x_ros,y_ros], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X = ros_df.drop(columns='Defect', axis=1)
Y = ros_df['Defect']
Xt,x,Yt,y = train_test_split(X,Y)

In [None]:
model = RandomForestClassifier()
model.fit(Xt,Yt)

RandomForestClassifier()

In [None]:
preds = model.predict(x)
print(classification_report(y,preds))

              precision    recall  f1-score   support

           0       0.89      0.72      0.79    205304
           1       0.77      0.93      0.84    205036
           2       0.99      0.96      0.98    205441

    accuracy                           0.87    615781
   macro avg       0.88      0.87      0.87    615781
weighted avg       0.88      0.87      0.87    615781



**8 features**

In [None]:
X = df.drop(columns='Defect', axis=1)
Y = df['Defect']

In [None]:
features = get_features(8, X, Y)

Time 0.1657713820486576
Month 0.10447297148833401
Order 0.08282620346482006
Voltage 0.07729444434297972
Production 0.038352677883428665
Employee Code 0.03810009811269088
Flow 0.027001835760245174
Day 0.010044938870972464


In [None]:
sel_features = list(features)
sampler = RandomOverSampler(random_state=42)
x_ros, y_ros = sampler.fit_resample(X[sel_features], Y)
ros_df = pd.concat([x_ros,y_ros], axis=1)

In [None]:
X = ros_df.drop(columns='Defect', axis=1)
Y = ros_df['Defect']
Xt,x,Yt,y = train_test_split(X,Y)

In [None]:
model = RandomForestClassifier()
model.fit(Xt,Yt)

RandomForestClassifier()

In [None]:
preds = model.predict(x)
print(classification_report(y,preds))

              precision    recall  f1-score   support

           0       0.91      0.73      0.81    205489
           1       0.78      0.94      0.85    205065
           2       0.99      0.97      0.98    205227

    accuracy                           0.88    615781
   macro avg       0.89      0.88      0.88    615781
weighted avg       0.89      0.88      0.88    615781



In [None]:
accuracy_score(y, preds)

0.8792720139140376

**9 Features**

In [None]:
X = df.drop(columns='Defect', axis=1)
Y = df['Defect']

features = get_features(9, X, Y)

Time 0.1657713820486576
Month 0.10447297148833401
Order 0.08282620346482006
Voltage 0.07729444434297972
Production 0.038352677883428665
Employee Code 0.03810009811269088
Flow 0.027001835760245174
Day 0.010044938870972464
Temperature 0.008722694403340281


In [None]:

sel_features = list(features)
sampler = RandomOverSampler(random_state=42)
x_ros, y_ros = sampler.fit_resample(X[sel_features], Y)
ros_df = pd.concat([x_ros,y_ros], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X = ros_df.drop(columns='Defect', axis=1)
Y = ros_df['Defect']
Xt,x,Yt,y = train_test_split(X,Y)

In [None]:
model = RandomForestClassifier()
model.fit(Xt,Yt)

RandomForestClassifier()

In [None]:
preds = model.predict(x)
print(classification_report(y,preds))

              precision    recall  f1-score   support

           0       0.90      0.80      0.85    205597
           1       0.83      0.93      0.88    204682
           2       0.99      0.97      0.98    205502

    accuracy                           0.90    615781
   macro avg       0.91      0.90      0.90    615781
weighted avg       0.91      0.90      0.90    615781



In [None]:
accuracy_score(y,preds)

0.901817366888553

**12 Features**

In [None]:
X = df.drop(columns='Defect', axis=1)
Y = df['Defect']

features = get_features(12, X, Y)

Time 0.1657713820486576
Month 0.10447297148833401
Order 0.08282620346482006
Voltage 0.07729444434297972
Production 0.038352677883428665
Employee Code 0.03810009811269088
Flow 0.027001835760245174
Day 0.010044938870972464
Temperature 0.008722694403340281
Current 0.0044096511055148335
Humidity 0.0034654714536641595
Job Temp 0.0031161617940070885


In [None]:

sel_features = list(features)
sampler = RandomOverSampler(random_state=42)
x_ros, y_ros = sampler.fit_resample(X[sel_features], Y)
ros_df = pd.concat([x_ros,y_ros], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X = ros_df.drop(columns='Defect', axis=1)
Y = ros_df['Defect']
Xt,x,Yt,y = train_test_split(X,Y)

In [None]:
model = RandomForestClassifier()
model.fit(Xt,Yt)

RandomForestClassifier()

In [None]:
preds = model.predict(x)
print(classification_report(y,preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    204900
           1       1.00      1.00      1.00    205349
           2       1.00      1.00      1.00    205532

    accuracy                           1.00    615781
   macro avg       1.00      1.00      1.00    615781
weighted avg       1.00      1.00      1.00    615781



In [None]:
accuracy_score(y,preds)

0.9993877693530655

In [None]:
from joblib import Parallel, delayed
import joblib

joblib.dump(model, 'model.pkl')


['model.pkl']

In [None]:
rfc = joblib.load('model.pkl')
rfc.predict(x)

array([2, 2, 2, ..., 1, 2, 2], dtype=int8)