In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [5]:
# Load the raw CSV inputs from the local `data` directory.
# Forward slashes are used on purpose: the previous backslash paths relied on
# invalid escape sequences (`\h`, `\i`, `\d`) inside normal string literals and
# only resolved on Windows. Forward slashes work on every platform and match
# the path style used for train_defects.csv.
historical_defects = pd.read_csv('data/historical_defects.csv')
inspection_sessions = pd.read_csv('data/inspection_sessions.csv')
defects = pd.read_csv('data/defects.csv')

In [14]:
# Read the comma-separated training defects file into a DataFrame.
train_defects = pd.read_csv('data/train_defects.csv', sep=',')

In [16]:
# Display the training frame to inspect its columns and overall size.
train_defects

Unnamed: 0,inspection_id,defect_id,defect_sequence_no,repeat_defect_detected
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [22]:
# Attach each defect's subcategory/category ids to the training rows by
# joining on the shared `defect_id` key, then show the merged frame.
train_data = pd.merge(train_defects, defects, on="defect_id")
train_data

Unnamed: 0,inspection_id,defect_id,defect_sequence_no,repeat_defect_detected,defect_subcategory_id,defect_category_id
0,1,49302,1,1,120,16
1,816049,49302,7,1,120,16
2,1242203,49302,1,1,120,16
3,1383349,49302,11,1,120,16
4,1787378,49302,8,0,120,16
...,...,...,...,...,...,...
1384612,3420011,1528,12,0,97,13
1384613,3420084,47935,20,0,73,11
1384614,3420084,9491,21,0,25,11
1384615,3420088,16380,12,0,97,13


In [24]:
# Rename the target column to `label` and recode it from 1/0 to 'yes'/'no'.
train_data = train_data.rename(columns={'repeat_defect_detected': 'label'})
# Assign the recoded column back explicitly. Calling
# `train_data['label'].replace(..., inplace=True)` mutates a temporary Series
# (chained assignment), which warns in recent pandas and does not update the
# frame at all under Copy-on-Write.
train_data['label'] = train_data['label'].replace({1: 'yes', 0: 'no'})


In [32]:
# Feature columns are every column of train_data except the target `label`.
features = train_data.columns.drop('label')

In [41]:
# Standardise the feature columns, then report the mean and standard
# deviation taken over the whole scaled matrix as a quick sanity check.
scaler = StandardScaler()
x = scaler.fit_transform(train_data[features].values)
x.mean(), x.std()

(6.142429516576855e-17, 1.0)

In [42]:
# Wrap the scaled matrix back into a DataFrame carrying the feature names.
train_data_scaled = pd.DataFrame(data=x, columns=features)

In [55]:
# Project the scaled features onto the first two principal components and
# print the share of variance explained by each component.
# random_state is pinned so the projection is reproducible on scikit-learn
# versions where the automatically chosen SVD solver is a randomized one.
pca_data = PCA(n_components=2, random_state=42)
pca_features = pca_data.fit_transform(x)
print('explained variation per principal component {}'.format(pca_data.explained_variance_ratio_))

explained variation per principal component [0.21530988 0.2007834 ]


In [56]:
# Build the plotting frame: the two principal-component scores plus the
# label column taken from train_data (aligned on the row index).
pca_train_df = (
    pd.DataFrame(pca_features, columns=['pc1', 'pc2'])
    .assign(label=train_data['label'])
)

In [67]:
# Plot a seeded random sample of up to 1,000 rows instead of the first 1,000.
# The merged training frame is grouped by defect_id, so its leading rows all
# belong to a handful of defects and are not representative of the data set.
pca_sample = pca_train_df.sample(n=min(1000, len(pca_train_df)), random_state=42)
px.scatter(
    pca_sample,
    x='pc1',
    y='pc2',
    color='label',
    title='PCA projection of the defect features (random sample)',
)

In [59]:
# Display the PCA frame: the two component columns (pc1, pc2) and the label.
pca_train_df

Unnamed: 0,pc1,pc2,label
0,0.643112,0.724003,yes
1,0.926531,1.243473,yes
2,0.677908,0.835741,yes
3,1.116128,1.591879,yes
4,0.997166,1.405190,no
...,...,...,...
1384612,1.359172,-0.958055,no
1384613,0.299173,2.281634,no
1384614,0.254641,-0.011909,no
1384615,1.100028,-0.096169,no


In [25]:
# Display the merged training frame to check the renamed/recoded `label` column.
train_data

Unnamed: 0,inspection_id,defect_id,defect_sequence_no,label,defect_subcategory_id,defect_category_id
0,1,49302,1,yes,120,16
1,816049,49302,7,yes,120,16
2,1242203,49302,1,yes,120,16
3,1383349,49302,11,yes,120,16
4,1787378,49302,8,no,120,16
...,...,...,...,...,...,...
1384612,3420011,1528,12,no,97,13
1384613,3420084,47935,20,no,73,11
1384614,3420084,9491,21,no,25,11
1384615,3420088,16380,12,no,97,13


In [17]:
# Display the defect lookup table (defect_id with its subcategory and category ids).
defects

Unnamed: 0,defect_id,defect_subcategory_id,defect_category_id
0,1,61,19
1,2,104,13
2,3,94,7
3,4,38,1
4,5,5,13
...,...,...,...
49683,49684,124,5
49684,49685,42,1
49685,49686,112,3
49686,49687,41,8


In [21]:
# Collect, for every defect_id, the list of its historical
# repeat_defect_detected flags.
repeat_flags_by_defect = historical_defects.groupby('defect_id')['repeat_defect_detected']
repeat_flags_by_defect.apply(list)

defect_id
1        [0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...
2        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3        [1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
4        [0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, ...
5            [1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0]
                               ...                        
49684                          [0, 0, 1, 0, 0, 0, 0, 0, 0]
49685    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
49686    [1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, ...
49687              [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
49688    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
Name: repeat_defect_detected, Length: 49677, dtype: object

In [19]:
# List the column names available in the historical defects frame.
historical_defects.columns

Index(['inspection_id', 'defect_id', 'defect_sequence_no',
       'repeat_defect_detected'],
      dtype='object')