In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the four CSV inputs from the local data/ directory, in the same order
# as before. The first three use pandas' default parsing options.
historical_defects, inspection_sessions, defects = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/historical_defects.csv',
        'data/inspection_sessions.csv',
        'data/defects.csv',
    )
)
# The training rows are read with an explicit comma delimiter.
train_defects = pd.read_csv('data/train_defects.csv', delimiter=',')

In [3]:
# Show the column index of every loaded frame side by side (tuple of Index objects).
tuple(
    frame.columns
    for frame in (historical_defects, inspection_sessions, defects, train_defects)
)

(Index(['inspection_id', 'defect_id', 'defect_sequence_no',
        'repeat_defect_detected'],
       dtype='object'),
 Index(['inspection_id', 'machine_id', 'eval_set', 'inspection_number',
        'inspection_dow', 'inspection_hour_of_day',
        'days_since_last_inspection'],
       dtype='object'),
 Index(['defect_id', 'defect_subcategory_id', 'defect_category_id'], dtype='object'),
 Index(['inspection_id', 'defect_id', 'defect_sequence_no',
        'repeat_defect_detected'],
       dtype='object'))

In [4]:
# We are looking for potential indicators of a repeat defect.

# ~ Features describing when the inspection took place
# inspection_hour_of_day (accuracy may be influenced by time of day, fatigue, lighting)
# inspection_dow (day of week)

# ~ Features describing the condition of the machine and the specific defect
# defect_sequence_no (the sequence/order of the specific defect); treat {defect_id, sequence_no} as a category?
# machine_id
# days_since_last_inspection (condition may have deteriorated considerably if the last inspection was long ago)
# inspection_number (how many times it has been inspected)

# ~ Features describing the defect itself
# ?defect_id (don't the subcategory and category combined imply the id?)
# defect_subcategory_id
# defect_category_id

In [5]:
# Training rows joined with the defect lookup table; the target column
# 'repeat_defect_detected' is exposed under the shorter name 'label'.
train_data = (
    train_defects
    .merge(defects, on="defect_id")
    .rename(columns={'repeat_defect_detected': 'label'})
)

# Add the per-inspection session columns.
train_data1 = train_data.merge(inspection_sessions, on='inspection_id')

# Feature columns = every merged column except the target and the split marker.
features = train_data1.columns.drop(['label', 'eval_set'])
# Alternative that also leaves out the two identifier columns:
# features = train_data1.drop(['label','eval_set','inspection_id','defect_id'], axis=1).columns
# NOTE(review): the active list keeps inspection_id and defect_id as model inputs — confirm this is intended.


In [6]:
# Sessions flagged as the 'test' split, joined with their historical defect rows.
test_data = (
    inspection_sessions
    .loc[inspection_sessions['eval_set'] == 'test']
    .merge(historical_defects, on='inspection_id')
)

# Add the defect lookup columns, mirroring how the training frame is built.
test_data1 = test_data.merge(defects, on='defect_id')

# Display the candidate test columns next to the training feature list so the
# two sets of names can be compared by eye (column order differs).
test_data1.columns.drop(['eval_set', 'repeat_defect_detected']), features

(Index(['inspection_id', 'machine_id', 'inspection_number', 'inspection_dow',
        'inspection_hour_of_day', 'days_since_last_inspection',
        'defect_sequence_no', 'defect_id', 'defect_subcategory_id',
        'defect_category_id'],
       dtype='object'),
 Index(['inspection_id', 'defect_id', 'defect_sequence_no',
        'defect_subcategory_id', 'defect_category_id', 'machine_id',
        'inspection_number', 'inspection_dow', 'inspection_hour_of_day',
        'days_since_last_inspection'],
       dtype='object'))

In [7]:
# Non-null value counts for each (inspection_id, defect_id) pair.
# We can consider dropping these two columns from the features.
# NOTE(review): "these two columns" presumably means the grouping keys inspection_id and defect_id — confirm.
# (inspection_number: # of inspections on the machine)
train_data1.groupby(by=['inspection_id', 'defect_id']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,defect_sequence_no,label,defect_subcategory_id,defect_category_id,machine_id,eval_set,inspection_number,inspection_dow,inspection_hour_of_day,days_since_last_inspection
inspection_id,defect_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,10246,1,1,1,1,1,1,1,1,1,1
1,11109,1,1,1,1,1,1,1,1,1,1
1,13176,1,1,1,1,1,1,1,1,1,1
1,22035,1,1,1,1,1,1,1,1,1,1
1,43633,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
3421063,35548,1,1,1,1,1,1,1,1,1,1
3421063,49235,1,1,1,1,1,1,1,1,1,1
3421070,4724,1,1,1,1,1,1,1,1,1,1
3421070,16953,1,1,1,1,1,1,1,1,1,1


In [8]:
# Training design matrix (columns in `features` order) and target vector,
# both extracted with `.values`.
x_train, y_train = train_data1[features].values, train_data1['label'].values

In [9]:
# Test design matrix, selected with the same `features` list so the column
# order matches x_train, plus the corresponding target vector.
x_test, y_test = (
    test_data1[features].values,
    test_data1['repeat_defect_detected'].values,
)

In [None]:
# Fit a random forest on the training features, then report RMSE and R^2
# for the training split and for the test split.
# NOTE(review): the target looks like a binary flag, so a classifier (with
# classification metrics) may be a better fit than a regressor — confirm.
# oob_score=True additionally makes the out-of-bag R^2 available afterwards
# as `rf_model.oob_score_`.
rf_model = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)
rf_model.fit(x_train, y_train)

# In-sample metrics: the model has seen these rows, so they are an
# optimistic bound rather than an estimate of real performance.
pred_train = rf_model.predict(x_train)
print(np.sqrt(mean_squared_error(y_train, pred_train)))
print(r2_score(y_train, pred_train))

# Test-split metrics. Both scorers take (y_true, y_pred): the original
# called mean_squared_error with only y_test, and passed the feature
# matrix x_test to r2_score where the true labels belong.
pred_test = rf_model.predict(x_test)
print(np.sqrt(mean_squared_error(y_test, pred_test)))
print(r2_score(y_test, pred_test))

