In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
historical_defects = pd.read_csv('data/historical_defects.csv')
inspection_sessions = pd.read_csv('data/inspection_sessions.csv')
defects = pd.read_csv('data/defects.csv')
train_defects = pd.read_csv('data/train_defects.csv', delimiter=',')

In [3]:
historical_defects.columns, inspection_sessions.columns, defects.columns, train_defects.columns

(Index(['inspection_id', 'defect_id', 'defect_sequence_no',
        'repeat_defect_detected'],
       dtype='object'),
 Index(['inspection_id', 'machine_id', 'eval_set', 'inspection_number',
        'inspection_dow', 'inspection_hour_of_day',
        'days_since_last_inspection'],
       dtype='object'),
 Index(['defect_id', 'defect_subcategory_id', 'defect_category_id'], dtype='object'),
 Index(['inspection_id', 'defect_id', 'defect_sequence_no',
        'repeat_defect_detected'],
       dtype='object'))

In [4]:
# We are looking for potential indicators of repeat defect

# ~features describing the time of defect detected/not
# inspection_hour_of_day (accuracy may be influenced by time of day, fatigue, lighting)
# inspection_dow day of week

# ~features describing the condition of machine and the specific defect
# defect_sequence_no (the sequence/order of specific defect) {defect_id,sequence_no} -->category?
# machine_id
# days since last inspection (condition may have deteriorated much if inspected a long time ago)
# inspection number (how many times has it been inspected)

# ~features describing the defect itself
# not available in test set
# ?defect_id (doesn't the sub category and category combined imply the id?)
# defect_subcategory_id
# defect_category_id

In [5]:
train_data = train_defects.merge(defects, on="defect_id")
train_data = train_data.rename(columns = {'repeat_defect_detected':'label'})
train_data1 = train_data.merge(inspection_sessions, on='inspection_id')
# features = train_data1.drop(['label','eval_set','inspection_id','defect_id'], axis=1).columns

test_data = inspection_sessions[inspection_sessions['eval_set']=='test']
features = test_data.drop(['eval_set'], axis=1).columns

In [6]:
x_train = train_data1.loc[:, features].values
x_train = StandardScaler().fit_transform(x_train)
# x_train = train_data1[features].values
y_train = train_data1['label'].values

x_test = test_data.loc[:, features].values
x_test = StandardScaler().fit_transform(x_test)
# x_test = test_data[features].values

In [7]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
rf_model.fit(x_train, y_train)
pred_train = rf_model.predict(x_train)

print(np.sqrt(mean_squared_error(y_train,pred_train)))
print(r2_score(y_train, pred_train))

pred_test = rf_model.predict(x_test)


0.6081430859318905
-0.5392014897163713


In [8]:
test_data['pred_repeat'] = pred_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pred_repeat'] = pred_test


In [9]:
pre_machines = test_data[test_data['pred_repeat']==1][['inspection_id','machine_id']]

# Get likely defects per machine

In [10]:
unique_defect_machine = historical_defects[['inspection_id','defect_id']].drop_duplicates().merge(inspection_sessions[['inspection_id','machine_id']].drop_duplicates(), on="inspection_id")


In [11]:
def getlikelydefect(x):
    x_dict = {}
    total = len(x)
    for item in x:
        if item in x_dict.keys():
            continue
        x_dict[item] = x.count(item)/total
    return max(x_dict, key=x_dict.get)

getlikelydefect([1,2,2,3,4])


2

In [12]:
defects_in_machines = unique_defect_machine.groupby('machine_id')['defect_id'].apply(list).reset_index()
defects_in_machines['likelidefect'] = defects_in_machines['defect_id'].apply(lambda x: getlikelydefect(x))


In [13]:
pred_defects = pre_machines.merge(defects_in_machines, on='machine_id')

In [14]:
list_defects_df = pred_defects.groupby('inspection_id')['defect_id'].apply(list).reset_index()

# Formatting for output

In [15]:
def getdefectstr(x):
    string_x = ''
    for defect in x:
        string_x = string_x + str(defect) + ' '
    string_x = string_x.strip()
    return string_x
getdefectstr([1,2,3,4])

'1 2 3 4'

In [16]:
list_defects_df['defect_id_str'] = list_defects_df['defect_id'].apply(lambda x: getdefectstr(x[0]))

In [17]:
def getrowstr(x):
    str_x = "| " + str(x['inspection_id']) + " | " + str(x['defect_id_str']) + "\t|" 
    return str_x

In [18]:
list_defects_df['row_str'] = list_defects_df.apply(lambda x: getrowstr(x), axis=1)
list_defects_df = list_defects_df.rename(columns={"row_str": "| inspection_id    | defects |"})

In [19]:
list_defects_df.loc[-1] = ["-","- ","-","| ------- | ------- |"]
list_defects_df.index = list_defects_df.index + 1
list_defects_df.sort_index(inplace=True)

In [20]:
list_defects_df[["| inspection_id    | defects |"]].to_csv('rfoutput.txt', sep='|', index=None)