In [1]:
import pandas as pd
df = pd.read_csv('../data/train.csv')
df.shape

(115475, 13)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28248 entries, 0 to 28247
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   case_id                         28248 non-null  object
 1   equipment_id                    28248 non-null  object
 2   completion_date                 28248 non-null  object
 3   action_recommendation_id        28248 non-null  object
 4   action_recommendation_type      28248 non-null  object
 5   action_recommendation_category  28248 non-null  object
 6   equipment_area                  28248 non-null  object
 7   usage_type                      28248 non-null  object
 8   speed_category                  28248 non-null  int64 
 9   load_category                   28248 non-null  int64 
 10  floors_category                 28248 non-null  int64 
 11  equipment_category              28248 non-null  object
 12  feedback                        28248 non-null

In [149]:
df['completion_date']

0        2018-10-03
1        2018-10-03
2        2018-10-04
3        2018-10-04
4        2018-10-04
            ...    
28243    2019-08-22
28244    2019-01-02
28245    2019-07-29
28246    2019-11-28
28247    2019-05-30
Name: completion_date, Length: 28248, dtype: object

In [2]:
# Explore feature dofs
for colname in df.columns:
    print(colname, df[colname].unique().size)

case_id 73945
equipment_id 30000
completion_date 466
action_recommendation_id 295
action_recommendation_type 3
action_recommendation_category 6
equipment_area 1285
usage_type 13
speed_category 8
load_category 8
floors_category 8
equipment_category 20
feedback 2


In [3]:
# Explore label imbalance
negative, positive = df.groupby('feedback')['feedback'].count()
positive/(negative+positive)

0.8776878112145486

In [4]:
# Employ under sampling to balance out labels
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
df, df_balanced_labels = sampler.fit_resample(df, df['feedback'])
df.shape, df_balanced_labels.shape

((28248, 13), (28248,))

In [132]:
# Collect features, labels
x = df[[
    'action_recommendation_id',
    'action_recommendation_type',
    'action_recommendation_category',
    'equipment_area',
    'equipment_category',
    'floors_category',
    'load_category',
    'speed_category',
    'usage_type',
]]
y = df['feedback']
x.shape, y.shape

((28248, 9), (28248,))

In [133]:
# Employ label encoder to convert categories to numeric
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
label_encoder = defaultdict(LabelEncoder)
label_encoder

defaultdict(sklearn.preprocessing._label.LabelEncoder, {})

In [134]:
# Apply label encoder
x = x.apply(lambda feature: label_encoder[feature.name].fit_transform(feature))
x.shape

(28248, 9)

In [135]:
# Apply train, test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((25423, 9), (2825, 9), (25423,), (2825,))

In [163]:
# Compute baseline with logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, max_iter=250).fit(x_train, y_train)
lr.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6010619469026549

In [139]:
# Compute baseline with lightgbm
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(x_train, y_train)

LGBMClassifier()

In [140]:
clf.score(x_test, y_test)

0.744070796460177

In [141]:
from sklearn.metrics import confusion_matrix, classification_report

In [142]:
print(confusion_matrix(y_test, clf.predict(x_test)))

[[ 997  395]
 [ 328 1105]]


In [143]:
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      1392
           1       0.74      0.77      0.75      1433

    accuracy                           0.74      2825
   macro avg       0.74      0.74      0.74      2825
weighted avg       0.74      0.74      0.74      2825



In [144]:
from catboost import CatBoostClassifier
clf2 = CatBoostClassifier(n_estimators=200, depth=5)
clf2.fit(x_train, y_train, verbose=True)

Learning rate set to 0.179441
0:	learn: 0.6774661	total: 4.44ms	remaining: 884ms
1:	learn: 0.6663638	total: 7.75ms	remaining: 768ms
2:	learn: 0.6579575	total: 10.9ms	remaining: 718ms
3:	learn: 0.6527890	total: 13.6ms	remaining: 665ms
4:	learn: 0.6473546	total: 16.4ms	remaining: 640ms
5:	learn: 0.6440768	total: 19.2ms	remaining: 620ms
6:	learn: 0.6413050	total: 22.1ms	remaining: 610ms
7:	learn: 0.6392073	total: 25.1ms	remaining: 604ms
8:	learn: 0.6373419	total: 28ms	remaining: 595ms
9:	learn: 0.6343463	total: 31.9ms	remaining: 607ms
10:	learn: 0.6320788	total: 34.9ms	remaining: 600ms
11:	learn: 0.6305503	total: 38ms	remaining: 595ms
12:	learn: 0.6286103	total: 41.3ms	remaining: 594ms
13:	learn: 0.6271832	total: 44.3ms	remaining: 588ms
14:	learn: 0.6262656	total: 47.4ms	remaining: 585ms
15:	learn: 0.6249544	total: 50.8ms	remaining: 584ms
16:	learn: 0.6233045	total: 53.2ms	remaining: 572ms
17:	learn: 0.6223615	total: 55.6ms	remaining: 563ms
18:	learn: 0.6210990	total: 58ms	remaining: 552m

<catboost.core.CatBoostClassifier at 0x12a42da00>

In [145]:
clf2.score(x_test, y_test)

0.7447787610619468

In [160]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf3 = RandomForestClassifier(
    criterion='entropy',
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0
)
clf3.fit(x_train, y_train)

CPU times: user 8.99 s, sys: 132 ms, total: 9.12 s
Wall time: 9.13 s


RandomForestClassifier(criterion='entropy', n_estimators=500, random_state=0)

In [161]:
clf3.score(x_test, y_test)

0.7787610619469026

In [162]:
print(classification_report(y_test, clf3.predict(x_test)))

              precision    recall  f1-score   support

           0       0.78      0.77      0.77      1392
           1       0.78      0.79      0.78      1433

    accuracy                           0.78      2825
   macro avg       0.78      0.78      0.78      2825
weighted avg       0.78      0.78      0.78      2825



In [93]:
from sklearn.neural_network import MLPClassifier
clf4 = MLPClassifier()
clf4.fit(x_train, y_train)

MLPClassifier()

In [94]:
clf4.score(x_test, y_test)

0.6138053097345133

In [95]:
from sklearn.naive_bayes import GaussianNB
clf5 = GaussianNB()
clf5.fit(x_train, y_train)
clf5.score(x_test, y_test)

0.5961061946902655

In [96]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf6 = QuadraticDiscriminantAnalysis()
clf6.fit(x_train, y_train)
clf6.score(x_test, y_test)

0.6095575221238938

In [98]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.5844247787610619

In [106]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.6697345132743363