In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF

from data_reader import read_data
from data_preprocessing import preprocess_data
from feature_extractor import extract_features

%matplotlib inline

In [3]:
data = read_data()
data = preprocess_data(data)

In [4]:
data.groupby('ResultClass').size()

ResultClass
0    5167766
1    2205343
2      73431
dtype: int64

In [5]:
data_head = data.head(50000)
features = extract_features(data_head)

In [23]:
features.fillna(0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(features.drop('ResultClass', axis=1).values,
                            features.ResultClass.values,
                            test_size=0.3,
                            random_state=123456)

In [25]:
rf = RF(n_estimators=100, max_features='auto', n_jobs=-1)
rf = rf.fit(X_train, y_train)
predicted = rf.predict(X_test)
predicted

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [27]:
len([x == y for x, y in zip(y_test, predicted)])

15180

In [28]:
result = pd.DataFrame(y_test, columns=['test'])
result['pred'] = predicted

In [29]:
result.head()

Unnamed: 0,test,pred
0,1.0,0.0
1,0.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0


In [30]:
result['is_same'] = result.apply(lambda row: int(row[0] == row[1]), axis=1)

In [32]:
result.head()

Unnamed: 0,test,pred,is_same
0,1.0,0.0,0
1,0.0,0.0,1
2,0.0,0.0,1
3,1.0,0.0,0
4,0.0,0.0,1


In [64]:
accuracy = sum(result.is_same) / result.shape[0]
accuracy

0.64361001317523059

In [34]:
result.groupby('test').agg({'is_same': ['sum', 'size', 'mean']})['is_same'].sort_values('mean', ascending=False)

Unnamed: 0_level_0,sum,size,mean
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,8142,10040,0.810956
1.0,1620,4930,0.3286
2.0,8,210,0.038095


In [38]:
table = pd.crosstab(predicted, y_test, colnames=['Actual Results'], rownames=['Predicted Results'])
table

Actual Results,0.0,1.0,2.0
Predicted Results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,8142,3286,157
1.0,1848,1620,45
2.0,50,24,8


In [62]:
recall = {}
precision = {}
for column in table.columns:
    recall[column] = table[column][column] / table.apply(lambda row: sum(row), axis=0)[column]
    precision[column] = table[column][column] / table.apply(lambda row: sum(row), axis=1)[column]
recall

{0.0: 0.81095617529880482, 1.0: 0.32860040567951321, 2.0: 0.038095238095238099}

In [63]:
precision

{0.0: 0.70280535174794989, 1.0: 0.46114432109308284, 2.0: 0.097560975609756101}

In [67]:
F = {}
for column in table.columns:
    F[column] = 2 * (recall[column] * precision[column]) / (recall[column] + precision[column])
F

{0.0: 0.75301734104046236, 1.0: 0.38374985194835959, 2.0: 0.054794520547945209}