In [26]:
import numpy as np
import pandas as pd

In [27]:
# Load the cleaned Enron splits from ./data/output and separate the 'poi'
# label column from the training features.
labelled_train = pd.read_csv('./data/output/Enron_train_data_clean.csv')
test_features = pd.read_csv('./data/output/Enron_test_features_clean.csv')

train_data_y = labelled_train['poi']                # target column
train_data = labelled_train.drop(columns=['poi'])   # feature columns only

# Quick sanity check on the frame dimensions.
print('train_data shape: ', train_data.shape)
print('test_features: ', test_features.shape)

train_data shape:  (113, 19)
test_features:  (33, 19)


# Accuracy

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
# No category is given, so this silences every warning from here on.
warnings.filterwarnings("ignore")


# Hold out 20% of the labelled rows as a validation split (fixed seed).
x_train,x_valid,y_train,y_valid = train_test_split(
    train_data,
    train_data_y,
    test_size=0.2,
    random_state=42,
)


# Candidate classifiers; all hyper-parameters are library defaults except
# the ones spelled out here.
lg = LogisticRegression()
svm = SVC(probability= True)   # probability=True enables predict_proba
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state = 42)
rf = RandomForestClassifier(random_state = 100)
ada = AdaBoostClassifier()
per =  Perceptron()


def _fit_and_report(label, model):
    """Fit `model` on the training split and print its validation accuracy.

    Returns the predictions for `x_valid` so that later cells can reuse them.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)
    print(f'valid score of {label}:{accuracy_score(y_valid, predictions)}')
    return predictions


# The y_pred_* names are read again by the confusion-matrix, precision and
# recall cells further down.
y_pred_lg = _fit_and_report('logistic', lg)
y_pred_svm = _fit_and_report('SVM', svm)
y_pred_nb = _fit_and_report('naive bayes', nb)
y_pred_dt = _fit_and_report('DecisionTree', dt)
y_pred_rf = _fit_and_report('RandomForest', rf)
y_pred_ada = _fit_and_report('AdaBoost', ada)
y_pred_per = _fit_and_report('Perceptron', per)

valid score of logistic:0.782608695652174
valid score of SVM:0.782608695652174
valid score of naive bayes:0.7391304347826086
valid score of DecisionTree:0.8260869565217391
valid score of RandomForest:0.8260869565217391
valid score of AdaBoost:0.8260869565217391
valid score of Perceptron:0.782608695652174


# Recall & Precision

In [33]:
from sklearn.metrics import confusion_matrix

# Toy example showing the layout of scikit-learn's confusion matrix:
# rows are the true classes, columns are the predicted classes.
confusion_matrix(
    y_true=[0, 1, 0, 1, 1, 1],
    y_pred=[1, 1, 1, 0, 1, 1],
)

array([[0, 2],
       [1, 3]], dtype=int64)

### confusion matrix in sklearn 
| TN | FP |
|----|----|
| FN | TP |

In [30]:
def _show_confusion(label, predictions):
    """Print the validation confusion matrix for one model, then a rule line."""
    matrix = confusion_matrix(y_valid, predictions)
    print(f'confusion matrix of {label}:\n{matrix}\n'+'-'*100)


# Class balance of the validation labels: the number of rows per 'poi' value.
print(f'Label:\n{y_valid.groupby(by = lambda x: y_valid[x]).count()}\n'+'-'*100)

_show_confusion('logistic', y_pred_lg)
_show_confusion('SVM', y_pred_svm)
_show_confusion('naive bayes', y_pred_nb)
_show_confusion('DecisionTree', y_pred_dt)
_show_confusion('RandomForest', y_pred_rf)
_show_confusion('AdaBoost', y_pred_ada)
_show_confusion('Perceptron', y_pred_per)

Label:
False    18
True      5
Name: poi, dtype: int64
----------------------------------------------------------------------------------------------------
confusion matrix of logistic:
[[17  1]
 [ 4  1]]
----------------------------------------------------------------------------------------------------
confusion matrix of SVM:
[[18  0]
 [ 5  0]]
----------------------------------------------------------------------------------------------------
confusion matrix of naive bayes:
[[17  1]
 [ 5  0]]
----------------------------------------------------------------------------------------------------
confusion matrix of DecisionTree:
[[17  1]
 [ 3  2]]
----------------------------------------------------------------------------------------------------
confusion matrix of RandomForest:
[[18  0]
 [ 4  1]]
----------------------------------------------------------------------------------------------------
confusion matrix of AdaBoost:
[[16  2]
 [ 2  3]]
---------------------------------------

In [31]:
from sklearn.metrics import precision_score


def _print_precision(label, predictions, **score_options):
    """Print the validation precision of one model.

    `score_options` is forwarded unchanged to `precision_score`.
    """
    value = precision_score(y_valid, predictions, **score_options)
    print(f'valid precision score of {label}:{value}')


# zero_division=1 (SVM and Perceptron only): the validation data contains
# positives but the model predicted none, so precision is 0/0. It is defined
# as 1 here on the grounds that no false positive occurred.
# NOTE(review): this makes a model that never predicts the positive class
# report a precision of 1.0; read it together with the recall score.
_print_precision('logistic', y_pred_lg)
_print_precision('SVM', y_pred_svm, zero_division=1)
_print_precision('naive bayes', y_pred_nb)
_print_precision('DecisionTree', y_pred_dt)
_print_precision('RandomForest', y_pred_rf)
_print_precision('AdaBoost', y_pred_ada)
_print_precision('Perceptron', y_pred_per, zero_division=1)

valid precision score of logistic:0.5
valid precision score of SVM:1.0
valid precision score of naive bayes:0.0
valid precision score of DecisionTree:0.6666666666666666
valid precision score of RandomForest:1.0
valid precision score of AdaBoost:0.6
valid precision score of Perceptron:1.0


In [32]:
from sklearn.metrics import recall_score


def _report_recall(predictions_by_label):
    """Print the validation recall for each (label -> predictions) entry, in order."""
    for label, predictions in predictions_by_label.items():
        print(f'valid recall score of {label}:{recall_score(y_valid, predictions)}')


# Recall = share of the true positives in y_valid that each model found.
_report_recall({
    'logistic': y_pred_lg,
    'SVM': y_pred_svm,
    'naive bayes': y_pred_nb,
    'DecisionTree': y_pred_dt,
    'RandomForest': y_pred_rf,
    'AdaBoost': y_pred_ada,
    'Perceptron': y_pred_per,
})

valid recall score of logistic:0.2
valid recall score of SVM:0.0
valid recall score of naive bayes:0.0
valid recall score of DecisionTree:0.4
valid recall score of RandomForest:0.2
valid recall score of AdaBoost:0.6
valid recall score of Perceptron:0.0


# 結論
1. naive bayes 容易濫捕無辜（precision 最低，為 0.0）。注意：SVM 與 Perceptron 沒有預測出任何 positive，其 precision 1.0 來自 `zero_division=1` 的設定，並不代表預測精準。
2. SVM、naive bayes、Perceptron 容易放縱壞人（recall 最低，皆為 0.0）。

# 補充 AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Validation ROC AUC for every classifier that can output class probabilities.
#
# The models are scored as they were fitted in the accuracy cell instead of
# being fitted a second time: `svm` (probability=True) and `ada` are created
# without a random_state, so a refit is not guaranteed to reproduce the model
# whose accuracy / precision / recall are reported above.
#
# Perceptron is not scored here: scikit-learn's Perceptron does not provide
# predict_proba.
for auc_label, auc_model in (
    ('logistic', lg),
    ('SVM', svm),
    ('naive bayes', nb),
    ('DecisionTree', dt),
    ('RandomForest', rf),
    ('AdaBoost', ada),
):
    # Column 1 of predict_proba is the probability of the second class in
    # `classes_`, i.e. the positive ('poi' == True) class.
    y_prob = auc_model.predict_proba(x_valid)
    print(f'auc score of {auc_label}:{roc_auc_score(y_valid,y_prob[:,1])}')

# Preprocess

In [None]:
from pathlib import Path

from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Build the cleaned CSVs from the raw Enron files.
# NOTE: the two files written at the end of this cell are the ones read by the
# data-loading cell at the top of the notebook, so on a fresh checkout this
# cell has to be run before that one.

train_data = pd.read_csv('./data/Enron_Fraud/Enron_train_data.csv')
test_features = pd.read_csv('./data/Enron_Fraud/Enron_test_features.csv')
features = test_features.columns.tolist()
train_data_y = train_data['poi']
train_data = train_data.drop(['poi'],axis=1)

# Clean train and test together so both get the same column handling:
# drop the identifier columns and replace missing values with 0.
pre_process_data  = pd.concat([train_data,test_features])
pre_process_data = pre_process_data.drop(['name','email_address'],axis = 1)
features = [i for i in features if i not in ['name','email_address']]
pre_process_data = pre_process_data.fillna(0)

fin_features = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses',
                'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary',
                'total_payments', 'total_stock_value']
email_features = ['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi',
                  'to_messages']

# Split the combined frame back into its train / test parts by position.
train_data = pre_process_data[:len(train_data_y)]
test_data = pre_process_data[len(train_data_y):]

# Min-max scale the financial columns. The scaler is fitted on the training
# rows only and then applied to both splits; the email columns are left as-is.
mms = MinMaxScaler()
mms_ = mms.fit(train_data.loc[:,fin_features])
train_data_fin = mms_.transform(train_data.loc[:,fin_features])
test_data_fin = mms_.transform(test_data.loc[:,fin_features])

# Reuse each source frame's index so the column-wise concat below lines the
# scaled rows up with their email columns explicitly, not by coincidence.
train_data_fin = pd.DataFrame(train_data_fin, columns = fin_features, index = train_data.index)
test_data_fin = pd.DataFrame(test_data_fin, columns = fin_features, index = test_data.index)

train_data = pd.concat([train_data_fin,train_data[email_features]], axis = 1)
train_data_clean = pd.concat([train_data,train_data_y],axis = 1)
test_data_clean = pd.concat([test_data_fin,test_data[email_features]], axis = 1)

# to_csv does not create missing directories, so make sure the target exists.
Path('./data/output').mkdir(parents=True, exist_ok=True)
train_data_clean.to_csv('./data/output/Enron_train_data_clean.csv', index = False)
test_data_clean.to_csv('./data/output/Enron_test_features_clean.csv', index = False)