# 引入資料

In [12]:
import pandas as pd
import numpy as np

#前處理: Normalization::MinMaxScaler 最大最小化 0~1
#前處理: standardization::StandardScaler -1~1 平均0
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

import warnings
warnings.filterwarnings('ignore')

In [13]:
data_path = 'data/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')
df_train.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,1617011.0,174839.0,True,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,1920000.0,22122.0,True,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,1573324.0,True,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,602671.0,907502.0,True,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,375304.0,486.0,True,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [14]:
df_train.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
count,61.0,28.0,34.0,13.0,81.0,73.0,65.0,65.0,65.0,2.0,49.0,69.0,82.0,10.0,73.0,65.0,65.0,96.0,98.0
mean,1147436.0,634437.4,-462566.4,89397.846154,2985081.0,51040.547945,711.323077,64.8,40.092308,40962500.0,792617.1,447177.4,1294855.0,-221885.7,273902.5,1111.369231,2156.061538,2590977.0,3527136.0
std,1505189.0,860364.6,809539.2,41143.391399,6004174.0,47596.682104,2074.497628,91.863214,88.901407,57364040.0,950464.5,1341564.0,2498335.0,205191.374121,171664.7,1165.852016,2811.676718,10566450.0,7182997.0
min,70000.0,-102500.0,-3504386.0,3285.0,3285.0,148.0,12.0,0.0,0.0,400000.0,71023.0,2.0,44093.0,-560222.0,477.0,2.0,57.0,148.0,-44093.0
25%,450000.0,76567.5,-552703.2,101250.0,400478.0,18834.0,19.0,10.0,0.0,20681250.0,275000.0,972.0,268922.0,-389621.75,206121.0,178.0,517.0,302402.5,421151.8
50%,750000.0,195190.0,-117534.0,108579.0,850010.0,41953.0,45.0,28.0,7.0,40962500.0,422158.0,52382.0,462822.5,-139856.5,251654.0,599.0,1088.0,1106740.0,997971.0
75%,1000000.0,834205.2,-27083.25,112492.0,2165172.0,59175.0,215.0,88.0,27.0,61243750.0,831809.0,362096.0,966490.5,-77953.25,288589.0,1902.0,2649.0,1985668.0,2493616.0
max,8000000.0,2964506.0,-1042.0,125034.0,34348380.0,228763.0,14368.0,528.0,411.0,81525000.0,5145434.0,10359730.0,14761690.0,44093.0,1111258.0,4527.0,15149.0,103559800.0,49110080.0


In [15]:
# 將train中的poi當作label取出來放到train_Y
df_train['poi'] = df_train['poi'].astype(float)
train_Y = df_train['poi']
ids = df_test['name']
# train資料拿掉label，並拿掉不需要的data(姓名和mail跟犯罪無關)
df_train = df_train.drop(['name', 'email_address', 'poi'] , axis=1)
df_test = df_test.drop(['name', 'email_address'] , axis=1)

# 先將兩筆資料合起來一起做處理，最後再分開做predict
df = pd.concat([df_train, df_test])
df

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,1000000.0,,,,1465734.0,38559.0,40.0,23.0,8.0,,369721.0,425688.0,378082.0,,213625.0,1336.0,1607.0,2047593.0,1843816.0
29,1500000.0,,,,1835558.0,,92.0,28.0,23.0,,554422.0,1852186.0,1293424.0,,428780.0,2103.0,3187.0,4335388.0,3128982.0
30,,504610.0,,,2218275.0,,12.0,0.0,0.0,,461912.0,,,,,23.0,169.0,966522.0,2218275.0
31,200000.0,204075.0,,,2549361.0,57727.0,,,,,175000.0,2630.0,514847.0,,236457.0,,,875889.0,3064208.0


# 補缺失值

In [16]:
# 查看缺失值情況
def na_check(df_data):
    na = (df_data.isnull().sum() / len(df_data))*100
    na = na.drop(na[na==0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio': na})
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_this_person_to_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589


In [17]:
# loan_advances(高額貸款)缺值補0
df["loan_advances"] = df["loan_advances"].fillna(0)

In [18]:
# director_fees 補0
df["director_fees"] = df["director_fees"].fillna(0)

In [19]:
# restricted_stock_deferred 補0
df["restricted_stock_deferred"] = df["restricted_stock_deferred"].fillna(0)

In [20]:
# deferral_payments 補0
df["deferral_payments"] = df["deferral_payments"].fillna(0)

In [21]:
# deferred_income 補0
df["deferred_income"] = df["deferred_income"].fillna(0)

In [22]:
# long_term_incentive 補0
df["long_term_incentive"] = df["long_term_incentive"].fillna(0)

In [23]:
# bonus nan補0, 取中值, 用中值取代0
bonus_fillna = df["bonus"].fillna(0)
bonus_median = np.median(bonus_fillna)
df["bonus"] = df["bonus"].fillna(bonus_median)

In [24]:
# from_this_person_to_poi 補0
df["from_this_person_to_poi"] = df["from_this_person_to_poi"].fillna(0)

In [25]:
# from_poi_to_this_person 補0
df["from_poi_to_this_person"] = df["from_poi_to_this_person"].fillna(0)

In [26]:
# from_messages 補0
df["from_messages"] = df["from_messages"].fillna(0)

In [27]:
na_check(df)

Unnamed: 0,Missing Ratio
to_messages,41.09589
shared_receipt_with_poi,41.09589
other,36.30137
salary,34.931507
expenses,34.931507
exercised_stock_options,30.136986
restricted_stock,24.657534
total_payments,14.383562
total_stock_value,13.69863


In [28]:
df = df.fillna(0)

In [29]:
na_check(df)

Unnamed: 0,Missing Ratio


In [30]:
df.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,1464981.0,438796.5,-382762.2,19422.49,4182736.0,70748.27,358.60274,38.226027,24.287671,1149658.0,664683.9,585431.8,1749257.0,20516.37,365811.4,692.986301,1221.589041,4350622.0,5846018.0
std,8073567.0,2741325.0,2378250.0,119054.3,26070400.0,432716.3,1441.259868,73.901124,79.278206,9649342.0,4046072.0,3682345.0,10899950.0,1439661.0,2203575.0,1072.969492,2226.770637,26934480.0,36246810.0
min,70000.0,-102500.0,-27992890.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2604490.0,-7576788.0,0.0,0.0,0.0,0.0,-44093.0
25%,300000.0,0.0,-37926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8115.0,0.0,0.0,0.0,0.0,93944.75,228869.5
50%,300000.0,0.0,0.0,0.0,608293.5,20182.0,16.5,2.5,0.0,0.0,0.0,959.5,360528.0,0.0,210596.0,102.5,289.0,941359.5,965955.0
75%,800000.0,9684.5,0.0,0.0,1714221.0,53740.75,51.25,40.75,13.75,0.0,375064.8,150606.5,814528.0,0.0,270850.5,893.5,1585.75,1968287.0,2319991.0
max,97343620.0,32083400.0,0.0,1398517.0,311764000.0,5235198.0,14368.0,528.0,609.0,83925000.0,48521930.0,42667590.0,130322300.0,15456290.0,26704230.0,5521.0,15149.0,309886600.0,434509500.0


# 去離群/偏態

In [33]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.regplot(x = df['to_messages'], y = train_Y)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 146 and the array at index 1 has size 113

# 標準化

In [None]:
df_mm = MinMaxScaler().fit_transform(df)
df_sc = StandardScaler().fit_transform(df)

# 拆分資料

In [20]:
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.25, random_state=4)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((84, 19), (29, 19), (84,), (29,))

# 訓練模型、測試

## NAIBE BAYES

In [25]:
# NAIBE BAYES
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train, y_train) #ravel()轉一維 #訓練模型
predicted = model.predict(x_test) #用test set進行預測
print('Naive Bayes',accuracy_score(y_test, predicted))

Naive Bayes 0.896551724137931


## GradientBoostingClassifier

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
gdbt_model = GradientBoostingClassifier()
gdbt_model.fit(x_train, y_train)
predicted = model.predict(x_test)
print('GradientBoostingClassifier',accuracy_score(y_test, predicted))

GradientBoostingClassifier 0.896551724137931


## KNeighborsClassifier

In [27]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train.values.ravel())
predicted = knn_model.predict(x_test)
print('KNN', accuracy_score(y_test, predicted))

KNN 0.8620689655172413


## RANDOM FOREST

In [28]:
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier(n_estimators = 100, max_depth=4, random_state = 99)
rfc_model.fit(x_train, y_train.values.ravel())
predicted = rfc_model.predict(x_test)
print('Random Forest', accuracy_score(y_test, predicted))

Random Forest 0.8620689655172413


## LOGISTIC REGRESSION

In [29]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver = 'saga') #solver:優化器選擇，default=’lbfgs’
lr_model.fit(x_train,y_train.values.ravel())
lr_predicted = lr_model.predict(x_test)
print('Logistic Regression',accuracy_score(y_test, lr_predicted))

Logistic Regression 0.8620689655172413


## SVM

In [30]:
#SVM
from sklearn.svm import SVC
svc_model = SVC(gamma = 'auto')
svc_model.fit(x_train,y_train.values.ravel())
svc_predicted = svc_model.predict(x_test)
print('SVM',accuracy_score(y_test, svc_predicted))

SVM 0.8620689655172413


## DECISON TREE

In [31]:
#DECISON TREE
from sklearn.tree import DecisionTreeClassifier
dtree_model = DecisionTreeClassifier()
dtree_model.fit(x_train, y_train.values.ravel())
dtree_predicted = dtree_model.predict(x_test)
print('Decision Tree', accuracy_score(y_test, dtree_predicted))

Decision Tree 0.7931034482758621


## XGBOOST

In [32]:
#XGBOOST
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(x_train,y_train.values.ravel())
xgb_predicted = xgb.predict(x_test)
print('XGBoost',accuracy_score(y_test, xgb_predicted))

ModuleNotFoundError: No module named 'xgboost'

# 匯出CSV提交

In [35]:
gdbt_model.fit(x_train, y_train)
gdbt_predict = gdbt_model.predict(test_X)
sub = pd.DataFrame({'name': ids, 'poi': gdbt_predict})
sub.to_csv('result.csv', index=False)