In [98]:
import pandas as pd
import numpy as np
import datetime

# SELECT train data 
train = pd.read_csv('./train_nt6.csv')

test = pd.read_csv('./test01_13.csv')
test2 = pd.read_csv('./test02_13.csv')

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.utils import shuffle

In [99]:
train.shape

(493, 8)

In [100]:
test.shape

(150, 8)

In [101]:
test2.shape

(222, 8)

In [102]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # setting seaborn default for plots

In [103]:
'''
def bar_chart(feature):
    work = train[train['work']==1][feature].value_counts()
    dead = train[train['work']==0][feature].value_counts()
    df = pd.DataFrame([work,dead])
    df.index = ['work','dead']
    df.plot(kind='bar',stacked=True, figsize=(10,5))

bar_chart('temp_avg_gap')
bar_chart('status')
'''

"\ndef bar_chart(feature):\n    work = train[train['work']==1][feature].value_counts()\n    dead = train[train['work']==0][feature].value_counts()\n    df = pd.DataFrame([work,dead])\n    df.index = ['work','dead']\n    df.plot(kind='bar',stacked=True, figsize=(10,5))\n\nbar_chart('temp_avg_gap')\nbar_chart('status')\n"

In [104]:
train_data = train.drop('work', axis=1)

test = test.drop('work', axis=1)
test2 = test2.drop('work', axis=1)

target = train["work"]

train_data.shape, test.shape, test2.shape

((493, 7), (150, 7), (222, 7))

In [105]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 8 columns):
temp_avg_gap    493 non-null float64
hum_avg_gap     493 non-null float64
temp_max_min    493 non-null float64
temp_min_max    493 non-null float64
hum_max_min     493 non-null float64
hum_min_max     493 non-null float64
status          493 non-null int64
work            493 non-null int64
dtypes: float64(6), int64(2)
memory usage: 30.9 KB


In [106]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [107]:
clf = KNeighborsClassifier(n_neighbors = 7)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.8        0.78       0.78       0.85714286 0.79591837 0.85714286
 0.93877551 0.85714286 0.73469388 0.87755102]


In [108]:
# kNN Score
round(np.mean(score)*100, 2)

82.78

In [109]:
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.9        0.9        0.98       0.85714286 0.87755102 0.97959184
 0.93877551 0.89795918 0.85714286 0.91836735]


In [110]:
# decision tree Score
round(np.mean(score)*100, 2)

91.07

In [111]:
clf = RandomForestClassifier(n_estimators=7)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.86       0.94       0.92       0.91836735 0.89795918 0.91836735
 0.93877551 0.93877551 0.91836735 0.91836735]


In [112]:
# Random Forest Score
round(np.mean(score)*100, 2)

91.69

In [113]:
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.74       0.66       0.74       0.73469388 0.73469388 0.63265306
 0.6122449  0.6122449  0.59183673 0.67346939]


In [114]:
# Naive Bayes Score
round(np.mean(score)*100, 2)

67.32

In [115]:
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.74       0.74       0.78       0.79591837 0.79591837 0.83673469
 0.83673469 0.7755102  0.71428571 0.73469388]


In [116]:
round(np.mean(score)*100,2)

77.5

In [117]:
clf = KNeighborsClassifier(n_neighbors = 7)
clf.fit(train_data, target)

test_data = test.copy()
test2_data = test2.copy()

predict = clf.predict(test_data)
predict2 = clf.predict(test2_data)

result =pd.DataFrame({
    'status': test['status'],
    'work': predict
})

result2 =pd.DataFrame({
    'status': test2['status'],
    'work': predict2
})

result.to_csv('result_KNN.csv', index=False)
result2.to_csv('result2_KNN.csv', index=False)

result_KNN = pd.read_csv('result_KNN.csv')
result2_KNN = pd.read_csv('result2_KNN.csv')

result_KNN.head()

Unnamed: 0,status,work
0,33,0
1,33,1
2,32,1
3,32,1
4,32,1


In [118]:
clf = DecisionTreeClassifier()
clf.fit(train_data, target)

test_data = test.copy()
test2_data = test2.copy()

predict = clf.predict(test_data)
predict2 = clf.predict(test2_data)

result =pd.DataFrame({
    'status': test['status'],
    'work': predict
})

result2 =pd.DataFrame({
    'status': test2['status'],
    'work': predict2
})

result.to_csv('result_DT.csv', index=False)
result2.to_csv('result2_DT.csv', index=False)

result_DT = pd.read_csv('result_DT.csv')
result2_DT = pd.read_csv('result2_DT.csv')

result_DT.head()

Unnamed: 0,status,work
0,33,0
1,33,1
2,32,1
3,32,1
4,32,1


In [119]:
clf = RandomForestClassifier(n_estimators=7)
clf.fit(train_data, target)

test_data = test.copy()
test2_data = test2.copy()

predict = clf.predict(test_data)
predict2 = clf.predict(test2_data)

result =pd.DataFrame({
    'status': test['status'],
    'work': predict
})

result2 =pd.DataFrame({
    'status': test2['status'],
    'work': predict2
})

result.to_csv('result_RF.csv', index=False)
result2.to_csv('result2_RF.csv', index=False)

result_RF = pd.read_csv('result_RF.csv')
result2_RF = pd.read_csv('result2_RF.csv')

result_RF.head()

Unnamed: 0,status,work
0,33,1
1,33,1
2,32,1
3,32,1
4,32,1


In [120]:
clf = GaussianNB()
clf.fit(train_data, target)

test_data = test.copy()
test2_data = test2.copy()

predict = clf.predict(test_data)
predict2 = clf.predict(test2_data)

result =pd.DataFrame({
    'status': test['status'],
    'work': predict
})

result2 =pd.DataFrame({
    'status': test2['status'],
    'work': predict2
})

result.to_csv('result_NB.csv', index=False)
result2.to_csv('result2_NB.csv', index=False)

result_NB = pd.read_csv('result_NB.csv')
result2_NB = pd.read_csv('result2_NB.csv')

result_NB.head()

Unnamed: 0,status,work
0,33,1
1,33,1
2,32,1
3,32,1
4,32,0


In [121]:
clf = SVC()
clf.fit(train_data, target)

test_data = test.copy()
test2_data = test2.copy()

predict = clf.predict(test_data)
predict2 = clf.predict(test2_data)

result =pd.DataFrame({
    'status': test['status'],
    'work': predict
})

result2 =pd.DataFrame({
    'status': test2['status'],
    'work': predict2
})

result.to_csv('result_SVC.csv', index=False)
result2.to_csv('result2_SVC.csv', index=False)

result_SVC = pd.read_csv('result_SVC.csv')
result2_SVC = pd.read_csv('result2_SVC.csv')

result_SVC.head()

Unnamed: 0,status,work
0,33,0
1,33,1
2,32,1
3,32,1
4,32,1


In [122]:
print("KNN count: ", result_KNN['work'].value_counts()[1]/(result_KNN['work'].value_counts()[1] + result_KNN['work'].value_counts()[0]))
print("DT  count: ", result_DT['work'].value_counts()[1]/(result_DT['work'].value_counts()[1] + result_DT['work'].value_counts()[0]))
print("RF  count: ", result_RF['work'].value_counts()[1]/(result_RF['work'].value_counts()[1] + result_RF['work'].value_counts()[0]))
print("NB  count: ", result_NB['work'].value_counts()[1]/(result_NB['work'].value_counts()[1] + result_NB['work'].value_counts()[0]))
print("SVC count: ", result_SVC['work'].value_counts()[1]/(result_SVC['work'].value_counts()[1] + result_SVC['work'].value_counts()[0]))

KNN count:  0.5666666666666667
DT  count:  0.6866666666666666
RF  count:  0.6666666666666666
NB  count:  0.36666666666666664
SVC count:  0.5266666666666666


In [123]:
print("KNN count: ", result2_KNN['work'].value_counts()[0]/(result2_KNN['work'].value_counts()[1] + result2_KNN['work'].value_counts()[0]))
print("DT  count: ", result2_DT['work'].value_counts()[0]/(result2_DT['work'].value_counts()[1] + result2_DT['work'].value_counts()[0]))
print("RF  count: ", result2_RF['work'].value_counts()[0]/(result2_RF['work'].value_counts()[1] + result2_RF['work'].value_counts()[0]))
print("NB  count: ", result2_NB['work'].value_counts()[0]/(result2_NB['work'].value_counts()[1] + result2_NB['work'].value_counts()[0]))
print("SVC count: ", result2_SVC['work'].value_counts()[0]/(result2_SVC['work'].value_counts()[1] + result2_SVC['work'].value_counts()[0]))

KNN count:  0.536036036036036
DT  count:  0.6621621621621622
RF  count:  0.8063063063063063
NB  count:  0.6621621621621622
SVC count:  0.8018018018018018


In [124]:
print("KNN count: ", (result_KNN['work'].value_counts()[1] +  result2_KNN['work'].value_counts()[0]) / (result_KNN['work'].value_counts()[1] + result_KNN['work'].value_counts()[0] + result2_KNN['work'].value_counts()[1] + result2_KNN['work'].value_counts()[0]))
print("DT  count: ", (result_DT['work'].value_counts()[1] +  result2_DT['work'].value_counts()[0]) / (result_DT['work'].value_counts()[1] + result_DT['work'].value_counts()[0] + result2_DT['work'].value_counts()[1] + result2_DT['work'].value_counts()[0]))
print("RF  count: ", (result_RF['work'].value_counts()[1] +  result2_RF['work'].value_counts()[0]) / (result_RF['work'].value_counts()[1] + result_RF['work'].value_counts()[0] + result2_RF['work'].value_counts()[1] + result2_RF['work'].value_counts()[0]))
print("NB  count: ", (result_NB['work'].value_counts()[1] +  result2_NB['work'].value_counts()[0]) / (result_NB['work'].value_counts()[1] + result_NB['work'].value_counts()[0] + result2_NB['work'].value_counts()[1] + result2_NB['work'].value_counts()[0]))
print("SVC count: ", (result_SVC['work'].value_counts()[1] +  result2_SVC['work'].value_counts()[0]) / (result_SVC['work'].value_counts()[1] + result_SVC['work'].value_counts()[0] + result2_SVC['work'].value_counts()[1] + result2_SVC['work'].value_counts()[0]))

KNN count:  0.5483870967741935
DT  count:  0.6720430107526881
RF  count:  0.75
NB  count:  0.543010752688172
SVC count:  0.6908602150537635
