In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Data Input

# Lift pandas' column display limit so no column is elided when a frame is shown.
pd.set_option('display.max_columns', None)

# header=None: no row of the file is used as a header, so the columns are
# labelled with integers (0, 1, 2, ...), which the later cells rely on.
df = pd.read_csv(
    '../input/hw1-data/agaricus-lepiota.csv',
    header=None,
)

In [None]:
#Data Visualization
plt.style.use('ggplot')
kwargs = dict(alpha=0.7, color = 'g', ec="k")


def _plot_value_frequencies(frame, prefix, **bar_kwargs):
    """Draw, save and show one value-frequency bar chart per feature column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to summarise. The first column is skipped; every other column
        gets its own chart.
    prefix : str
        Text prepended to the column label to build the PNG file name
        (``prefix + str(column) + '.png'``).
    **bar_kwargs
        Styling forwarded unchanged to ``Axes.bar`` (alpha, color, edge colour).
    """
    for fe in frame.columns[1:]:
        # groupby sorts its keys, so the index holds the sorted distinct values
        # and `.values` holds the matching row counts.
        counts = frame.groupby(fe).size()
        # One dedicated figure per chart: bars from one feature can never end up
        # on the chart of another, whatever the backend does in plt.show().
        fig, ax = plt.subplots()
        ax.bar(list(counts.index), counts.values, **bar_kwargs)
        ax.set_title("Feature" + str(fe))
        ax.set_xlabel("feature value")
        ax.set_ylabel("value frequency")
        ax.grid(True)
        fig.savefig(prefix + str(fe) + '.png')
        plt.show()
        # Release the figure explicitly; many figures are created in this cell.
        plt.close(fig)


# Three passes over the features: all rows (green), class 'e' only (blue) and
# class 'p' only (orange). Each subset is filtered once instead of per feature.
for subset, color, prefix in (
    (df, 'g', ''),
    (df.loc[df[0] == 'e'], 'b', 'e_'),
    (df.loc[df[0] == 'p'], 'orange', 'p_'),
):
    kwargs['color'] = color
    _plot_value_frequencies(subset, prefix, **kwargs)

In [None]:
#Data Preprocessing

# Feature 11 is the one with missing values, so it is discarded entirely.
# NOTE(review): `df` is overwritten here, so running this cell a second time
# without reloading the data fails because column 11 is already gone.
df = df.drop(columns=[11])

# Randomise the row order with a fixed seed, then renumber the rows from 0.
df = shuffle(df, random_state=0).reset_index(drop=True)

# Column 0 is the target; every remaining column is a feature.
y = df[0].copy()

# One-hot encode the feature columns.
X = pd.get_dummies(df.drop(columns=[0]).copy())

In [None]:
#Model Construction with Holdout validation

# Fixed class order used for every per-class metric below, so the 'e' / 'p'
# columns of the report cannot silently swap or depend on implicit sorting.
class_labels = ['e', 'p']

# 70 % of the rows train the model, the remaining 30 % validate it.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

clf = CategoricalNB()
# Alternative kept from the original experiment: clf = CategoricalNB(alpha= 0)
clf.fit(X_train, y_train)
y_true, y_pred = y_valid, clf.predict(X_valid)

# Compute each per-class metric once (previously every score was computed twice).
recall = recall_score(y_true, y_pred, labels = class_labels, average = None)
precision = precision_score(y_true, y_pred, labels = class_labels, average = None)

#Show result
print(pd.DataFrame(confusion_matrix(y_true, y_pred, labels = class_labels), index= ['actually e', 'actually p'], columns= ['predict e', 'predict p']))
print('\naccuracy  {:.7}'.format(accuracy_score(y_true, y_pred)))
print('\n          e          p\nrecall    {:.7}  {:.7}'.format(recall[0], recall[1]))
print('precision {:.7}  {:.7}'.format(precision[0], precision[1]))

In [None]:
#Model Construction with K-fold cross-validation

# Number of folds. Every average below is derived from the collected results,
# so changing this value needs no other edit.
N_SPLITS = 3
# Fixed class order for the confusion matrix and the per-class metrics.
class_labels = ['e', 'p']

# Per-fold results: confusion matrix, accuracy, per-class recall, per-class precision.
cfm = []
acc = []
sen = []
pre = []

kf = KFold(n_splits=N_SPLITS)

# Plain arrays, because the fold indices produced by KFold are positional.
X_values, y_values = X.values, y.values

for train_index, vali_index in kf.split(X_values):
    X_train, X_valid = X_values[train_index], X_values[vali_index]
    y_train, y_valid = y_values[train_index], y_values[vali_index]

    clf = CategoricalNB()
    # Alternative kept from the original experiment: clf = CategoricalNB(alpha= 0)
    clf.fit(X_train, y_train)
    y_true, y_pred = y_valid, clf.predict(X_valid)

    cfm.append(confusion_matrix(y_true, y_pred, labels = class_labels))
    acc.append(accuracy_score(y_true, y_pred))
    sen.append(recall_score(y_true, y_pred, labels = class_labels, average = None))
    pre.append(precision_score(y_true, y_pred, labels = class_labels, average = None))

# Average every metric over the folds actually run (element-wise for arrays).
mean_cfm = sum(cfm) / len(cfm)
mean_acc = sum(acc) / len(acc)
mean_sen = sum(sen) / len(sen)
mean_pre = sum(pre) / len(pre)

#Show result
print(pd.DataFrame(mean_cfm, index= ['actually e', 'actually p'], columns= ['predict e', 'predict p']).round(0).astype(int))
print('\naccuracy  {:.7}'.format(mean_acc))
print('\n          e          p\nrecall    {:.7}  {:.7}'.format(mean_sen[0], mean_sen[1]))
print('precision {:.7}  {:.7}'.format(mean_pre[0], mean_pre[1]))

In [None]:
#Question
# Estimate P(feature 14 = v | class 'e') for every listed value v, once as the
# plain relative frequency and once with Laplace (add-one) smoothing.

# Values feature 14 is allowed to take; values absent from class 'e' count as 0.
categories = ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y']

edible = df.loc[df[0] == 'e']
num_e = edible.shape[0]
num_e_and_fe14 = edible.groupby(14).size()

# Align the observed counts with the full value list by label. The previous
# version looked counts up with integer keys on a string-labelled Series, which
# depends on pandas' deprecated positional fallback.
# rename_axis(None) drops the group-key name so the printed tables keep a plain index.
counts = num_e_and_fe14.reindex(categories, fill_value=0).rename_axis(None)

# Without smoothing: count / N. Unobserved values get probability 0.
fig_df = (counts / num_e).to_frame('prob')
# With add-one smoothing: (count + 1) / (N + number of possible values).
fig_df_lps = ((counts + 1) / (num_e + 1 * len(categories))).to_frame('prob')

print(fig_df)
print(fig_df_lps)

In [None]:
plt.style.use('ggplot')


def _plot_probabilities(prob_df, title, color, filename):
    """Draw, save and show a labelled bar chart of the 'prob' column.

    Parameters
    ----------
    prob_df : pandas.DataFrame
        Frame whose index supplies the bar positions and whose 'prob' column
        supplies the bar heights.
    title : str
        Chart title.
    color : str
        Bar colour.
    filename : str
        Path the figure is saved to before it is shown.
    """
    fig, ax = plt.subplots()
    bars = ax.bar(prob_df.index, prob_df['prob'], color = color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel("feature value")
    ax.set_ylabel("probability")
    ax.grid(True)
    # Write each bar's value just above its top, tilted to limit overlap.
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width()/2.,
            height*1.05,
            '%.4f' % float(height),
            ha = "center",
            va = "bottom",
            rotation = 60
        )
    fig.savefig(filename)
    plt.show()
    # Release the figure so the next chart starts from a clean canvas.
    plt.close(fig)


_plot_probabilities(fig_df, "without Laplace smoothing        ", 'blue', 'without Laplace smoothing.png')
_plot_probabilities(fig_df_lps, "with Laplace smoothing        ", 'red', 'with Laplace smoothing.png')