In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
import math
import scipy.stats as stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# in the order os.walk visits them, then echo one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data input

# Lift pandas' display limits so frames are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# The file is read without a header row, so the columns arrive numbered 0..4;
# map each position to a readable name.
IRIS_COLUMN_NAMES = {
    0: "sepal length",
    1: "sepal width",
    2: "petal length",
    3: "petal width",
    4: "class",
}
df = pd.read_csv('../input/hw1-data/iris.csv', header=None).rename(columns=IRIS_COLUMN_NAMES)

In [None]:
# A notebook cell only echoes its final expression, so the summary statistics
# have to be printed explicitly or they are computed and thrown away.
print(df.describe())
# Column labels of the frame (echoed as the cell's output).
df.columns

In [None]:
# Data visualization: grouped bars comparing the mean and standard deviation
# of each of the first four columns over the whole data set.
plt.style.use('ggplot')
data = df.describe()
bar_width = 0.2
alpha = 0.6
index = np.arange(2)  # one bar group for "mean", one for "std"

for fe in df.columns[:4]:
    # Rows 1 and 2 of describe() are 'mean' and 'std'.
    mean_and_std = data.loc[['mean', 'std'], fe]
    plt.bar(index, mean_and_std, width=bar_width, alpha=alpha, label=fe)
    # Slide one bar-width to the right for the next column's bars.
    index = index + 0.2

# After four shifts of 0.2 the positions have moved by 0.8; undo that and
# offset by 0.25 to place the tick labels under each group.
tick_positions = index - 0.8 + 0.25
plt.xticks(tick_positions, ("average", "standard deviation"))
plt.title("Data distribution")
plt.ylabel("cm")
# plt.xlabel("")
plt.legend()
plt.grid(True)
plt.savefig('iris_data_distribution.png')
plt.show()

In [None]:
# Per-class mean and standard deviation, drawn as one grouped bar chart per
# feature (first four columns) and saved to its own PNG.
bar_width = 0.1
alpha = 0.6
group_start = np.array([0.1, 0.5])  # x of the first bar in the "mean" / "std" groups
plt.style.use('ggplot')

# describe() depends only on the class, not on the feature being plotted, so
# compute it once per class instead of once per (feature, class) pair.
class_summaries = {
    c: df.loc[df['class'] == c].describe()
    for c in np.unique(df['class'])
}

for fe in df.columns[:4]:
    # Restart the bar positions for every figure; previously the offsets kept
    # accumulating, so each successive chart was drawn further along the x-axis.
    index = group_start.copy()
    for c, summary in class_summaries.items():
        plt.bar(index,
                summary[fe][1:3],  # rows 1 and 2 of describe(): mean, std
                bar_width,
                alpha=alpha,
                label=c)
        index = index + bar_width

    plt.ylabel("cm")
    # plt.xlabel("")
    plt.title("feature distribution: " + fe)
    # Centre each tick label under its group of per-class bars.
    group_centre = group_start + bar_width * (len(class_summaries) - 1) / 2
    plt.xticks(group_centre, ("average", "standard deviation"))
    plt.legend()
    plt.grid(True)
    plt.savefig('iris_by_class_' + fe + '_distribution.png')
    plt.show()

In [None]:
# Overlay a 10-bin histogram of each of the first four columns on one figure.
plt.style.use('ggplot')
kwargs = {'histtype': 'stepfilled', 'alpha': 0.7, 'bins': 10, 'ec': "k"}

for fe in df.columns[:4]:
    plt.hist(df[fe], label=fe, **kwargs)

plt.xlabel("cm")
plt.ylabel("value frequency")
plt.title("Data distribution")
plt.legend()
plt.grid(True)
# NOTE(review): 'freture' looks like a typo for 'feature'; the file name is
# left as-is in case anything already refers to the saved image.
plt.savefig('iris_freture_frequency.png')
plt.show()

In [None]:
# For each of the first four columns, overlay one 10-bin histogram per class
# and save the figure under a per-feature file name.
plt.style.use('ggplot')
kwargs = {'histtype': 'stepfilled', 'alpha': 0.6, 'bins': 10, 'ec': "k"}
class_names = np.unique(df['class'])

for fe in df.columns[:4]:
    for c in class_names:
        values_for_class = df.loc[df['class'] == c, fe]
        plt.hist(values_for_class, label=c, **kwargs)

    plt.xlabel("cm")
    plt.ylabel("Value Frequency")
    plt.title("feature distribution: " + fe)
    plt.legend()
    plt.grid(True)
    plt.savefig('iris_by_class_' + fe + '.png')
    plt.show()

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts) as a
# quick check before the preprocessing cell that follows.
df.info()

In [None]:
# Data preprocessing
# (No column is dropped for missing values in this step.)

# Randomise the row order with a fixed seed, then renumber the rows from 0
# and discard the old index.
df = shuffle(df, random_state=0).reset_index(drop=True)

# Separate the label column from the measurements; both are independent copies.
X = df.drop(columns='class').copy()
y = df['class'].copy()

# Echo the distinct class labels as the cell's output.
np.unique(y)

In [None]:
# Model construction with holdout validation: train Gaussian naive Bayes on
# 70% of the rows and report its metrics on the remaining 30%.

# Fixed label order shared by the confusion matrix and the per-class metrics,
# so every row/column/score lines up with the printed headings.
CLASS_LABELS = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)

clf = GaussianNB()
# clf = CategoricalNB(alpha= 0)
clf.fit(X_train, y_train)
y_true, y_pred = y_valid, clf.predict(X_valid)

# Compute each per-class metric once; passing labels pins the order and length
# of the returned arrays to CLASS_LABELS.
recall = recall_score(y_true, y_pred, labels=CLASS_LABELS, average=None)
precision = precision_score(y_true, y_pred, labels=CLASS_LABELS, average=None)

#Show result
print(pd.DataFrame(confusion_matrix(y_true, y_pred, labels=CLASS_LABELS),
                   index=['actually setosa', 'actually versicolor', 'actually virginica'],
                   columns=['predict setosa', 'predict versicolor', 'predict virginica']))
print('\naccuracy  {:.7}'.format(accuracy_score(y_true, y_pred)))
print('\n          setosa   versicolor   virginica\nrecall    {:.7}      {:.7}    {:.7}'.format(*recall))
# The virginica column previously printed recall_score here instead of precision.
print('precision {:.7}      {:.7}          {:.7}'.format(*precision))

In [None]:
# Question: histogram of petal length for the Iris-versicolor rows of the
# training split, with a normal density (sample mean / std) drawn on top.
# X_train and y_train must still be the pandas objects from the holdout cell;
# the K-fold cell later rebinds them to NumPy arrays.
x = pd.concat([X_train, y_train], axis=1)
is_versicolor = x['class'] == 'Iris-versicolor'
x = x.loc[is_versicolor, 'petal length'].to_numpy()
mean, std = x.mean(), x.std()

kwargs = {'histtype': 'stepfilled', 'alpha': 0.7, 'density': True, 'bins': 10, 'ec': "k"}
plt.hist(x, **kwargs)

# Evaluate the fitted normal density across the x-range the histogram occupies.
xmin, xmax = plt.xlim()
x_axis = np.linspace(xmin, xmax, 100)
y_axis = stats.norm.pdf(x_axis, loc=mean, scale=std)

plt.style.use('ggplot')
plt.plot(x_axis, y_axis)
plt.title("pdf")
plt.xlabel("petal length(cm)")
plt.ylabel("Density")
# Annotate the plot with the fitted parameters at fixed data coordinates.
plt.text(3.05, 1.01, "μ = {:.7f}\nσ = {:.7f}".format(mean, std), fontsize=14)
plt.grid(True)
plt.savefig('iris_pdf.png')
plt.show()

# print('mean:{:.7f}\nstd:{:.7f}'.format(mean, std))

In [None]:
# Model construction with K-fold cross-validation: fit Gaussian naive Bayes on
# each fold's training rows and average the validation metrics across folds.

N_SPLITS = 3
# Fixed label order so every fold's confusion matrix and per-class scores have
# the same shape and column meaning, even if a fold happens to miss a class.
CLASS_LABELS = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

cfm = []  # per-fold confusion matrices
acc = []  # per-fold accuracy
sen = []  # per-fold recall (sensitivity), one value per class
pre = []  # per-fold precision, one value per class

kf = KFold(n_splits=N_SPLITS)

for train_index, vali_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", vali_index)
    X_train, X_valid = X.values[train_index], X.values[vali_index]
    y_train, y_valid = y.values[train_index], y.values[vali_index]

    clf = GaussianNB()
    # clf = CategoricalNB(alpha= 0)
    clf.fit(X_train, y_train)
    y_true, y_pred = y_valid, clf.predict(X_valid)

    cfm.append(confusion_matrix(y_true, y_pred, labels=CLASS_LABELS))
    acc.append(accuracy_score(y_true, y_pred))
    sen.append(recall_score(y_true, y_pred, labels=CLASS_LABELS, average=None))
    pre.append(precision_score(y_true, y_pred, labels=CLASS_LABELS, average=None))

# Average over however many folds were run rather than a hard-coded three.
mean_cfm = np.mean(cfm, axis=0)
mean_sen = np.mean(sen, axis=0)
mean_pre = np.mean(pre, axis=0)

#Show result
print(pd.DataFrame(mean_cfm,
                   index=['actually setosa', 'actually versicolor', 'actually virginica'],
                   columns=['predict setosa', 'predict versicolor', 'predict virginica']).round(0).astype(int))
print('\naccuracy  {:.7}'.format(sum(acc) / len(acc)))
print('\n          setosa   versicolor   virginica\nrecall    {:.7}      {:.7}    {:.7}'.format(*mean_sen))
# The virginica column previously printed the averaged recall here instead of precision.
print('precision {:.7}      {:.7}    {:.7}'.format(*mean_pre))

In [None]:
#Question
# fig_df = pd.DataFrame([0, 0, 0, 0, 0, 0, 0, 0, 0], index= ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y'], columns= ['prob']) 
# num_e = df.loc[df[0] == 'e'].shape[0]
# num_e_and_fe14 = df.loc[df[0] == 'e'].groupby(14).size()
# cat = list(num_e_and_fe14.index)
# for i in range(0, len(cat)):
#     fig_df.loc[cat[i], 'prob'] += (num_e_and_fe14[i] / num_e)
# fig_df