<a href="https://colab.research.google.com/github/yeonghwanchoi/DS_class/blob/master/ACME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; load_data reads its CSV from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Install an "ignore" filter so warnings raised from here on are not shown.
import warnings
warnings.filterwarnings(action='ignore')

import os


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns


from sklearn.ensemble import (AdaBoostClassifier as ABC, GradientBoostingClassifier as GBC,
                              RandomForestClassifier as RFC)
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from lightgbm import LGBMClassifier as LGB
import xgboost as xgb
from xgboost import plot_importance

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, KFold

from sklearn.svm import SVC as SVC

In [None]:
def load_data(name, directory='/content/drive/MyDrive/colab/apziva/ACME/data/'):
    """Read a UTF-8 encoded CSV file into a DataFrame.

    Parameters
    ----------
    name : str
        File name of the CSV, relative to ``directory``.
    directory : str, optional
        Folder that contains the file. Defaults to the data folder on the
        mounted Google Drive, so existing ``load_data(name)`` calls keep
        working; pass another folder to run the notebook elsewhere.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    path = os.path.join(directory, name)
    return pd.read_csv(path, encoding='utf-8')
    
# Survey responses used by every later cell.
data = load_data('HappinessSurvey2020.csv')

In [None]:
%matplotlib inline

# For every column after the first, draw one figure holding the histogram of
# Y next to the histogram of that column; all figures are shown at the end.
for col in data.columns[1:]:
    pair = data[['Y', col]]
    pair.hist()
plt.show()


In [None]:
# Compare the distribution of every column after the first between the two
# values of Y. Both histograms share one axes; transparency plus a legend
# keep the second histogram from hiding the first and make the classes
# identifiable without relying on the default colour order.
for col in data.columns[1:]:
    fig, ax = plt.subplots()
    data.loc[data['Y'] == 1, col].hist(density=True, alpha=0.5, label='Y = 1', ax=ax)
    data.loc[data['Y'] == 0, col].hist(density=True, alpha=0.5, label='Y = 0', ax=ax)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('density')
    ax.legend()
    plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
sns.heatmap(
    data.corr(),
    annot=True,
    square=True,
    vmax=1.0,
    cmap=plt.cm.RdBu,
    linewidths=0.1,
    linecolor='white',
)

# Features

The visuals above suggest that X1 and X5 are important **features**.

In [None]:
# Same heatmap, restricted to X1, X5 and the label Y.
tmp_list = ['X1', 'X5', 'Y']
tmp = data[tmp_list]
sns.heatmap(
    tmp.corr(),
    annot=True,
    square=True,
    vmax=1.0,
    cmap=plt.cm.RdBu,
    linewidths=0.1,
    linecolor='white',
)

# Model_list

In [None]:
# (display name, estimator) pairs evaluated by result_table() and kfold().
# The tree-based and boosting estimators draw random numbers while fitting,
# so they get a fixed random_state; without it the reported scores change
# from one run of the notebook to the next.
models = [
    ('RandomForestClassifier', RFC(random_state=42)),
    ('DecisionTreeClassifier', DTC(random_state=42)),
    ('AdaBoostClassifier', ABC(random_state=42)),
    ('LogisticRegression', LR()),
    ('SupportVectorClassification', SVC()),
    ('XGBClassifier', xgb.XGBClassifier(random_state=42)),
]

# Funcs

Because the dataset is small, only 10% of it is held out as the test set (a 9:1 train/test split).


In [None]:
def data_split(feature, test_size=0.10, random_state=42):
    """Split the global ``data`` frame into train and test parts.

    Parameters
    ----------
    feature : str or iterable of str
        Column name(s) of ``data`` to use as model inputs. A single name is
        accepted as well as a list, tuple or pandas Index.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.10).
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is repeatable
        (default 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts contain the
        requested columns and the y parts contain ``data['Y']``.
    """
    if isinstance(feature, str):
        feature = [feature]

    # The label must never end up among the inputs, even if a caller lists it.
    columns = [col for col in feature if col != 'Y']
    inputs = data[columns]

    X_train, X_test, y_train, y_test = train_test_split(
        inputs, data['Y'], test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

def result_table(models=models):
    """Fit every model on the current split and tabulate its accuracy.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Each estimator in ``models`` is fitted in place.

    Parameters
    ----------
    models : list of (str, estimator) tuples, optional
        Display name and estimator pairs. Defaults to the notebook's
        ``models`` list as it existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'model name'``,
        ``'train score'``, ``'test score'`` and ``'importance'``.
        ``'importance'`` holds the feature importances rounded to three
        decimals, or ``0`` for estimators that do not expose
        ``feature_importances_``.
    """
    names = []
    train_score = []
    test_score = []
    importance_score = []

    for name, clf in models:
        clf.fit(X_train, y_train)

        names.append(name)
        train_score.append(accuracy_score(y_train, clf.predict(X_train)))
        test_score.append(accuracy_score(y_test, clf.predict(X_test)))

        # Only a missing attribute means "this model has no importances";
        # any other error is a real problem and must not be hidden.
        try:
            importance_score.append(np.round(clf.feature_importances_, 3))
        except AttributeError:
            importance_score.append(0)

    return pd.DataFrame({'model name': names,
                         'train score': train_score,
                         'test score': test_score,
                         'importance': importance_score})

def kfold(n_splits=5, random_state=13):
    """Cross-validate every model in the global ``models`` list.

    Uses the notebook-level ``X_train`` and ``y_train`` and scores each
    model by accuracy on shuffled K-fold splits.

    Parameters
    ----------
    n_splits : int, optional
        Number of folds (default 5).
    random_state : int, optional
        Seed for the fold shuffling (default 13).

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'name'``, ``'mean'`` and
        ``'std'`` of the per-fold accuracies.
    """
    # One splitter serves all models: with a fixed integer seed it yields the
    # same folds on every use, so every model is scored on identical splits.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    names = []
    mean = []
    std = []

    for name, model in models:
        scores = cross_val_score(model, X_train, y_train,
                                 cv=splitter, scoring='accuracy')
        names.append(name)
        mean.append(scores.mean())
        std.append(scores.std())

    return pd.DataFrame({'name': names, 'mean': mean, 'std': std})

In [None]:
# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# for train_index, test_index in split.split(data, data['X3']):
#     X = data.loc[train_index]
#     y = data.loc[test_index]


# X_train, y_train, X_test, y_test = X[], X['Y'], y[y.columns[1:]], y['Y'] 


# model starts

start with all features

In [None]:
# Baseline run on all survey features. X6 is part of the data set (a later
# experiment uses it), so it belongs in the "all features" list too.
feature_list = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']

X_train, X_test, y_train, y_test = data_split(feature_list)

# Keep the cross-validation summary in mean_std; the train/test accuracy
# table is the cell output.
mean_std = kfold()
result_table()

In [None]:
# Display the cross-validation summary (mean and std of accuracy per model) stored in mean_std.
mean_std

X1 and X5

In [None]:
# Reduced feature set: X1 and X5 only.
feature_list = ['X1', 'X5']
X_train, X_test, y_train, y_test = data_split(feature_list)

# Cross-validate first (kept in mean_std), then show the train/test accuracy table.
mean_std = kfold()
result_table()

In [None]:
# Display the cross-validation summary (mean and std of accuracy per model) stored in mean_std.
mean_std

In [None]:
#could go with other variables 
feature_list=['X1','X5','X2']

X_train, X_test, y_train, y_test=data_split(feature_list)

result_table()

In [None]:
# Cross-validation summary for the split currently held in X_train / y_train.
mean_std = kfold()
mean_std

In [None]:
# X1 and X5 plus X3.
feature_list = ['X1', 'X5', 'X3']
X_train, X_test, y_train, y_test = data_split(feature_list)

# Train/test accuracy table for this feature set.
result_table()

In [None]:
# Cross-validation summary for the split currently held in X_train / y_train.
mean_std = kfold()
mean_std

In [None]:
# X1 and X5 plus X4.
feature_list = ['X1', 'X5', 'X4']
X_train, X_test, y_train, y_test = data_split(feature_list)

# Train/test accuracy table for this feature set.
result_table()

In [None]:
# Cross-validation summary for the split currently held in X_train / y_train.
mean_std = kfold()
mean_std

In [None]:
# X1 and X5 plus X6.
feature_list = ['X1', 'X5', 'X6']
X_train, X_test, y_train, y_test = data_split(feature_list)

# Train/test accuracy table for this feature set.
result_table()

In [None]:
# Cross-validation summary for the split currently held in X_train / y_train.
mean_std = kfold()
mean_std

With features 'X1' and 'X5', the ensemble models exceed an accuracy of 0.76.

X1 MUST be the most important feature, as the histograms and the feature importances in the tables above show.

X2 and X4 could be eliminated

In [None]:
# from sklearn.model_selection import KFold
# ntrain = data_train.shape[0]
# ntest = data_test.shape[0]
# SEED = 0,; NFOLDS = 5 
# kf = KFold(ntrain, random_state=SEED)

# class SklearnWrapper(object):
#     def __init__(self, clf, seed=0, params=None):
#         params['random_state'] = seed
#         self.clf = clf(**params)

#     def train(self, x_train, y_train):
#         self.clf.fit(x_train, np.log(y_train))

#     def predict(self, x):
#         return np.exp(self.clf.predict(x))
    

In [None]:
# rf_params = {
#     'n_jobs': -1,
#     'n_estimators': 500,
#      'warm_start': True, 
#      #'max_features': 0.2,
#     'max_depth': 6,
#     'min_samples_leaf': 2,
#     'max_features' : 'sqrt',
#     'verbose': 0
# },
# ada_params = {
#     'n_estimators': 500,
#     'learning_rate' : 0.75
# },
# gb_params = {
#     'n_estimators': 500,
#      #'max_features': 0.2,
#     'max_depth': 5,
#     'min_samples_leaf': 2,
#     'verbose': 0
# },
# svc_params = {
#     'kernel' : 'linear',
#     'C' : 0.025
# }
