<a href="https://colab.research.google.com/github/turtle98/datamining/blob/main/takehomeexam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
# !pip install IPython
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. 모듈 불러오기

In [42]:
''' 기본 모듈 및 시각화 모듈 '''
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from sklearn.model_selection import StratifiedKFold
import scipy.stats

''' 데이터 전처리 모듈 '''
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

''' Neural Network Classifier 모듈 '''
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

''' 결과 평가용 모듈 '''
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score
''' 기타 optional'''
import warnings, itertools
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [43]:
def seed_everything(seed):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  print('done')


In [44]:
seed_everything(42)

done


### 데이터 불러오기

In [45]:
df1 = pd.read_csv('/content/drive/MyDrive/bada/train_data.csv', delimiter = ',')
df2 = pd.read_csv('/content/drive/MyDrive/bada/test_data.csv', delimiter = ',')
df3 = pd.read_csv('/content/drive/MyDrive/bada/adult.csv', delimiter = ',')
df4 = pd.read_csv('/content/drive/MyDrive/bada/abalone_original.csv', delimiter = ',')

In [46]:
df = pd.concat([df1,df2])

**Classifier 만들기**

In [108]:
def pre_mlp(df):
  #df를 독립변수 X와 종속 변수 Y로 분리시키자
  if df.shape[1] == 9:
      X = df.iloc[:,1:]
      y = df.iloc[:,0]
      train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.5, random_state=42)
      return train_x, test_x, train_y, test_y
  X = df.iloc[:,:-1]
  y = df.iloc[:,-1]
  train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.5, random_state=42)
  return train_x, test_x, train_y, test_y

class mlp_clf:
    def __init__(self, df):
        self.df = df
        #데이터 전처리
        self.train_x, self.test_x, self.train_y, self.test_y = self.getdata()
        self.modelA = self.getmodel_A()
        self.modelB = self.getmodel_B()

    def getdata(self):
        train_x, test_x, train_y, test_y = pre_mlp(self.df)
        train_x = pd.get_dummies(train_x, drop_first = True)
        test_x = pd.get_dummies(test_x, drop_first = True)
        test_x = test_x.reindex(columns = train_x.columns, fill_value=0)
        le = preprocessing.LabelEncoder()
        train_y=le.fit_transform(train_y)
        test_y=le.transform(test_y)
    
        return train_x, test_x, train_y, test_y
    
    def getmodel_A(self):
        scaler = StandardScaler()
        scaler.fit(self.train_x)
        self.train_x = scaler.transform(self.train_x)
        self.test_x = scaler.transform(self.test_x)
        parameter_space = {
        'hidden_layer_sizes': [(5,3), (30,15), (50,25)],
        'momentum': [0.9, 0.99],
        'alpha': [0.01, 0.03],
        'max_iter': [100,300]
        }
        model = MLPClassifier(batch_size=self.train_x.shape[0], solver = 'adam', activation='relu', random_state = 42)
        clf = GridSearchCV(model, parameter_space, n_jobs=-1, cv=2)
        best_model = clf.fit(self.train_x, self.train_y)
        return best_model

    def getmodel_B(self):
        scaler = StandardScaler()
        scaler.fit(self.test_x)
        self.test_x = scaler.transform(self.test_x)
        self.train_x = scaler.transform(self.train_x)
        parameter_space = {
        'hidden_layer_sizes': [(5,3), (30,15), (50,25)],
        'momentum': [0.9, 0.99],
        'alpha': [0.01, 0.03],
        'max_iter': [100,300]
        }
        model = MLPClassifier(batch_size=self.test_x.shape[0], solver = 'adam', activation='relu', random_state = 42)
        clf = GridSearchCV(model, parameter_space, n_jobs=-1, cv=2)
        best_model = clf.fit(self.test_x, self.test_y)
        return best_model

    def bestparams(self):
        return self.modelA.best_params_, self.modelB.best_params_


    def predict(self):
        y_pred_A = self.modelA.predict(self.test_x)
        y_pred_B = self.modelB.predict(self.train_x)
        predictions_A = [round(value) for value in y_pred_A]
        predictions_B = [round(value) for value in y_pred_B]
        confidence = 0.95  
        z_value = scipy.stats.norm.ppf((1 + confidence) / 2.0)
        # 평가하기
        accuracy_A = accuracy_score(self.test_y, predictions_A)
        accuracy_B = accuracy_score(self.train_y, predictions_B)
        ci_length_A = z_value * np.sqrt((accuracy_A* (1 - accuracy_A)) / self.test_y.shape[0])
        ci_lower_A = (1-accuracy_A) - ci_length_A
        ci_upper_A = (1-accuracy_A) + ci_length_A
        ci_length_B = z_value * np.sqrt((accuracy_B* (1 - accuracy_B)) / self.train_y.shape[0])
        ci_lower_B = (1-accuracy_B) - ci_length_B
        ci_upper_B = (1-accuracy_B) + ci_length_B
        #av_err = ((1-accuracy_A) + (1-accuracy_B))/2
        #std_err = z_value * np.sqrt((1-accuracy_A-av_err)^2 + (1-accuracy_B-av_err)^2)
        #ci_lower_av = av_err - std_err
        #ci_upper_av = av_err + std_err
        #print('Accuracy trained on A : %.2f%%'%(accuracy_A*100))
        #print('Accuracy trained on B : %.2f%%'%(accuracy_B*100))
        #print('Average Accuracy : %.2f%%'%((accuracy_B*100 + accuracy_B*100)/2))
        print('error trained on A : %.2f%%'%((1-accuracy_A)*100))
        print('error trained on B : %.2f%%'%((1-accuracy_B)*100))
        print('Average error : %.2f%%'%(((1-accuracy_B)*100 + (1-accuracy_B)*100)/2))
        #print(f'lower & upper confidence interval of Average error : {ci_lower_av, ci_upper_av}')
        print(f'lower & upper confidence interval on A: {ci_lower_A, ci_upper_A}')
        print(f'lower & upper confidence interval on B: {ci_lower_B, ci_upper_B}')



In [109]:
## decision tree 
class DT:
  def __init__(self, df):
        self.df = df
        #데이터 전처리
        self.train_x, self.test_x, self.train_y, self.test_y = self.getdata()
        self.modelA = self.getmodel_A()
        self.modelB = self.getmodel_B()

  def getdata(self):
        train_x, test_x, train_y, test_y = pre_mlp(self.df)
        train_x = pd.get_dummies(train_x, drop_first = True)
        test_x = pd.get_dummies(test_x, drop_first = True)
        test_x = test_x.reindex(columns = train_x.columns, fill_value=0)
        le = preprocessing.LabelEncoder()
        train_y=le.fit_transform(train_y)
        test_y=le.transform(test_y)
    
        return train_x, test_x, train_y, test_y      

  def getmodel_A(self):
        model = DecisionTreeClassifier(random_state = 42)
        best_model = model.fit(self.train_x, self.train_y)
        return best_model
        
  def getmodel_B(self):
        model = DecisionTreeClassifier(random_state = 42)
        best_model = model.fit(self.test_x, self.test_y)
        return best_model
        
  def predict(self):
        y_pred_A = self.modelA.predict(self.test_x)
        y_pred_B = self.modelB.predict(self.train_x)
        predictions_A = [round(value) for value in y_pred_A]
        predictions_B = [round(value) for value in y_pred_B]
        confidence = 0.95  
        z_value = scipy.stats.norm.ppf((1 + confidence) / 2.0)
        # 평가하기
        accuracy_A = accuracy_score(self.test_y, predictions_A)
        accuracy_B = accuracy_score(self.train_y, predictions_B)
        ci_length_A = z_value * np.sqrt((accuracy_A* (1 - accuracy_A)) / self.test_y.shape[0])
        ci_lower_A = (1-accuracy_A) - ci_length_A
        ci_upper_A = (1-accuracy_A) + ci_length_A
        ci_length_B = z_value * np.sqrt((accuracy_B* (1 - accuracy_B)) / self.train_y.shape[0])
        ci_lower_B = (1-accuracy_B) - ci_length_B
        ci_upper_B = (1-accuracy_B) + ci_length_B
        #av_err = ((1-accuracy_A) + (1-accuracy_B))/2
        #std_err = z_value * np.sqrt((1-accuracy_A-av_err)^2 + (1-accuracy_B-av_err)^2)
        #ci_lower_av = av_err - std_err
        #ci_upper_av = av_err + std_err
        #print('Accuracy trained on A : %.2f%%'%(accuracy_A*100))
        #print('Accuracy trained on B : %.2f%%'%(accuracy_B*100))
        #print('Average Accuracy : %.2f%%'%((accuracy_B*100 + accuracy_B*100)/2))
        print('error trained on A : %.2f%%'%((1-accuracy_A)*100))
        print('error trained on B : %.2f%%'%((1-accuracy_B)*100))
        print('Average error : %.2f%%'%(((1-accuracy_B)*100 + (1-accuracy_B)*100)/2))
        #print(f'lower & upper confidence interval of Average error : {ci_lower_av, ci_upper_av}')
        print(f'lower & upper confidence interval on A: {ci_lower_A, ci_upper_A}')
        print(f'lower & upper confidence interval on B: {ci_lower_B, ci_upper_B}')



In [88]:
dt = DT(df)
dt.predict()

error trained on A : 12.69%
error trained on B : 12.71%
Average error : 12.71%
lower & upper confidence interval on A: (0.12257392212706797, 0.13125249563812702)
lower & upper confidence interval on B: (0.12271013429380188, 0.13139294024722875)


In [89]:
dt1 = DT(df3)
dt1.predict()

error trained on A : 18.23%
error trained on B : 18.89%
Average error : 18.89%
lower & upper confidence interval on A: (0.1774597393192417, 0.1871444947416076)
lower & upper confidence interval on B: (0.18394501558155127, 0.19376269499541113)


In [110]:
dt2 = DT(df4)
dt2.predict()

error trained on A : 49.69%
error trained on B : 53.02%
Average error : 53.02%
lower & upper confidence interval on A: (0.47544768904748214, 0.5183292377117328)
lower & upper confidence interval on B: (0.5087651745019826, 0.5515796530842243)


In [70]:
## banking dataset
mlp= mlp_clf(df)
mlp.predict()
mlp.bestparams()

error trained on A : 10.00%
error trained on B : 9.80%
Average error : 9.80%
lower & upper confidence interval on A: (0.09610666003065865, 0.10392872880416387)
lower & upper confidence interval on B: (0.09411159012248577, 0.10186275183725774)


({'alpha': 0.03,
  'hidden_layer_sizes': (30, 15),
  'max_iter': 300,
  'momentum': 0.9},
 {'alpha': 0.01,
  'hidden_layer_sizes': (50, 25),
  'max_iter': 300,
  'momentum': 0.9})

In [79]:
#adult
mlp1 = mlp_clf(df3)


error trained on A : 14.93%
error trained on B : 15.09%
Average error : 15.09%
lower & upper confidence interval on A: (0.14482800057222783, 0.15376747053870127)
lower & upper confidence interval on B: (0.14636491711799832, 0.15534262966550766)


In [81]:
mlp1.predict()
mlp1.bestparams()

error trained on A : 14.93%
error trained on B : 15.09%
Average error : 15.09%
lower & upper confidence interval on A: (0.14482800057222783, 0.15376747053870127)
lower & upper confidence interval on B: (0.14636491711799832, 0.15534262966550766)


({'alpha': 0.01,
  'hidden_layer_sizes': (50, 25),
  'max_iter': 300,
  'momentum': 0.9},
 {'alpha': 0.03,
  'hidden_layer_sizes': (50, 25),
  'max_iter': 300,
  'momentum': 0.9})

In [111]:
mlp2 = mlp_clf(df4)
mlp2.predict()
mlp2.bestparams()

error trained on A : 42.32%
error trained on B : 42.58%
Average error : 42.58%
lower & upper confidence interval on A: (0.4019824379499029, 0.4443555227968658)
lower & upper confidence interval on B: (0.40455764346626233, 0.44697492358354607)


({'alpha': 0.01,
  'hidden_layer_sizes': (50, 25),
  'max_iter': 300,
  'momentum': 0.9},
 {'alpha': 0.03,
  'hidden_layer_sizes': (30, 15),
  'max_iter': 300,
  'momentum': 0.9})

In [97]:
#comparing performance of two algorithmns
#Banking data set
d_1 = (12.69-10)
d_2 = (12.71-9.8)
d_bar = (d_1+d_2)/2
samp_var_d = (d_1-d_bar)**2 +(d_2-d_bar)**2 

#95% confidence for true difference d_t
d_t_upper  = d_bar + 6.314*np.sqrt((samp_var_d)/2)
d_t_lower  = d_bar - 6.314*np.sqrt((samp_var_d)/2)

print(d_t_upper)
print(d_t_lower)

3.494540000000002
2.1054599999999977


In [98]:
#comparing performance of two algorithmns
#Adult data set
d_1 = (18.23-14.93)
d_2 = (18.89-15.09)
d_bar = (d_1+d_2)/2
samp_var_d = (d_1-d_bar)**2 +(d_2-d_bar)**2 

#95% confidence for true difference d_t
d_t_upper  = d_bar + 6.314*np.sqrt((samp_var_d)/2)
d_t_lower  = d_bar - 6.314*np.sqrt((samp_var_d)/2)

print(d_t_upper)
print(d_t_lower)

5.128500000000001
1.9715000000000007


In [112]:
#comparing performance of two algorithmns
#Abalone data set
d_1 = (49.69-42.32)
d_2 = (53.02-42.58)
d_bar = (d_1+d_2)/2
samp_var_d = (d_1-d_bar)**2 +(d_2-d_bar)**2 

#95% confidence for true difference d_t
d_t_upper  = d_bar + 6.314*np.sqrt((samp_var_d)/2)
d_t_lower  = d_bar - 6.314*np.sqrt((samp_var_d)/2)

print(d_t_upper)
print(d_t_lower)

18.596990000000027
-0.7869900000000225


In [None]:
/content/drive/MyDrive/Colab Notebooks/takehomeexam.ipynb