In [709]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [710]:
class CFG:
    SEED = 42

In [711]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정

In [712]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [713]:
def get_x_y(df):
  if 'class' in df.columns:
    df_x = df.drop(columns=['id','class'])
    df_y = df['class']
    return df_x, df_y
  else :
    df_x = df.drop(columns=['id'])
    return df_x
    
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [714]:

class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)

train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

In [715]:
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

In [716]:
sampler = SMOTE()
sampler.fit(train_x, train_y)
train_x, train_y = sampler.fit_resample(train_x, train_y)

In [717]:
pipe_list = [('impute', SimpleImputer()),
             ('scaler',StandardScaler()),
             ('model', SVC())]
pip_model = Pipeline(pipe_list)

In [718]:
hyper_parameter = {'model__C':[0.001],
                  'model__kernel':['linear'],
                  'model__class_weight':[None, 'balanced']}
grid_model = GridSearchCV(pip_model, 
                         param_grid=hyper_parameter,
                         cv=5, n_jobs = -1, scoring='f1')

grid_model.fit(train_x, train_y)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('impute', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('model', SVC())]),
             n_jobs=-1,
             param_grid={'model__C': [0.001],
                         'model__class_weight': [None, 'balanced'],
                         'model__kernel': ['linear']},
             scoring='f1')

In [719]:
best_model = grid_model.best_estimator_
Y_train_pred = best_model.predict(train_x)
Y_test_pred = best_model.predict(test_x)

In [720]:
submit = pd.read_csv('rmSPN1,2,6,12_c0.001SVM.csv')
Y_test = submit["class"]

In [721]:
Y_test_pred = pd.DataFrame(Y_test_pred)
Y_test_pred = Y_test_pred.replace({0:'A',1:'B',2:'C'})
Y_test_pred.value_counts()

B    87
A    51
C    37
dtype: int64

In [722]:
print(mean_squared_error(train_y,Y_train_pred)) #0

0.04093567251461988


In [723]:
r2_score(train_y, Y_train_pred) #1

0.9385964912280702

In [724]:
print(classification_report(train_y, Y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       114
           1       0.91      0.97      0.94       114
           2       0.97      0.90      0.94       114

    accuracy                           0.96       342
   macro avg       0.96      0.96      0.96       342
weighted avg       0.96      0.96      0.96       342



In [725]:
print(classification_report(Y_test,Y_test_pred))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00        51
           B       0.99      0.99      0.99        87
           C       0.97      0.97      0.97        37

    accuracy                           0.99       175
   macro avg       0.99      0.99      0.99       175
weighted avg       0.99      0.99      0.99       175



In [726]:
preds = grid_model.predict(test_x)
print('Done.')

Done.


In [727]:
submit = pd.read_csv('./sample_submission.csv')
submit['class'] = class_le.inverse_transform(preds)
submit.to_csv('./SMOTE_SVC.csv', index=False)