In [1]:
import pandas as pd 
import numpy as np 
import scipy.stats as stats 

In [2]:
df1 = pd.read_csv('train.csv')
print(df1.shape)

(262, 21)


In [3]:
df1['Target'] = df1['class'].replace({'A':0,'B':1,'C':2})

In [172]:
X = pd.get_dummies(df1.drop(columns=['id','father','mother','gender','class','Target',
                                    'SNP_12','SNP_06','SNP_02','SNP_01']))
#df1[['trait','SNP_01','SNP_02','SNP_08','SNP_11','SNP_13','SNP_14','SNP_15']]
Y = df1['Target']


In [100]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute   import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

In [180]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3)

In [174]:
pipe_list = [('impute', SimpleImputer()),
            ('model', RandomForestClassifier())]
pipe_model = Pipeline(pipe_list)
pipe_model

In [181]:
hypeer_parameter = {'model__max_depth':[5], 
                    'model__min_samples_split':[6],
                    'model__min_samples_leaf':[8],
                    'model__class_weight':['balanced'],
                    'model__n_estimators':[100]}

grid_model = GridSearchCV(pipe_model, param_grid=hypeer_parameter, cv=5, 
                         n_jobs = -1 ,scoring='f1')
grid_model.fit(X_train, Y_train)



In [182]:
best_model = grid_model.best_estimator_
best_model

In [183]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred  = best_model.predict(X_test)

In [184]:
print(classification_report(Y_train, Y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       0.95      0.96      0.96        79
           2       0.94      0.92      0.93        52

    accuracy                           0.96       183
   macro avg       0.96      0.96      0.96       183
weighted avg       0.96      0.96      0.96       183



In [185]:
print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.97      0.97      0.97        35
           2       0.96      0.96      0.96        27

    accuracy                           0.97        79
   macro avg       0.98      0.98      0.98        79
weighted avg       0.97      0.97      0.97        79



In [186]:
df_importance = pd.DataFrame()
df_importance['Feature'] = X_train.columns
df_importance['importance'] = best_model['model'].feature_importances_

In [187]:
df_importance.sort_values(by='importance',ascending=False)

Unnamed: 0,Feature,importance
0,trait,0.144882
12,SNP_07_G G,0.103713
10,SNP_07_A A,0.097197
21,SNP_10_G G,0.089171
16,SNP_09_A A,0.07751
4,SNP_04_A A,0.070186
28,SNP_14_A A,0.062518
13,SNP_08_A A,0.041213
27,SNP_13_G G,0.035241
9,SNP_05_C C,0.034074


In [192]:
df2 = pd.read_csv('test.csv')
X1 = pd.get_dummies(df2.drop(columns=['id','father','mother','gender',
                                    'SNP_12','SNP_06','SNP_02','SNP_01']))

In [193]:
X1_test_pred = best_model.predict(X1)

In [194]:
X1_test_pred = pd.DataFrame(X1_test_pred)
X1_test_pred= X1_test_pred.replace({0:'A',1:'B',2:'C'})

X1_test_pred.value_counts()


B    86
A    51
C    38
dtype: int64

In [195]:
X1_test_pred['id'] = df2['id']
X1_test_pred['class'] = X1_test_pred[0]

In [196]:
X1_test_pred = X1_test_pred.drop(columns=[0])
X1_test_pred = X1_test_pred.set_index('id')

In [197]:
X1_test_pred.to_csv('RF_rmSNP.csv',mode='w')