In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline



In [2]:
# Load the training data. The 'bag_num' column is dropped rather than one-hot
# encoded, and the row index is renumbered from 0.
df = (
    pd.read_csv('train.csv')
    .drop(['bag_num'], axis=1)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,546,547,548,549,550,551,552,553,554,label
0,4.267247,-0.546598,-2.50608,-2.268732,-1.965279,-2.263128,-2.407531,-2.030531,-2.314407,-1.735125,...,-5.226312,-3.707105,-7.794694,-3.557639,-1.395715,-7.158656,-0.302812,-0.932784,-2.300241,5.0
1,4.116929,-0.441998,-2.329116,-2.275494,-1.949666,-2.379057,-2.416471,-2.013451,-2.400011,-1.750614,...,-5.2442,-3.721543,-7.795036,-3.572315,-1.601865,-7.158656,0.644038,-1.85838,-2.78987,5.0
2,4.135179,-0.524323,-2.139451,-2.268962,-1.933447,-2.42519,-2.410937,-1.990225,-2.449587,-1.742491,...,-5.251723,-3.718466,-7.794977,-3.569483,-1.601865,-3.977031,1.688794,-1.22033,-2.461512,5.0
3,4.128095,-0.705681,-2.324635,-2.270585,-1.965863,-2.454252,-2.412339,-2.029633,-2.479243,-1.742491,...,-5.251014,-3.719177,-7.794987,-3.574765,-1.530896,-6.704138,1.648336,-0.366304,-1.563638,5.0
4,4.09046,-0.446283,-2.175281,-2.275251,-1.960694,-2.453773,-2.415295,-2.023277,-2.482097,-1.749503,...,-5.259906,-3.718699,-7.794988,-3.575702,-1.601865,-6.704138,0.357529,-1.097665,-2.264298,5.0


### Cross-validation:

In [3]:
def run_kfold(clf, df, n_splits=10, random_state=None):
    """Cross-validate ``clf`` on ``df``, printing per-fold and mean confusion matrices.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called on
        it once per fold.
    df : pandas.DataFrame
        Must contain a ``'label'`` column; every other column is used as a
        feature.
    n_splits : int, default 10
        Number of shuffled folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold``; pass an int for repeatable fold
        assignments.

    Returns
    -------
    None
        Results are printed only.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Separate features and target once instead of re-slicing the frame on
    # every fold.
    X = df.drop(['label'], axis=1).values
    y = df['label'].values

    # Fix the class order up front so every fold yields a matrix of the same
    # shape, even when a class is missing from that fold's test split.
    # Without this, accumulating the matrices can fail on a shape mismatch.
    labels = np.unique(y)
    outcomes = np.zeros([len(labels), len(labels)])

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # train classifier on this fold's training rows
        clf.fit(X[train_index], y[train_index])
        predictions = clf.predict(X[test_index])

        c_matrix = confusion_matrix(y[test_index], predictions, labels=labels)
        outcomes = outcomes + c_matrix

        print("Fold:",fold)
        print(c_matrix)

    print('average:')
    # NOTE: this changes NumPy's global print options for the rest of the
    # session (scientific notation stays suppressed after the call).
    np.set_printoptions(suppress=True)
    # Divide by the actual number of folds rather than a hard-coded 10.
    print(outcomes / n_splits)
    

In [5]:
# Cross-validate a single decision tree and print its confusion matrices.
# NOTE(review): despite its name, `rf` holds a DecisionTreeClassifier, while
# the model fitted for the submission later in this notebook is a
# RandomForestClassifier — confirm which model this evaluation should cover.
rf = DecisionTreeClassifier()
run_kfold(rf, df)

Fold: 1
[[96  4  3  0  0  0]
 [ 3 60  1  0  0  0]
 [ 1  3 57  0  0  0]
 [ 0  0  0 75  8  0]
 [ 0  0  0  8 75  0]
 [ 0  0  0  0  0 78]]
Fold: 2
[[71  3  0  0  0  0]
 [ 4 80  1  0  0  0]
 [ 2  4 70  0  0  0]
 [ 0  0  0 67 10  0]
 [ 0  0  0 12 64  0]
 [ 0  0  0  0  0 84]]
Fold: 3
[[80  3  1  0  0  0]
 [ 5 74  5  0  0  0]
 [ 4  3 61  0  0  0]
 [ 0  0  0 73 13  0]
 [ 0  0  0  6 70  0]
 [ 0  0  0  0  0 74]]
Fold: 4
[[77  5  2  0  0  0]
 [ 3 78  2  0  0  0]
 [ 1  1 68  0  0  0]
 [ 0  0  0 67  6  0]
 [ 0  0  0  7 64  0]
 [ 0  0  0  0  0 91]]
Fold: 5
[[75  2  2  0  0  0]
 [ 2 69  6  0  0  0]
 [ 4  3 56  0  0  0]
 [ 0  0  0 60  5  0]
 [ 0  0  0  8 85  0]
 [ 0  0  0  0  0 95]]
Fold: 6
[[81  3  0  0  0  0]
 [ 4 55  5  0  0  0]
 [ 0  2 64  0  0  0]
 [ 0  0  0 74  9  0]
 [ 0  0  0  7 81  0]
 [ 0  0  0  0  0 87]]
Fold: 7
[[68  4  1  0  0  0]
 [ 3 79  3  0  0  0]
 [ 2  8 70  0  0  0]
 [ 0  0  0 75  8  0]
 [ 0  0  0  4 71  0]
 [ 0  0  0  0  0 76]]
Fold: 8
[[98  3  4  0  0  0]
 [ 3 72  1  0  0  0]
 [ 3 

### Predict test set:

In [6]:
# Load the test set from test.csv and preview its first rows.
test_df = pd.read_csv('test.csv')

test_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,545,546,547,548,549,550,551,552,553,554
0,4.190615,-0.334002,-1.566038,-2.24986,-1.895723,-2.351584,-2.394901,-1.962179,-2.36236,-1.703653,...,-4.027569,-5.224459,-3.64185,-7.792128,-3.500826,-0.964631,-6.476879,0.151521,-0.773041,-1.854825
1,4.182251,-0.304212,-2.34652,-2.254565,-1.940968,-2.393738,-2.398867,-2.005524,-2.416319,-1.72216,...,-4.043123,-5.169643,-3.660637,-7.79296,-3.527836,-1.150763,-7.158656,-0.064403,-0.326228,-1.754202
2,4.884058,0.142949,-0.968704,-0.496546,-0.237465,-0.651201,-0.526435,-0.30302,-0.602412,-0.163223,...,-2.912192,-3.656741,-2.338574,-7.266058,-2.240987,0.411616,-5.340585,0.847915,-0.522249,-1.924811
3,3.692136,-0.257338,-2.022309,-2.145843,-1.852379,-2.365756,-2.278687,-1.919724,-2.391045,-1.648959,...,-3.955365,-5.248963,-3.618839,-7.789293,-3.477197,-0.859243,-7.158656,-0.309448,0.429705,-0.818019
4,4.115187,-0.351935,-1.935575,-2.27564,-1.970764,-2.439753,-2.415642,-2.035021,-2.466583,-1.750236,...,-4.114053,-5.236435,-3.726561,-7.795106,-3.582252,-1.601865,-6.931397,1.867965,-2.337221,-2.995538


In [7]:
# Fit the final model on the full training set and predict the test set.
train_features = df.drop(['label'], axis=1)
X_train = train_features.values
y_train = df['label'].values
# Select the test features by name so they line up with the training columns
# regardless of the column order in test.csv (a missing column raises KeyError
# instead of silently shifting features).
X_test = test_df[train_features.columns].values

# Fixed seed so that re-running the notebook yields the same predictions.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

In [8]:
# Build the submission table: 1-based row ids next to the predicted class.
n_rows = np.size(predictions)
idx = np.arange(1, n_rows + 1)
# Stack ids over classes (2 x n_rows) and store both as 32-bit integers.
predicts = np.vstack((idx, predictions)).astype(np.int32)

# Transpose so each row is one (Id, Class) pair.
df_predict = pd.DataFrame(predicts.T, columns=['Id', 'Class'])
df_predict.head()

Unnamed: 0,Id,Class
0,1,4
1,2,6
2,3,1
3,4,6
4,5,5


In [9]:
# Write the Id/Class table to predict.csv, omitting the DataFrame index.
df_predict.to_csv('predict.csv', index=False)