## Pandas - Extracting data

In [51]:
import pandas as pd
import numpy as np

# Load the Otto Group training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine until it points at a local copy of train.csv.
df = pd.read_csv('C:/Vindico/Projects/Data/Kaggle/Competition/otto group/train.csv')

## Pandas - Cleaning data

In [52]:
# Encode the nine string labels 'Class_1'..'Class_9' as integer codes 0..8.
target_codes = {'Class_%d' % k: k - 1 for k in range(1, 10)}

# Only encode while the labels are still non-numeric. Mapping already-encoded
# integers would yield NaN and make astype(int) fail, so this guard lets the
# cell be re-run safely.
if not pd.api.types.is_numeric_dtype(df['target']):
    df['target'] = df['target'].map(target_codes).astype(int)

# Move the label to the front and keep every other column in its existing
# order. Selecting by name (rather than rotating the last column forward)
# gives the same layout on the first run and leaves it untouched on a re-run.
cols = ['target'] + [col for col in df.columns if col != 'target']
df = df[cols]

# Plain NumPy view of the cleaned frame: column 0 = target, column 1 = id,
# columns 2.. = features.
train_data = df.values

In [53]:
# Preview the cleaned frame: target should now be the first column and integer-coded.
df.head()

Unnamed: 0,target,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,3,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,4,1,0,0,1,6,1,5,0,...,22,0,1,2,0,0,0,0,0,0
4,0,5,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


## Scikit-learn - Training the model

In [54]:
# Scikit-learn:
# # Model:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# # PCA:
from sklearn.decomposition import PCA, RandomizedPCA

# # Metrics:
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif

In [55]:
# Column 0 is the encoded target and column 1 the row id, so the features
# start at column 2.
y, X = train_data[:, 0], train_data[:, 2:]

In [56]:
# Keep the top 10 percent of features as ranked by the f_classif score.
# NOTE: X is overwritten with the reduced matrix, so re-running this cell on
# its own would select from the already-reduced features.
selector = SelectPercentile(score_func=f_classif, percentile=10)
selector.fit(X, y)
X = selector.transform(X)

In [57]:
# Random forest on the selected features. random_state is fixed so the fitted
# trees, and therefore the set of classes that appear in the predictions, are
# the same on every Restart & Run All.
model = RandomForestClassifier(
    n_estimators=100,
    max_features=0.5,
    max_depth=5,
    random_state=0,
)
model = model.fit(X, y)

## Scikit-learn - Making predictions

In [80]:
# Load the Otto Group test set into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
df_test = pd.read_csv('C:/Vindico/Projects/Data/Kaggle/Competition/otto group/test.csv')

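Similarly, we load the test data and convert it to a NumPy array. No missing values are filled in this notebook; the feature selector and model fitted above are applied to the test features unchanged.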

In [81]:
# Full test matrix as a NumPy array. Column 0 is the row id, which is read
# again when the submission frame is assembled.
test = df_test.values
test

array([[     1,      0,      0, ...,      0,      0,      0],
       [     2,      2,      2, ...,      0,      2,      0],
       [     3,      0,      1, ...,      0,      0,      1],
       ..., 
       [144366,      0,      1, ...,      1,      0,      0],
       [144367,      0,      0, ...,      0,      1,      0],
       [144368,      0,      0, ...,      0,      0,      0]], dtype=int64)

In [82]:
# Drop the leading id column by position and keep only the feature columns.
# .iloc replaces the deprecated .ix indexer, which was removed in pandas 1.0;
# with string column labels, .ix[:, 1:] was already a positional slice.
test_data = df_test.iloc[:, 1:].values
test_data

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 2,  2, 14, ...,  0,  2,  0],
       [ 0,  1, 12, ...,  0,  0,  1],
       ..., 
       [ 0,  1,  0, ...,  1,  0,  0],
       [ 0,  0,  0, ...,  0,  1,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [83]:
# Reduce the test features with the selector fitted on the training data,
# then predict an integer class code for every test row.
test_data = selector.transform(test_data)
output = model.predict(test_data)

In [84]:
# Show the predicted class codes (NumPy abbreviates the repr of long arrays).
output

array([1, 5, 5, ..., 1, 1, 1], dtype=int64)

## Pandas - Preparing for submission

In [109]:
# Pair each test-row id (column 0 of the raw test matrix) with its predicted
# class code, as a two-column integer array and then as a labelled frame.
result = np.column_stack((test[:, 0].astype(int), output.astype(int)))
df_result = pd.DataFrame(result, columns=['id', 'Class'])

df_result.head()

Unnamed: 0,id,Class
0,1,1
1,2,5
2,3,5
3,4,8
4,5,7


In [110]:
# List the distinct predicted class codes. Any code in 0..8 that is absent here
# gets no column from get_dummies further down and has to be added separately.
df_result.Class.unique()

array([1, 5, 8, 7, 4, 3, 2], dtype=int64)

In [111]:
# Translate the integer codes 0..8 back into the labels 'Class_1'..'Class_9'.
code_to_label = dict((code, 'Class_%d' % (code + 1)) for code in range(9))
df_result['Class'] = df_result['Class'].map(code_to_label)
df_result.head()

Unnamed: 0,id,Class
0,1,Class_2
1,2,Class_6
2,3,Class_6
3,4,Class_9
4,5,Class_8


In [112]:
# One-hot encode the predicted label: one indicator column per label that
# actually occurs in the predictions, placed after the id column. The original
# 'Class' column is dropped.
class_indicators = pd.get_dummies(df_result['Class'])
df_result = pd.concat([df_result.drop(['Class'], axis=1), class_indicators], axis=1)
df_result.head()

Unnamed: 0,id,Class_2,Class_3,Class_4,Class_5,Class_6,Class_8,Class_9
0,1,1,0,0,0,0,0,0
1,2,0,0,0,0,1,0,0
2,3,0,0,0,0,1,0,0
3,4,0,0,0,0,0,0,1
4,5,0,0,0,0,0,1,0


In [113]:
# Cast every one-hot class column that is present to int. The columns are
# discovered from the frame instead of being listed by hand, so the cell works
# whichever classes the model happened to predict: no AttributeError for an
# absent class, and no predicted class left uncast.
present_class_cols = [col for col in df_result.columns if str(col).startswith('Class_')]
df_result = df_result.astype(dict.fromkeys(present_class_cols, int))
df_result.dtypes

id         int32
Class_2    int32
Class_3    int32
Class_4    int32
Class_5    int32
Class_6    int32
Class_8    int32
Class_9    int32
dtype: object

In [114]:
# The submission needs id followed by all nine class columns in order.
# Reindexing adds an all-zero column for every class that was never predicted
# and leaves existing columns untouched. Hard-coded insert() calls only work
# when exactly Class_1 and Class_7 are missing, and raise ValueError if either
# already exists.
all_class_cols = ['Class_%d' % k for k in range(1, 10)]
df_result = df_result.reindex(columns=['id'] + all_class_cols, fill_value=0)
df_result.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0,1,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0
2,3,0,0,0,0,0,1,0,0,0
3,4,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,0,0,0,1,0


In [115]:
# Give the Class_1 and Class_7 columns the same int dtype as the other class columns.
df_result = df_result.astype({'Class_1': int, 'Class_7': int})

In [116]:
# Confirm that id and all nine class columns are integer-typed.
df_result.dtypes

id         int32
Class_1    int32
Class_2    int32
Class_3    int32
Class_4    int32
Class_5    int32
Class_6    int32
Class_7    int32
Class_8    int32
Class_9    int32
dtype: object

In [117]:
# Final look at the submission frame before it is written to disk.
df_result.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0,1,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0
2,3,0,0,0,0,0,1,0,0,0
3,4,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,0,0,0,1,0


In [118]:
# Write the submission to the current working directory; index=False keeps the
# pandas row index out of the file.
df_result.to_csv('otto_RandomForest.csv', index=False)