# IndiaHacks Machine Learning Challenge
### Predict the road sign 
https://www.hackerearth.com/problem/machine-learning/predict-the-road-sign-1/

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import GridSearchCV as GD
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import VotingClassifier as VC
from xgboost import XGBClassifier as XGB
from sklearn.metrics import log_loss

In [2]:
# Load the competition data (paths are relative to the notebook's working directory).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Integer codes for the camera that detected the sign (a feature in both files).
camera_codes = {'Front': 0, 'Right': 1, 'Left': 2, 'Rear': 3}
# Integer codes for the target, ordered like the submission columns
# (Front, Left, Rear, Right).
target_codes = {'Front': 0, 'Left': 1, 'Rear': 2, 'Right': 3}

# Series.map + astype yields a real int64 column. DataFrame.replace only gets
# there through implicit object->int downcasting, which pandas deprecates;
# without it the labels would stay object dtype. astype also fails fast here
# if a category outside the mapping (or a missing value) shows up.
train['DetectedCamera'] = train['DetectedCamera'].map(camera_codes).astype('int64')
test['DetectedCamera'] = test['DetectedCamera'].map(camera_codes).astype('int64')

# Shorter name for the target column, then pull out the encoded labels and
# the test ids needed later for the submission file.
train = train.rename(columns={'SignFacing (Target)': 'Target'})
labels = train['Target'].map(target_codes).astype('int64')
test_id = test['Id']

# Keep only feature columns in both frames.
train = train.drop(columns=['Target', 'Id'])
test = test.drop(columns='Id')

In [3]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    train, labels, train_size=0.8, random_state=0
)
# Plain ndarray so the labels can be indexed positionally later on.
y_train = np.array(y_train)

# Min-max scaled copies of the raw features; the scaler is fitted on the
# training split only and then applied to the validation and test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled, test_scaled = (
    scaler.transform(part) for part in (X_train, X_test, test)
)

# Degree-2 interaction-only feature expansion, also fitted on the training split.
poly = PF(interaction_only=True, degree=2).fit(X_train)
X_train_poly, X_test_poly, test_poly = (
    poly.transform(part) for part in (X_train, X_test, test)
)

# Separate scaler for the expanded features (rebinds `scaler`).
scaler = MinMaxScaler().fit(X_train_poly)
X_train_poly_scaled, X_test_poly_scaled, test_poly_scaled = (
    scaler.transform(part) for part in (X_train_poly, X_test_poly, test_poly)
)

In [4]:
# Candidate base models. Every estimator that accepts a seed gets random_state=0.

# Tree ensembles
RF_clf = RF(
    n_estimators=1000,
    max_depth=10,
    oob_score=True,
    random_state=0,
)
ETC_clf = ETC(
    n_estimators=100,
    max_depth=15,
    random_state=0,
)

# Gradient-boosted trees (xgboost and scikit-learn implementations)
XGB_clf = XGB(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
GB_clf = GB(
    learning_rate=0.1,
    n_estimators=100,
    random_state=0,
)

# Linear, neighbour-based and neural models
LR_clf = LR(
    penalty='l2',
    random_state=0,
)
KNN_clf = KNN(n_neighbors=350)
MLP_clf = MLP(
    hidden_layer_sizes=[1000, 500, 100, 10],
    solver='adam',
    random_state=0,
)

In [5]:
# Soft-voting ensemble over six equally weighted base models.
# (An earlier configuration used LR at weight 0.5 in place of KNN.)
members = [
    ('RF', RF_clf),
    ('XGB', XGB_clf),
    ('GB', GB_clf),
    ('MLP', MLP_clf),
    ('ETC', ETC_clf),
    ('KNN', KNN_clf),
]
vc_clf = VC(estimators=members, voting='soft', weights=[1] * len(members))
vc_clf.fit(X_train_poly_scaled, y_train)

# Report 100 minus the log loss on the held-out split.
holdout_proba = vc_clf.predict_proba(X_test_poly_scaled)
print(100 - log_loss(y_test, holdout_proba))

99.8891527039


In [7]:
# In-sample class probabilities from the fitted ensemble.
pred = vc_clf.predict_proba(X_train_poly_scaled)

# Per-row error: 1 minus the probability assigned to the row's recorded label.
# Integer-array indexing picks pred[i, y_train[i]] for every row at once, which
# replaces the Python loop and avoids leaking loop variables into the notebook
# namespace. As before, this relies on the integer label doubling as the
# column index of predict_proba's output.
diff = 1 - pred[np.arange(len(y_train)), y_train]

In [8]:
# Drop training rows where the first ensemble gave the recorded label less than
# 10% probability (diff > 0.9) -- presumably treated as noisy or mislabelled rows.

# `diff` indexes the *unfiltered* training rows. If this cell is re-run after the
# rows were already removed, the stale indices would delete the wrong rows (or
# raise an opaque IndexError), so refuse to continue instead.
if len(diff) != len(y_train):
    raise ValueError(
        "diff has %d entries but y_train has %d rows; recompute diff before "
        "filtering the training set again" % (len(diff), len(y_train))
    )

# Positions of the rows to remove, as a plain list of ints.
to_del = np.flatnonzero(diff > 0.9).tolist()
X_train_poly_scaled = np.delete(X_train_poly_scaled, to_del, axis=0)
y_train = np.delete(y_train, to_del, axis=0)

In [9]:
# Rebuild the six base models from scratch with the same hyper-parameters and
# refit the soft-voting ensemble on the current (filtered) training rows.
RF_clf = RF(
    n_estimators=1000,
    max_depth=10,
    oob_score=True,
    random_state=0,
)
XGB_clf = XGB(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
KNN_clf = KNN(n_neighbors=350)
GB_clf = GB(
    learning_rate=0.1,
    n_estimators=100,
    random_state=0,
)
MLP_clf = MLP(
    hidden_layer_sizes=[1000, 500, 100, 10],
    solver='adam',
    random_state=0,
)
ETC_clf = ETC(
    n_estimators=100,
    max_depth=15,
    random_state=0,
)

# Name -> estimator, in voting order; every member carries weight 1.
base_models = {
    'RF': RF_clf,
    'XGB': XGB_clf,
    'GB': GB_clf,
    'MLP': MLP_clf,
    'ETC': ETC_clf,
    'KNN': KNN_clf,
}
vc_clf = VC(
    estimators=list(base_models.items()),
    voting='soft',
    weights=[1] * len(base_models),
)
vc_clf.fit(X_train_poly_scaled, y_train)

# Same validation metric as before: 100 minus log loss on the held-out split.
refit_proba = vc_clf.predict_proba(X_test_poly_scaled)
print(100 - log_loss(y_test, refit_proba))

99.8889100295


In [10]:
# Predict class probabilities for the unlabelled test rows with the current
# ensemble. Note: this overwrites `pred`, which earlier held the
# training-set probabilities.
pred = vc_clf.predict_proba(test_poly_scaled)

In [11]:
# Assemble the submission: Id first, then one probability column per class, in
# the same order as the integer target encoding (0=Front, 1=Left, 2=Rear, 3=Right).
columns = ['Front', 'Left', 'Rear', 'Right']
sub = pd.DataFrame(pred, columns=columns).assign(Id=test_id)
sub = sub[['Id'] + columns]

# An earlier run was written to "result_2.csv" (noted score 99.90006);
# the noted score for this file is 99.89996.
sub.to_csv("result_3.csv", index=False, float_format='%0.8f')