In [1]:
import pandas as pd

In [2]:
import sklearn
from sklearn import linear_model
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold



In [3]:
# Load the training set into a DataFrame. The path is relative, so train.csv
# must sit in the notebook's working directory.
train = pd.read_csv("train.csv")

In [4]:
# Work on a copy so the raw `train` frame is never modified by later cells.
train_copy = train.copy()

# One-hot encode the label range 'id'..'cat116'. `id` is carried along so the
# encoded columns can be joined back onto the full frame by key.
categorical = train_copy.loc[:, 'id':'cat116']
categorical_dummies = pd.get_dummies(categorical)

# Join the dummy columns onto the full frame, then keep only the float64 and
# uint8 columns.
# NOTE(review): this relies on get_dummies emitting uint8 columns; newer pandas
# releases default to bool, which this dtype filter would drop - confirm the
# pandas version before re-running.
df_train = pd.merge(train_copy, categorical_dummies, on='id')
df_train = df_train.select_dtypes(include=['float64', 'uint8'])

# Parenthesised single-argument print works on both Python 2 and Python 3.
print('size of the df_train: ' + str(df_train.shape))



size of the df_train: (188318, 1154)


In [5]:
# Preview a single row of the encoded frame to check the column layout.
df_train.head(1)

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,...,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y
0,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Move the target to the last column under the name `loss_moved`.
# The guard keeps this cell re-runnable: once `loss` has been moved, running
# the cell again would otherwise raise KeyError on the missing column.
if 'loss' in df_train.columns:
    # pop() removes `loss` and returns it; assigning it under a new name
    # appends it as the final column.
    df_train['loss_moved'] = df_train.pop('loss')
df_train.head(1)

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,...,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y,loss_moved
0,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2213.18


In [7]:
# Train on a small slice so the experiments below run quickly.
# .loc slicing is label-based and includes the end label.
df_experiment = df_train.loc[:2000]

# Predictors are the label range cont1..cat116_Y; the target is loss_moved.
experiment_features = df_experiment.loc[:, 'cont1':'cat116_Y']
experiment_target = df_experiment['loss_moved']

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    experiment_features, experiment_target, test_size=0.3, random_state=42)

# Fit Bayesian ridge, evaluated by mean absolute error (sklearn.metrics)


# Baseline model: Bayesian ridge regression with default hyper-parameters.
clf = linear_model.BayesianRidge()
clf.fit(train_x, train_y)

# Score with mean absolute error on both splits (lower is better); comparing
# the two numbers shows how far the fit generalises beyond the training rows.
prediction = clf.predict(test_x)
accuracy_test = mean_absolute_error(test_y, prediction)
accuracy_train = mean_absolute_error(train_y, clf.predict(train_x))

# Parenthesised single-argument prints work on both Python 2 and Python 3.
print("Bayesian Ridge method :")
print("Mean absolute errors in test: %.6f" % accuracy_test)
print("Mean absolute errors in train: %.6f" % accuracy_train)

# Try with random forest


# Forest sizes to try: 10, 20, ..., 90 trees.
estimators = range(10, 100, 10)

# Tune the number of trees for the random forest regressor.
for i in estimators:
    # A fixed seed makes the sweep repeatable, so differences between forest
    # sizes are not just run-to-run noise.
    clf = RandomForestRegressor(n_estimators=i, random_state=42)
    clf = clf.fit(train_x, train_y)

    # Mean absolute error on the held-out and the training rows.
    prediction = clf.predict(test_x)
    predict_y_train = clf.predict(train_x)
    accuracy_test = mean_absolute_error(test_y, prediction)
    accuracy_train = mean_absolute_error(train_y, predict_y_train)

    # Single-argument prints produce the same text on Python 2 and Python 3.
    print("n_estimators: %d" % i)
    print("Mean absolute errors in test: %.6f" % accuracy_test)
    print("Mean absolute errors in train: %.6f" % accuracy_train)

# Extra tree regressor

from sklearn.ensemble import ExtraTreesRegressor

# Repeat the size sweep with extremely randomised trees, reusing the
# `estimators` range defined earlier in the notebook.
for i in estimators:
    # Fixed seed so the reported errors are repeatable between runs.
    clf = ExtraTreesRegressor(n_estimators=i, random_state=42)
    clf = clf.fit(train_x, train_y)

    # Only the held-out mean absolute error is reported for this model.
    prediction = clf.predict(test_x)
    accuracy = mean_absolute_error(test_y, prediction)

    # Single-argument prints produce the same text on Python 2 and Python 3.
    print("n_estimators: %d" % i)
    print("Mean absolute errors: %.6f" % accuracy)


# let's try to optimize random forest features


In [8]:
# Feature engineering.

# 1./ remove massive categorical data
# NOTE: despite the name, .loc[:2000] selects index labels 0..2000 inclusive,
# so `train_3000` holds 2001 rows rather than 3000.
train_3000 = train_copy.loc[:2000]

# One-hot encode the slice in a single call.
df_features = pd.get_dummies(train_3000)

# Re-append the target as the final column under the name `loss_moved`.
df_features['loss_moved'] = df_features.pop('loss')
df_features.shape

(2001, 754)

In [9]:
# Build a fresh train/test split from the re-encoded frame.
# NOTE(review): the predictor slice ends at 'cat114_U', so any column placed
# after that label is left out of the model inputs - confirm this is intended.
split_features = df_features.loc[:, 'cont1':'cat114_U']
split_target = df_features['loss_moved']

# Same 70/30 split and seed as the earlier experiment.
train_x, test_x, train_y, test_y = train_test_split(
    split_features, split_target, test_size=0.3, random_state=42)

1. Testing with k fold cross validation

In [None]:
# Earlier hand-picked settings, kept as notebook globals. The grid search
# below does not read them - presumably they record the outcome of the manual
# sweep; TODO confirm they are still needed.
estimators = 60 #already a best feature
features= "auto" #max features
criteria = "mae"

# Search space for the random forest regressor.
params ={
    'n_estimators': range(50,90,5),
    'criterion' : ('mse','mae'),
    'max_features' : ('auto','sqrt','log2')
}

RFR = RandomForestRegressor()

# GridSearchCV comes from sklearn.model_selection, the module this notebook
# already uses for train_test_split; sklearn.grid_search is the deprecated
# location of the same class.
from sklearn.model_selection import GridSearchCV

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the MAE reported below - consider
# scoring='neg_mean_absolute_error' if MAE is the selection target.
clf = GridSearchCV(RFR, params)
clf = clf.fit(train_x,train_y)

# Mean absolute error of the refitted best estimator on both splits.
prediction = clf.predict(test_x)
predict_y_train = clf.predict(train_x)
accuracy_test = mean_absolute_error(test_y,prediction)
accuracy_train = mean_absolute_error(train_y, predict_y_train)

# Report the parameters the search actually selected, not the hard-coded
# guess above. Single-argument prints work on both Python 2 and Python 3.
print("Parameters: ")
for param_name in sorted(clf.best_params_):
    print("%s: %s" % (param_name, clf.best_params_[param_name]))

print("")
print("Mean absolute errors in test: %.6f" % accuracy_test)
print("Mean absolute errors in train: %.6f" % accuracy_train)
print("")



In [None]:
#performing grid search for the parameters


# Apply linear ridge model


# Bayesian ridge regression with default hyper-parameters on the current split.
clf = linear_model.BayesianRidge()
clf.fit(train_x,train_y)

# Mean absolute error on the held-out and the training rows.
prediction = clf.predict(test_x)
predict_y_train = clf.predict(train_x)
accuracy_test = mean_absolute_error(test_y,prediction)
accuracy_train = mean_absolute_error(train_y, predict_y_train)

# Single-argument prints produce the same text on Python 2 and Python 3.
print("Parameters: ")
print("")
print("Mean absolute errors in test: %.6f" % accuracy_test)
print("Mean absolute errors in train: %.6f" % accuracy_train)
print("")

# test with svm regression model


from sklearn import svm

# Support vector regression with default hyper-parameters on the current split.
clf = svm.SVR()
clf.fit(train_x,train_y)

# Mean absolute error on the held-out and the training rows.
prediction = clf.predict(test_x)
predict_y_train = clf.predict(train_x)
accuracy_test = mean_absolute_error(test_y,prediction)
accuracy_train = mean_absolute_error(train_y, predict_y_train)

# Single-argument prints produce the same text on Python 2 and Python 3.
print("Parameters: ")
print("")
print("Mean absolute errors in test: %.6f" % accuracy_test)
print("Mean absolute errors in train: %.6f" % accuracy_train)
print("")

# Gradient boosting regressor

from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 stages and a 0.05 learning rate. The seed is
# fixed so the reported errors are repeatable between runs.
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                random_state=42)
clf = clf.fit(train_x,train_y)

# Mean absolute error on the held-out and the training rows.
prediction = clf.predict(test_x)
# `real` is not read anywhere in the visible cells; it is kept in case a
# later cell uses it.
real = np.array(test_y)
predict_y_train = clf.predict(train_x)
accuracy_test = mean_absolute_error(test_y,prediction)
accuracy_train = mean_absolute_error(train_y, predict_y_train)

# Single-argument prints produce the same text on Python 2 and Python 3.
print("Parameters: ")
print("")
print("Mean absolute errors in test: %.6f" % accuracy_test)
print("Mean absolute errors in train: %.6f" % accuracy_train)
print("")