In [26]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [2]:
data = pd.read_csv('complete.csv', sep=',', header=None)

In [3]:
y = data[1]

In [4]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,36000.0,32.0,276000.0,30,240,3.0,1.0,0.0
1,1,0,38000.0,35.0,631000.0,24,84,2.0,1.0,1.0
2,2,1,141000.0,61.0,645000.0,60,36,3.0,1.0,0.0
3,3,1,102000.0,24.0,442000.0,30,258,2.0,0.0,0.0
4,4,1,80000.0,51.0,182000.0,54,60,4.0,0.0,0.0


In [5]:
data.drop([0, 1], axis=1, inplace=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, y, train_size=.8, random_state=99)

In [7]:
data.shape

(10000, 8)

In [8]:
X_test.shape

(2000, 8)

# Random Forest

In [9]:
rf_model = RandomForestClassifier()

In [10]:
rf_model.get_params().keys()

dict_keys(['oob_score', 'min_samples_leaf', 'bootstrap', 'verbose', 'min_weight_fraction_leaf', 'n_estimators', 'warm_start', 'n_jobs', 'class_weight', 'max_features', 'criterion', 'random_state', 'max_depth', 'max_leaf_nodes', 'min_samples_split'])

In [11]:
from sklearn.grid_search import GridSearchCV

parameters_grid = {
    'n_estimators' : [5, 10, 20, 50, 100, 200, 500],
    'max_features' :['auto', 'sqrt', 'log2'],
    'max_depth' :[None, 2, 5, 10, 50],
    'n_jobs' :[-1, 1]
    
}
    
grid = GridSearchCV(rf_model, parameters_grid, scoring='mean_absolute_error', cv=4)

In [12]:
grid.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 50, 100, 200, 500], 'n_jobs': [-1, 1], 'max_depth': [None, 2, 5, 10, 50], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=0)

In [14]:
grid.best_score_

-0.0015

In [15]:
grid.best_params_

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}

In [16]:
y_rf = grid.best_estimator_.predict(X_test)

In [17]:
from sklearn.metrics import mean_absolute_error

# SGD

In [28]:
classifier = SGDClassifier(random_state = 0)

In [29]:
classifier.get_params().keys()

dict_keys(['power_t', 'alpha', 'penalty', 'fit_intercept', 'eta0', 'warm_start', 'verbose', 'n_iter', 'shuffle', 'n_jobs', 'class_weight', 'epsilon', 'average', 'learning_rate', 'random_state', 'l1_ratio', 'loss'])

In [33]:
parameters_grid = {
    'loss' : ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty' : ['l1', 'l2'],
    'n_iter' : range(5,10),
    'alpha' : np.linspace(0.0001, 0.001, num = 5)
}

In [55]:
parameters_grid = {
    'loss' : ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty' : ['l1', 'l2'],
    'n_iter' :np.linspace(5, 10, num = 6),
    'alpha' : np.linspace(0.0001, 0.001, num = 5)
    
}

In [62]:
grid_cv = GridSearchCV(classifier, parameters_grid, scoring = 'mean_absolute_error', cv = 4)

In [63]:
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001 ,  0.00032,  0.00055,  0.00078,  0.001  ]), 'penalty': ['l1', 'l2'], 'n_iter': array([  5.,   6.,   7.,   8.,   9.,  10.]), 'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss']},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=0)

In [64]:
grid_cv.best_estimator_

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='squared_hinge', n_iter=6.0, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False)

In [65]:
grid_cv.best_score_

-0.039875000000000001

In [66]:
grid_cv.best_params_

{'alpha': 0.0001, 'loss': 'squared_hinge', 'n_iter': 6.0, 'penalty': 'l1'}

# Gradient Boosting Classifier

In [36]:
boost_model = GradientBoostingClassifier()
boost_model.get_params().keys()

['presort',
 'loss',
 'verbose',
 'subsample',
 'max_leaf_nodes',
 'learning_rate',
 'warm_start',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'init',
 'min_weight_fraction_leaf',
 'random_state',
 'max_features',
 'max_depth']

In [37]:
parameters_grid = {
    'n_estimators' : [5, 10, 20, 50, 100, 200, 500],
    'max_depth' : [5, 10, 20]
}
    
grid = GridSearchCV(boost_model, parameters_grid, scoring='mean_absolute_error', cv=4)

In [38]:
grid.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 50, 100, 200, 500], 'max_depth': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=0)

In [39]:
print grid.best_score_
print grid.best_params_

-0.017
{'n_estimators': 50, 'max_depth': 5}


In [43]:
y_boost = grid.best_estimator_.predict(X_test)

In [48]:
mean_absolute_error(y_rf, y_boost)

0.0060000000000000001