# Setup

In [2]:
import collections

import pandas as pd
import numpy as np

import sklearn 

import sklearn.datasets

import decimal

import sklearn.model_selection
import sklearn.dummy
import sklearn.linear_model
import sklearn.ensemble
import sklearn.neural_network

import xgboost as xgb

from secrets import KAGGLE_USER, KAGGLE_PW

# Regression

## Get Data

In [3]:
competition_name = 'allstate-claims-severity'

In [4]:
# %mkdir  -p data/$competition_name
# %cd data/$competition_name
# !kg config -g -u $KAGGLE_USER -p $KAGGLE_PW -c $competition_name
# !kg download
# !unzip '*.zip'
# %cd ../..

## Preprocessing

In [5]:
competition_name = 'allstate-claims-severity'
data = pd.read_csv('data/'+competition_name+'/train.csv')
data = data.sample(1000)

In [6]:
col_id = 'id'
target = 'loss'
features = list(data.columns.values)
features.remove(col_id)
features.remove(target)
features_cat = [cat for cat in features if 'cat' in cat]
features_cont = [cont for cont in features if 'cont' in cont]

In [7]:
df_cat = data[features_cat]
df_cat_dummies = pd.get_dummies(df_cat)
df_cont = data[features_cont]

X = np.hstack([df_cat_dummies, df_cont])
y = data[target]

## Modeling

In [10]:
models = [
    sklearn.dummy.DummyRegressor(),
    sklearn.linear_model.LinearRegression(),
    sklearn.svm.SVR(),
    sklearn.ensemble.RandomForestRegressor(),
    sklearn.ensemble.ExtraTreesRegressor(),
    xgb.XGBRegressor(),
]

In [11]:
scorings = [
    ('neg_mean_squared_error', sklearn.metrics.mean_squared_error),
]
scoring, scorer = scorings[0]

In [12]:
for mdl in models:
    'model', mdl.fit(X, y)
    'train', scorer(y, mdl.predict(X))
    cvss = sklearn.model_selection.cross_val_score(mdl, X, y)
    'cv', ['%.2E' % decimal.Decimal(cvs) for cvs in cvss]

('model', DummyRegressor(constant=None, quantile=None, strategy='mean'))

('train', 8157848.9047869602)

('cv', ['-1.05E-03', '-4.34E-03', '-2.34E-03'])

('model',
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))

('train', 1673549.4681819999)

('cv', ['-1.13E+24', '-9.45E+22', '-1.98E+23'])

('model',
 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
   kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))

('train', 9070613.9290367048)

('cv', ['-1.41E-01', '-1.02E-01', '-1.00E-01'])

('model',
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))

('train', 929854.72843959404)

('cv', ['3.86E-01', '4.56E-01', '3.79E-01'])

('model', ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))

('train', 3.0176446406535717e-25)

('cv', ['-4.84E-02', '3.68E-01', '-9.72E-02'])

('model',
 XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
        learning_rate=0.1, max_delta_step=0, max_depth=3,
        min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
        objective='reg:linear', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=1))

('train', 1498878.881841114)

('cv', ['4.15E-01', '5.49E-01', '4.05E-01'])

# Classification - Binary

## Get Data

In [9]:
competition_name = 'titanic'

In [10]:
# %mkdir  -p data/$competition_name
# %cd data/$competition_name
# !kg config -g -u $KAGGLE_USER -p $KAGGLE_PW -c $competition_name
# !kg download
# !unzip '*.zip'
# %cd ..

## Preprocessing

In [None]:
data = pd.read_csv('data/'+competition_name+'/train.csv')

fillna_value = {
    'Age': data['Age'].mean(),
    'Embarked': 'unknown'
}
data = data.fillna(value=fillna_value)

col_id = 'PassengerId'
target = 'Survived'
features_num = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
features_cat = ['Sex', 'Embarked']

df_cat = data[features_cat]
df_cat_dummies = pd.get_dummies(df_cat)
df_num = data[features_num]

X = np.hstack([df_cat_dummies, df_num])
y = data[target]

## Modeling

In [128]:
models = [
    sklearn.dummy.DummyClassifier(),
    sklearn.linear_model.LogisticRegression(),
    sklearn.svm.SVC(),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.ensemble.ExtraTreesClassifier(),
    xgboost.XGBClassifier(),
]

In [149]:
scorings = [
        ('roc_auc', sklearn.metrics.roc_auc_score),
        ('?', sklearn.metrics.average_precision_score)
    ]    
scoring, scorer = scorings[0]

In [163]:
for mdl in models:
    'model', mdl.fit(X, y)
    'train', scorer(y, mdl.predict(X))
    'cv', sklearn.model_selection.cross_val_score(mdl, X, y, scoring=scoring)

('model',
 DummyClassifier(constant=None, random_state=None, strategy='stratified'))

('train', 0.52281660435241106)

('cv', array([ 0.44643371,  0.46448087,  0.49626114]))

('model',
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))

('train', 0.78439267567826665)

('cv', array([ 0.82966638,  0.85310613,  0.85691688]))

('model', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))

('train', 0.87585082925893964)

('cv', array([ 0.70676349,  0.76677691,  0.82830026]))

('model',
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False))

('train', 0.97370817754769434)

('cv', array([ 0.8173713 ,  0.86477807,  0.84553255]))

('model',
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))

('train', 0.97771067011791779)

('cv', array([ 0.78532739,  0.82976225,  0.80155306]))

('model',
 XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
        max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=100,
        nthread=-1, objective='binary:logistic', seed=0, silent=True,
        subsample=1))

('train', 0.85499952065957241)

('cv', array([ 0.82120602,  0.86988304,  0.87875084]))

In [157]:
for mdl in models:
    mdl.fit(X, y)
    scorer(y, mdl.predict(X))
    sklearn.model_selection.cross_val_score(mdl, X, y, scoring=scoring)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

0.49827437446074208

array([ 0.49575784,  0.49403221,  0.46836353])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.78439267567826665

array([ 0.82966638,  0.85310613,  0.85691688])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

0.87585082925893964

array([ 0.70676349,  0.76677691,  0.82830026])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

0.95635605406960023

array([ 0.80689771,  0.86818138,  0.83721599])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

0.97771067011791779

array([ 0.78427284,  0.82571182,  0.80874317])

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=100,
       nthread=-1, objective='binary:logistic', seed=0, silent=True,
       subsample=1)

0.85499952065957241

array([ 0.82120602,  0.86988304,  0.87875084])

# Classification - Multiple

## Preprocessing

In [90]:
data = sklearn.datasets.load_iris()
X = data.data
y = data.target

## Modeling

In [92]:
models = [
    sklearn.dummy.DummyClassifier(),
    sklearn.linear_model.LogisticRegression(),
    sklearn.svm.SVC(),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.ensemble.ExtraTreesClassifier(),
    xgboost.XGBClassifier(),
]

In [93]:
scorings = [
    'log_loss',
]
scoring = scorings[0]

In [94]:
mdl = models[0]
scoring = scorings[0]

In [96]:
for mdl in models:
    print mdl
    cvss = sklearn.model_selection.cross_val_score(mdl, X, y)
    ['%.2E' % decimal.Decimal(cvs) for cvs in cvss]

DummyClassifier(constant=None, random_state=None, strategy='stratified')


['3.53E-01', '3.33E-01', '3.54E-01']

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


['9.61E-01', '9.22E-01', '9.58E-01']

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


['9.80E-01', '9.61E-01', '9.79E-01']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


['9.80E-01', '9.02E-01', '9.58E-01']

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


['9.80E-01', '8.82E-01', '9.58E-01']

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=100,
       nthread=-1, objective='binary:logistic', seed=0, silent=True,
       subsample=1)


['9.80E-01', '9.41E-01', '9.79E-01']

# References
[Scoring v.s. metrics](http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values)