# SPR single model predictions

In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVR

# NOTE(review): %pylab pulls numpy and matplotlib names into the notebook namespace
# (see the "Populating the interactive namespace" message it prints), so bare names
# used elsewhere may originate here rather than from an explicit import.
%pylab inline
# Default figure size for every plot in this notebook: (width, height) = (10, 6).
pylab.rcParams['figure.figsize'] = (10, 6)
# seaborn's current color palette; not referenced by the cells shown here.
color = sns.color_palette()

Populating the interactive namespace from numpy and matplotlib


In [5]:
# Columns of the training frame that are copied, in this order, into the target
# matrix Y (column i of Y corresponds to TARGET_LABELS[i]).
TARGET_LABELS = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [6]:
# Directory prefix prepended to every CSV file name below. It is a relative path,
# so it is resolved against the kernel's working directory.
data_path = "data/"

In [13]:
def _read_month(filename):
    """Load one monthly CSV from `data_path`, parsing both date columns as datetimes."""
    return pd.read_csv(data_path + filename, parse_dates=['fecha_dato', 'fecha_alta'])


# One frame per monthly extract (March and April 2015).
train_month_03 = _read_month("train_2015-3.csv")
train_month_04 = _read_month("train_2015-4.csv")

Define the train and test datasets

In [14]:
# March is the training month, April the test month. These are plain aliases,
# not copies: any in-place change to `train` also changes `train_month_03`.
train, test = train_month_03, train_month_04

Encode categorical columns

In [15]:
# Inspect column dtypes before encoding; the `object` columns are the ones the
# label-encoding cell further down converts to integers.
train.dtypes

Unnamed: 0                        int64
fecha_dato               datetime64[ns]
ncodpers                          int64
ind_empleado                     object
pais_residencia                  object
sexo                             object
age                               int64
fecha_alta               datetime64[ns]
ind_nuevo                         int64
antiguedad                        int64
indrel                          float64
ult_fec_cli_1t                   object
indrel_1mes                     float64
tiprel_1mes                      object
indresi                          object
indext                           object
conyuemp                         object
canal_entrada                    object
indfall                          object
nomprov                          object
ind_actividad_cliente           float64
renta                           float64
segmento                         object
ind_ahor_fin_ult1                 int64
ind_aval_fin_ult1                 int64


In [34]:
# Preview the first rows of the training frame.
# NOTE(review): the saved output already shows integer codes in the categorical
# columns, so this cell was last executed after the label-encoding cell below.
train.head()

Unnamed: 0.1,Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,0,2015-03-28,558670,3,34,2,62,2005-09-22,0,118,...,0,0,0,0,0,1,0,0.0,0.0,0
1,1,2015-03-28,558672,3,34,2,41,2005-09-22,0,118,...,0,0,0,0,0,0,0,0.0,0.0,1
2,2,2015-03-28,558662,3,34,2,45,2005-09-22,0,118,...,0,0,0,0,0,0,0,0.0,0.0,0
3,3,2015-03-28,558659,3,34,2,43,2005-09-22,0,118,...,0,0,0,0,0,0,0,0.0,0.0,0
4,4,2015-03-28,558648,3,34,2,71,2005-09-22,0,118,...,0,0,0,0,0,0,0,0.0,0.0,0


In [18]:
# Replace every string-typed (object) column of `train` with integer codes, in place.
# `train` is the same object as `train_month_03`, so that frame is modified too.
#
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# identical string -> code mapping can be applied to another frame later, e.g.
#     test[c] = label_encoders[c].transform(test[c])
# Without this the mapping learned on `train` is lost as soon as the loop moves on.
#
# NOTE: re-running this cell on an already-encoded frame finds no object columns
# and therefore resets `label_encoders` to an empty dict.
string_data = train.select_dtypes(include=["object"])
label_encoders = {}
for c in string_data.columns:
    le = LabelEncoder()
    train[c] = le.fit_transform(train[c])
    label_encoders[c] = le

In [19]:
# Re-inspect the dtypes after encoding: the columns that were `object` before
# should now be integer-typed.
train.dtypes

Unnamed: 0                        int64
fecha_dato               datetime64[ns]
ncodpers                          int64
ind_empleado                      int64
pais_residencia                   int64
sexo                              int64
age                               int64
fecha_alta               datetime64[ns]
ind_nuevo                         int64
antiguedad                        int64
indrel                          float64
ult_fec_cli_1t                    int64
indrel_1mes                     float64
tiprel_1mes                       int64
indresi                           int64
indext                            int64
conyuemp                          int64
canal_entrada                     int64
indfall                           int64
nomprov                           int64
ind_actividad_cliente           float64
renta                           float64
segmento                          int64
ind_ahor_fin_ult1                 int64
ind_aval_fin_ult1                 int64


In [21]:
# Target matrix: one row per training row, one float column per entry of
# TARGET_LABELS (same order), filled in a single block assignment.
n_rows, n_targets = train.shape[0], len(TARGET_LABELS)
Y = np.zeros((n_rows, n_targets))
Y[:, :] = train[TARGET_LABELS].values

In [40]:
date_cols = ['fecha_dato', 'fecha_alta']

# Feature matrix. Dropped from `train`:
#   * the two datetime columns (only their difference is used, as `duration`),
#   * 'antiguedad',
#   * every column in TARGET_LABELS -- these are the values stored in Y, so leaving
#     them in X hands the model the very label it is asked to predict.
# NOTE(review): the frame also carries 'Unnamed: 0' and 'ncodpers' (see the dtypes
# listing); they look like a row index and a customer id -- confirm whether they
# should really be used as features.
X = train.drop(date_cols + ['antiguedad'] + TARGET_LABELS, axis=1)

# log(renta + 1) where renta is strictly positive; rows whose renta is zero,
# negative or missing get the sentinel -99. Assigning the filled Series directly
# avoids calling fillna(inplace=True) on a column selection.
X['logrenta'] = np.log(X['renta'].where(X['renta'] > 0) + 1).fillna(-99)

# Whole days between the two date columns (fecha_dato - fecha_alta).
X['duration'] = (train['fecha_dato'] - train['fecha_alta']).dt.days

In [42]:
# Standardize the feature columns. From here on X holds the value returned by
# fit_transform rather than the DataFrame built above.
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(X)

In [46]:
# 100-tree random forest trained on all available cores (n_jobs=-1). random_state is
# pinned so that a fresh "Restart & Run All" reproduces the same forest and scores.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=0, random_state=0)

In [47]:
# Fit the forest on the first target only: column 0 of Y, i.e. TARGET_LABELS[0].
X_train = X
y_train = Y[:, 0]
rf.fit(X_train, y_train)

# NOTE: this is the accuracy on the very rows the forest was fit on, so a value
# close to 1.0 is expected and says nothing about performance on unseen data.
score = rf.score(X_train, y_train)
# Parenthesised form prints the same text under Python 2 and is valid Python 3.
print(score)

1.0


In [None]:
# Re-fit the forest, then compare predictions on the training and test features.
# NOTE(review): X_test, y_test and mape_error are not defined in any cell shown
# here; they must exist in the kernel before this cell can run.
rf.fit(X_train, y_train)
score = rf.score(X_train, y_train)
print(score)

res0 = rf.predict(X_train)
res1 = rf.predict(X_test)
print(mape_error(y_train, res0))
print(mape_error(y_test, res1))

# Eyeball the first ten predictions against the first ten true values.
print(res1[:10])
# np.asarray accepts a pandas Series as well as a plain ndarray (y_train above is
# an ndarray slice, which has no `.values` attribute).
print(np.asarray(y_test)[:10])

In [None]:
def run_cv(data, targets, clf, cv=10):
    """Cross-validate `clf` on standardized features and report the fold scores.

    The StandardScaler is chained in front of `clf` in a pipeline, so it is re-fit
    on the training portion of each fold. Scaling the full matrix once up front
    would let statistics of the held-out rows leak into training.

    Parameters
    ----------
    data : feature matrix, passed to cross_val_score as X.
    targets : target values, passed to cross_val_score as y.
    clf : unfitted scikit-learn estimator to evaluate.
    cv : number of folds or a CV splitter, forwarded to cross_val_score
        (default 10).

    Returns
    -------
    The array of per-fold scores. It is also printed, followed by its mean and
    standard deviation on one line.

    NOTE(review): `mape_error` is not defined in the cells shown here; it must be
    available in the notebook namespace when this function is called.
    """
    # Local import so this cell does not depend on editing the import cell.
    from sklearn.pipeline import make_pipeline

    model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(model, data, targets, cv=cv, scoring=make_scorer(mape_error))
    print(scores)
    # "%s %s" prints the same text under Python 2 and Python 3.
    print("%s %s" % (scores.mean(), scores.std()))
    return scores