# Gradient Boosting model results

### Data Preprocessing

In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, learning_curve, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, fbeta_score

In [20]:
# Read the labelled training file and the competition file, dropping the
# "ID" column from each frame right after loading.
data = pd.read_csv("orange_small_churn_train_data.csv").drop("ID", axis=1)
data_test = pd.read_csv("orange_small_churn_test_data.csv").drop("ID", axis=1)
# Single-argument print: produces the same text under Python 2 and Python 3.
print("%s %s" % (data.shape, data_test.shape))

(40000, 231) (10000, 230)


In [21]:
# Split the column names by position: the first 190 columns are treated as
# numeric features, the following 40 (positions 190-229) as categorical ones.
# Any column after position 229 belongs to neither list.
all_columns = data.columns
num_features = all_columns[:190].tolist()
cat_features = all_columns[190:230].tolist()

Delete features with more than 70% missing values.

In [5]:
# Collect, in column order, every column whose number of non-missing values is
# below the fraction `c` of the rows (c = 0.3, i.e. more than 70% missing).
# Columns with no non-missing value at all have a count of 0 and are therefore
# selected by the same comparison, so no exception handling is needed.
c = 0.3
min_filled = c * len(data)            # threshold follows the actual row count
filled_counts = data.notnull().sum()  # non-null count per column
empty_features = list(filled_counts.index[filled_counts < min_filled])
print("number of empty features is %s" % len(empty_features))

number of empty features is 156


In [22]:
# Remove the sparse columns listed in `empty_features` from the two
# feature-name lists and from both frames.
sparse = set(empty_features)
# Slice assignment keeps the same list objects while filtering their contents.
num_features[:] = [feat for feat in num_features if feat not in sparse]
cat_features[:] = [feat for feat in cat_features if feat not in sparse]
# One drop call per frame instead of one call per column.
data.drop(empty_features, axis=1, inplace=True)
data_test.drop(empty_features, axis=1, inplace=True)

In [23]:
# Shapes of both frames after the sparse columns were removed.
print("%s %s" % (data.shape, data_test.shape))

(40000, 75) (10000, 74)


Numeric features: replace missing values with (max value + 1).

In [24]:
# Fill missing numeric values with (column maximum + 1). The maximum is taken
# from the training frame and the same fill value is applied to both frames.
# The maxima are computed over the numeric columns only and looked up by
# column name, so the result does not depend on where those columns sit in
# the frame.
maxs = data[num_features].max(axis=0)
fill_values = (maxs + 1.).to_dict()   # {column name: fill value}
data.fillna(fill_values, inplace=True)
data_test.fillna(fill_values, inplace=True)

Categorical features: replace each value with its frequency (number of occurrences) in the combined train and test data.

In [31]:
# Frequency-encode the categorical columns: every value is replaced by the
# number of times it occurs in that column across the training and competition
# frames stacked together. Missing values are not counted and stay NaN.
n_train_rows = len(data)
# ignore_index gives the stacked frame a unique 0..N-1 index, so the split
# below is purely positional and does not rely on index labels.
data_cat_all = pd.concat([data[cat_features], data_test[cat_features]],
                         ignore_index=True)

for feat in cat_features:
    encoded = data_cat_all[feat].map(data_cat_all[feat].value_counts())
    data_cat_all[feat] = encoded
    # .values skips index alignment: rows are written back strictly by position.
    data[feat] = encoded.iloc[:n_train_rows].values
    data_test[feat] = encoded.iloc[n_train_rows:].values

Categorical features: replace missing values with zeros.

In [32]:
# Replace every value that is still missing in either frame with 0.
for frame in (data, data_test):
    frame.fillna(0., inplace=True)

In [33]:
# Sanity check after preprocessing: both shapes (one per line), then a preview
# of the first training rows as the cell output.
print("%s\n%s" % (data.shape, data_test.shape))
data.head()

(40000, 75)
(10000, 74)


Unnamed: 0,Var6,Var7,Var13,Var21,Var22,Var24,Var25,Var28,Var35,Var38,...,Var220,Var221,Var222,Var223,Var225,Var226,Var227,Var228,Var229,labels
0,3052.0,36.0,97365.0,480.0,600.0,20.0,480.0,200.0,0.0,82752.0,...,3,1662,3,36608.0,0.0,2614,2342,1477,0.0,-1
1,1813.0,7.0,636.0,212.0,265.0,2.0,128.0,166.56,0.0,2706120.0,...,1,37009,1,36608.0,11072.0,2108,35156,4354,9804.0,-1
2,1953.0,7.0,448.0,176.0,220.0,0.0,72.0,311.76,0.0,4698780.0,...,4441,6199,4441,36608.0,0.0,8031,6153,2672,9804.0,-1
3,1533.0,7.0,4.0,332.0,415.0,0.0,144.0,220.08,5.0,864384.0,...,34,37009,34,36608.0,0.0,2108,35156,32703,0.0,1
4,686.0,7.0,0.0,160.0,200.0,2.0,48.0,278.0,0.0,4364880.0,...,2,37009,2,36608.0,0.0,4176,35156,32703,0.0,-1


## Model fitting

### Model k-fold validation 

In [34]:
# The first n_train rows (by position) are used for fitting and
# cross-validation; the rows from position n_train onward are left out of
# data_train.
n_train = 36000
data_train = data.iloc[:n_train]

In [37]:
# Notes kept from earlier runs (presumably ROC AUC values; "39000" is
# presumably a training-set size - unverified):
### 0.7380537
#0.745862803746 39000
verbose = False  # set to True to print the metrics of every fold

# A list, so that several candidate models can be evaluated by the same loop.
classifiers = [GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                                          random_state=123)]

X = data_train.drop(labels='labels', axis=1)
y = data_train['labels']

# Seed the shuffle so that the folds, and therefore the reported averages,
# are the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

for cls in classifiers:
    m1 = []  # ROC AUC per fold
    m2 = []  # precision per fold
    m3 = []  # recall per fold
    m4 = []  # F-beta (beta = 2) per fold

    for n, (train_index, test_index) in enumerate(skf.split(X, y)):
        # split() yields positional indices, hence .iloc for both X and y.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        cls.fit(X_train, y_train)
        y_pred = cls.predict(X_test)
        # Column 1 of predict_proba is the probability of the second class in
        # cls.classes_.
        y_pred_proba = cls.predict_proba(X_test)[:, 1]
        m1.append(roc_auc_score(y_test, y_pred_proba))
        m2.append(precision_score(y_test, y_pred))
        m3.append(recall_score(y_test, y_pred))
        # beta is passed by keyword: newer scikit-learn releases reject it
        # as a positional argument.
        m4.append(fbeta_score(y_test, y_pred, beta=2.0))
        if verbose:
            print("Fold %s : %s %s %s %s" % (n + 1, m1[n], m2[n], m3[n], m4[n]))

    print("Total Results: %s" % cls.__class__.__name__)
    for label, scores in [("   roc_auc_score:", m1),
                          ("   precision:    ", m2),
                          ("   recall:       ", m3),
                          ("   f-measure:    ", m4)]:
        print("%s %s" % (label, np.mean(scores)))
    print("============================================")

Total Results: GradientBoostingClassifier
   roc_auc_score: 0.743719292307
   precision:     0.565217864491
   recall:        0.0250840961448
   f-measure:     0.0310032309825


### Model fitting and submission

Model fitting on the first 36000 rows and validation on the hold-out set (the remaining 10% of the labelled samples):

In [38]:
# Notes kept from earlier runs (presumably hold-out ROC AUC values - unverified):
#0.743109072161 - original
#0.744072003283 - mins
#0.747640089559 - gs params 400
cls = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                                 random_state=123)

# Fit on the first n_train rows only.
X = data_train.drop(labels='labels', axis=1)
y = data_train['labels']

cls.fit(X, y)

# Evaluate on the rows from position n_train onward, which were not used for
# fitting. X is rebound to the hold-out features from here on.
holdout = data.iloc[n_train:]
X = holdout.drop(labels='labels', axis=1)
y_true = holdout['labels']
y_pred = cls.predict(X)
# Column 1 of predict_proba is the probability of the second class in
# cls.classes_.
y_pred_proba = cls.predict_proba(X)[:, 1]

print("Total Results: %s" % cls.__class__.__name__)
# beta is passed by keyword: newer scikit-learn releases reject it as a
# positional argument.
for label, value in [("   roc_auc_score:", roc_auc_score(y_true, y_pred_proba)),
                     ("   precision:    ", precision_score(y_true, y_pred)),
                     ("   recall:       ", recall_score(y_true, y_pred)),
                     ("   f-measure:    ", fbeta_score(y_true, y_pred, beta=2.0))]:
    print("%s %s" % (label, value))
print("============================================")

Total Results: GradientBoostingClassifier
   roc_auc_score: 0.739776211765
   precision:     0.461538461538
   recall:        0.022641509434
   f-measure:     0.0279589934762


Model prediction on competition set and submitting: 

In [40]:
# Score the competition frame with the classifier currently bound to `cls`:
# class-1 column of the probabilities for the submission, plus hard labels.
X = data_test
y_pred_test_proba = cls.predict_proba(X)[:, 1]
y_pred_test = cls.predict(X)

In [41]:
# Write the submission file: one row per prediction, with a running row number
# as ID and the predicted probability as result.
# NOTE(review): the IDs are regenerated as 0..n-1 rather than taken from the
# input file's "ID" column - confirm that this matches the expected IDs.
print(y_pred_test_proba.shape)
sub = pd.DataFrame({'ID': np.arange(len(y_pred_test_proba)),  # sized from the predictions
                    'result': y_pred_test_proba},
                   columns=['ID', 'result'])                  # explicit column order
sub.to_csv('resultnew.csv', index=False)

(10000,)
