In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_log_error, mean_squared_error, log_loss

from sklearn.model_selection import GridSearchCV

In [55]:
# Load the churn data set from a relative path into a DataFrame.
data = pd.read_csv('data/churn.csv')
# Show the column index to see which fields are available.
data.columns

Index(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge',
       'city', 'last_trip_date', 'phone', 'signup_date', 'surge_pct',
       'trips_in_first_30_days', 'luxury_car_user', 'weekday_pct'],
      dtype='object')

In [56]:
# Preview the first five rows, transposed so each column is displayed as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
avg_dist,6.94,8.06,21.5,9.46,13.77
avg_rating_by_driver,5,5,4,5,5
avg_rating_of_driver,5,5,,,
avg_surge,1,1,1,2.75,1
city,Astapor,Astapor,Winterfell,Winterfell,Winterfell
last_trip_date,2014-05-03,2014-01-26,2014-05-21,2014-01-10,2014-05-13
phone,Android,Android,iPhone,Android,iPhone
signup_date,2014-01-12,2014-01-25,2014-01-02,2014-01-09,2014-01-31
surge_pct,0,0,0,100,0
trips_in_first_30_days,0,2,1,1,0


In [57]:
# Summary statistics of the numeric columns before any cleaning.
data.describe()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,weekday_pct
count,40000.0,39838.0,33472.0,40000.0,40000.0,40000.0,40000.0
mean,5.791302,4.777434,4.601697,1.074956,8.857342,2.2807,60.874382
std,5.708056,0.448088,0.61481,0.222427,20.014008,3.811289,37.089619
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,2.42,4.7,4.3,1.0,0.0,0.0,33.3
50%,3.88,5.0,4.9,1.0,0.0,1.0,66.7
75%,6.93,5.0,5.0,1.05,8.3,3.0,100.0
max,160.96,5.0,5.0,8.0,100.0,125.0,100.0


In [58]:
# Number of missing values in each column.
data.isnull().sum()

avg_dist                     0
avg_rating_by_driver       162
avg_rating_of_driver      6528
avg_surge                    0
city                         0
last_trip_date               0
phone                      319
signup_date                  0
surge_pct                    0
trips_in_first_30_days       0
luxury_car_user              0
weekday_pct                  0
dtype: int64

In [59]:
# Number of distinct (non-missing) values in each column.
data.nunique()

avg_dist                  2764
avg_rating_by_driver        27
avg_rating_of_driver        37
avg_surge                  111
city                         3
last_trip_date             182
phone                        2
signup_date                 31
surge_pct                  357
trips_in_first_30_days      57
luxury_car_user              2
weekday_pct                641
dtype: int64

In [60]:
import math
def isNA(input):
    """Return 1 when `input` is a missing value (per `pd.isnull`), otherwise 0.

    Intended for scalar values, e.g. as the function passed to `Series.apply`.
    """
    return 1 if pd.isnull(input) else 0
# Columns that contain missing values in the raw data.
NA_columns = ['avg_rating_of_driver', 'avg_rating_by_driver', 'phone']

# Record missingness as 0/1 indicator columns while the NaNs are still present,
# so the information is not lost once the gaps are filled in.
# The vectorised isnull() replaces a row-by-row Python apply; the result is the
# same int64 column of 0s and 1s.
for column in NA_columns:
    data['{}_isNA'.format(column)] = data[column].isnull().astype('int64')



In [61]:
# Fill every NA with a sentinel value that does not otherwise occur in that
# feature, so that filled-in entries stay distinguishable from real values.

def fillNA(data, column_name, fill):
    """Replace the missing values of `data[column_name]` with `fill`.

    The column is overwritten on the DataFrame that was passed in; nothing
    is returned.
    """
    filled_column = data[column_name].fillna(fill)
    data[column_name] = filled_column

# (column, sentinel) pairs, applied in the listed order.
for column_name, fill in [('avg_rating_by_driver', 0),
                          ('avg_rating_of_driver', 0),
                          ('phone', 'Unspecified')]:
    fillNA(data, column_name, fill)

In [62]:
# Summary statistics again, now including the filled ratings and the *_isNA flags.
data.describe()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,weekday_pct,avg_rating_of_driver_isNA,avg_rating_by_driver_isNA,phone_isNA
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,5.791302,4.758085,3.8507,1.074956,8.857342,2.2807,60.874382,0.1632,0.00405,0.007975
std,5.708056,0.540402,1.791155,0.222427,20.014008,3.811289,37.089619,0.369553,0.063511,0.088947
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.42,4.7,4.0,1.0,0.0,0.0,33.3,0.0,0.0,0.0
50%,3.88,5.0,4.7,1.0,0.0,1.0,66.7,0.0,0.0,0.0
75%,6.93,5.0,5.0,1.05,8.3,3.0,100.0,0.0,0.0,0.0
max,160.96,5.0,5.0,8.0,100.0,125.0,100.0,1.0,1.0,1.0


In [63]:
def find_unique(data, column):
    """Return a `(column, unique_values)` pair for one column of `data`.

    `unique_values` is whatever `Series.unique()` yields for that column.
    """
    distinct_values = data[column].unique()
    return (column, distinct_values)
    
# Tabulate the distinct values of each categorical column.
categories = ['city', 'phone', 'luxury_car_user']
result = [find_unique(data, column) for column in categories]

pd.DataFrame(result)

Unnamed: 0,0,1
0,city,"[Astapor, Winterfell, King's Landing]"
1,phone,"[Android, iPhone, Unspecified]"
2,luxury_car_user,"[False, True]"


In [64]:
# Turn a categorical column into 0/1 dummy columns. The categories are passed
# in explicitly so that new data coming into the pipeline is encoded with the
# same columns as the training data.
def make_dummies(test_col, train_unique_vals, col_name):
    """
    Encode `test_col` as one 0/1 indicator column per training category.

    Parameters
    ----------
    test_col : pandas.Series
        Column to encode.
    train_unique_vals : iterable
        Category values to create columns for. They may be of any type
        (str, bool, number, NaN); each one is rendered into the column name
        with str.format.
    col_name : str
        Prefix of the generated column names (`<col_name>_<value>`).

    Returns
    -------
    pandas.DataFrame
        One integer column per entry of `train_unique_vals`, sharing the
        index of `test_col`. Values of `test_col` that are not listed in
        `train_unique_vals` get 0 in every column.
    """
    dummies = {}
    for val in train_unique_vals:
        if pd.isnull(val):
            # NaN never compares equal to itself, so match it explicitly.
            matches = test_col.isnull()
        else:
            matches = test_col == val
        # format() rather than `+` so non-string categories do not raise TypeError.
        dummies['{}_{}'.format(col_name, val)] = matches.astype(int)
    return pd.DataFrame(dummies, index = test_col.index)

In [65]:
categories = ['city', 'phone']

# Build every dummy block from the current frame first, then attach them all
# with a single concat instead of growing the frame inside the loop.
dummy_frames = [
    make_dummies(data[category], data[category].unique(), category)
    for category in categories
]
dummy_columns = [name for frame in dummy_frames for name in frame.columns]

# `data` is rebound to the encoded frame below, so on a re-run of this cell it
# already carries the dummy columns. Drop those first so that re-running does
# not append duplicate city_* / phone_* columns.
stale_columns = [name for name in dummy_columns if name in data.columns]
data_transformed = pd.concat(
    [data.drop(stale_columns, axis = 1)] + dummy_frames, axis = 1)

# Later cells use `data` and `data_transformed` interchangeably, so both names
# are kept bound to the same encoded frame.
data = data_transformed

data_transformed.head()


Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,...,weekday_pct,avg_rating_of_driver_isNA,avg_rating_by_driver_isNA,phone_isNA,city_Astapor,city_Winterfell,city_King's Landing,phone_Android,phone_iPhone,phone_Unspecified
0,6.94,5.0,5.0,1.0,Astapor,2014-05-03,Android,2014-01-12,0.0,0,...,100.0,0,0,0,1,0,0,1,0,0
1,8.06,5.0,5.0,1.0,Astapor,2014-01-26,Android,2014-01-25,0.0,2,...,0.0,0,0,0,1,0,0,1,0,0
2,21.5,4.0,0.0,1.0,Winterfell,2014-05-21,iPhone,2014-01-02,0.0,1,...,100.0,1,0,0,0,1,0,0,1,0
3,9.46,5.0,0.0,2.75,Winterfell,2014-01-10,Android,2014-01-09,100.0,1,...,100.0,1,0,0,0,1,0,1,0,0
4,13.77,5.0,0.0,1.0,Winterfell,2014-05-13,iPhone,2014-01-31,0.0,0,...,100.0,1,0,0,0,1,0,0,1,0


In [66]:
# Parse both date columns from strings into datetime values.
for date_column in ['last_trip_date', 'signup_date']:
    data_transformed[date_column] = pd.to_datetime(data[date_column])

# Target label: True when the last trip is strictly after 2014-06-01.
# NOTE(review): `>` leaves out users whose last trip was on 2014-06-01 itself;
# confirm that this boundary matches the intended definition of "active".
cutoff = pd.to_datetime('20140601')
data_transformed['isActive'] = data_transformed['last_trip_date'] > cutoff

data_transformed.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,...,avg_rating_of_driver_isNA,avg_rating_by_driver_isNA,phone_isNA,city_Astapor,city_Winterfell,city_King's Landing,phone_Android,phone_iPhone,phone_Unspecified,isActive
0,6.94,5.0,5.0,1.0,Astapor,2014-05-03,Android,2014-01-12,0.0,0,...,0,0,0,1,0,0,1,0,0,False
1,8.06,5.0,5.0,1.0,Astapor,2014-01-26,Android,2014-01-25,0.0,2,...,0,0,0,1,0,0,1,0,0,False
2,21.5,4.0,0.0,1.0,Winterfell,2014-05-21,iPhone,2014-01-02,0.0,1,...,1,0,0,0,1,0,0,1,0,False
3,9.46,5.0,0.0,2.75,Winterfell,2014-01-10,Android,2014-01-09,100.0,1,...,1,0,0,0,1,0,1,0,0,False
4,13.77,5.0,0.0,1.0,Winterfell,2014-05-13,iPhone,2014-01-31,0.0,0,...,1,0,0,0,1,0,0,1,0,False


In [67]:
# Check the dtypes and non-null counts of every column after the transformations.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 22 columns):
avg_dist                     40000 non-null float64
avg_rating_by_driver         40000 non-null float64
avg_rating_of_driver         40000 non-null float64
avg_surge                    40000 non-null float64
city                         40000 non-null object
last_trip_date               40000 non-null datetime64[ns]
phone                        40000 non-null object
signup_date                  40000 non-null datetime64[ns]
surge_pct                    40000 non-null float64
trips_in_first_30_days       40000 non-null int64
luxury_car_user              40000 non-null bool
weekday_pct                  40000 non-null float64
avg_rating_of_driver_isNA    40000 non-null int64
avg_rating_by_driver_isNA    40000 non-null int64
phone_isNA                   40000 non-null int64
city_Astapor                 40000 non-null int64
city_Winterfell              40000 non-null int64
city_King'

In [68]:
#define a function that can be re-used to test different models
def get_model_error(model, X, y, test_size, random_state = None):
    """
    Fit `model` on a random train split and report its log loss on both splits.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict_proba(X)`.
    X, y : array-like
        Features and target, handed to `train_test_split` unchanged.
    test_size : float or int
        Forwarded to `train_test_split`.
    random_state : int, RandomState instance or None, optional
        Seed for the split. The default (None) draws a different split on
        every call; pass an int to make the reported errors reproducible.

    Returns
    -------
    (train_log_loss, test_log_loss) : tuple of float
    """
    trainX, testX, trainY, testY = train_test_split(
        X, y, test_size = test_size, random_state = random_state)
    model.fit(trainX, trainY)

    # Hand log_loss the labels the model was fitted on, so the score stays
    # defined even when one split happens to contain a single class.
    # Falls back to log_loss' own inference if the model has no `classes_`.
    labels = getattr(model, 'classes_', None)
    pred = model.predict_proba(testX)
    predX = model.predict_proba(trainX)

    return (log_loss(trainY, predX, labels = labels),
            log_loss(testY, pred, labels = labels))

In [69]:
# Preview the transformed frame once more before selecting the model inputs.
data_transformed.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,...,avg_rating_of_driver_isNA,avg_rating_by_driver_isNA,phone_isNA,city_Astapor,city_Winterfell,city_King's Landing,phone_Android,phone_iPhone,phone_Unspecified,isActive
0,6.94,5.0,5.0,1.0,Astapor,2014-05-03,Android,2014-01-12,0.0,0,...,0,0,0,1,0,0,1,0,0,False
1,8.06,5.0,5.0,1.0,Astapor,2014-01-26,Android,2014-01-25,0.0,2,...,0,0,0,1,0,0,1,0,0,False
2,21.5,4.0,0.0,1.0,Winterfell,2014-05-21,iPhone,2014-01-02,0.0,1,...,1,0,0,0,1,0,0,1,0,False
3,9.46,5.0,0.0,2.75,Winterfell,2014-01-10,Android,2014-01-09,100.0,1,...,1,0,0,0,1,0,1,0,0,False
4,13.77,5.0,0.0,1.0,Winterfell,2014-05-13,iPhone,2014-01-31,0.0,0,...,1,0,0,0,1,0,0,1,0,False


In [70]:
# Remove the raw date columns and the categorical columns that have already
# been expanded into dummy columns; what remains is numeric or boolean.
columns_to_drop = ['signup_date', 'last_trip_date', 'city', 'phone']
data2 = data_transformed.drop(columns_to_drop, axis = 1)
data2.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,avg_rating_of_driver_isNA,avg_rating_by_driver_isNA,phone_isNA,city_Astapor,city_Winterfell,city_King's Landing,phone_Android,phone_iPhone,phone_Unspecified,isActive
0,6.94,5.0,5.0,1.0,0.0,0,False,100.0,0,0,0,1,0,0,1,0,0,False
1,8.06,5.0,5.0,1.0,0.0,2,True,0.0,0,0,0,1,0,0,1,0,0,False
2,21.5,4.0,0.0,1.0,0.0,1,True,100.0,1,0,0,0,1,0,0,1,0,False
3,9.46,5.0,0.0,2.75,100.0,1,False,100.0,1,0,0,0,1,0,1,0,0,False
4,13.77,5.0,0.0,1.0,0.0,0,False,100.0,1,0,0,0,1,0,0,1,0,False


In [71]:
# Exclude the target by name rather than by position, so the feature matrix
# cannot silently pick up `isActive` if the column order ever changes.
# Casting to float gives a numeric matrix; `.values` on the mixed
# bool/int/float frame would otherwise produce an object array.
X = data2.drop('isActive', axis = 1).astype(float).values
y = data2['isActive']

In [72]:
# Benchmark: a shallow 100-tree random forest, scored with 10% of the rows held out.
# The displayed pair is (train log loss, test log loss).
model_rf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)
get_model_error(model_rf, X, y, test_size=0.1)

(0.5547867401231119, 0.5548568510113987)