# Lightricks Challenge - DataHack 2018

In [None]:
import os
import sys
import operator
import numpy as np
import pandas as pd
import scipy
import dateutil
from sklearn.model_selection import train_test_split,KFold,cross_val_score
# from sklearn.cross_validation import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [None]:
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
path = './'
teamname = 'MadeInJerusalem'
out_name = path + teamname + '_submission.csv'

In [None]:
df_usage = pd.read_csv(path + "train_usage_data.csv")

In [None]:
#Looking data format and types
df_usage.info()

The variables in dataframe "df_usage" are:

| Field name   | Description | 
|----------|-------------|
| id | User ID |
| feature_name | Name of feature used |
| usage_duration | Duration in seconds between entering the feature and accepting or canceling it |
| use_date | Date and time when the feature was entered |
| accepted | True if the user accepted the changes made by the feature, False if they did not |

In [None]:
df_usage.head()

In [None]:
df_users = pd.read_csv(path + "train_users_data.csv",parse_dates = [ 'installation_date','subscripiton_date'])

In [None]:
df_users.info()

In [None]:
df_users.head()

The variables in dataframe "df_users" are:

| Field name   | Description | 
|----------|-------------|
| id | User ID |
| installation_date | Date and time when the application was first installed |
| subscripiton_date | Date and time when the user joined as a subscriber (the column name is misspelled in the data files) |
| country | Country where the user is based |
| days_until_churned | Days before the user decided to leave the subscription (churn). NaN if the user has not churned |
| churned | False if the user is still a subscriber and True if the user stopped his subscription. |

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps the cell re-runnable: df_users is rebound here, so a second
# execution would otherwise raise KeyError because the column is already gone.
df_users = df_users.drop(columns='Unnamed: 0', errors='ignore')
df_users.head()

# Data exploration:

In [None]:
# Class balance of the target: how many users fall under each value of `churned`.
# `kind` is passed by keyword because Series.plot() rejects positional arguments
# in current pandas releases.
df_users['churned'].value_counts().plot(kind='bar')

In [None]:
joined_df = df_usage.join(df_users.set_index('id'),on='id')

In [None]:
joined_df.head()

In [None]:
joined_df.usage_duration.to_frame().head()

In [None]:
total_number_of_usage_per_feature = df_usage['feature_name'].value_counts()
total_number_of_usage_per_feature.shape

In [None]:
# The k most frequently used feature names within each class of `churned`.
k = 10
feature_counts_by_class = {
    flag: joined_df.loc[joined_df['churned']==flag, 'feature_name'].value_counts()
    for flag in (True, False)
}
top_k_features_churned = feature_counts_by_class[True].index[:k]
top_k_features_not_churned = feature_counts_by_class[False].index[:k]

In [None]:
top_k_features_not_churned

In [None]:
df_churned = joined_df[(joined_df['churned']==True) & (joined_df['accepted']==True)]
df_churned.head()

In [None]:
# What are the most popular features within each class?
# Only accepted usages are counted; the bars follow the top-k ordering computed earlier.
fig, ax = plt.subplots(2,1, figsize=(17,12))

accepted_usage = joined_df[joined_df['accepted']==True]
df_churned = accepted_usage[accepted_usage['churned']==True]
df_not_churned = accepted_usage[accepted_usage['churned']==False]

p = sns.countplot(
    data=df_churned[df_churned['feature_name'].isin(top_k_features_churned)],
    x='feature_name',
    order=top_k_features_churned,
    ax=ax[0],
)
q = sns.countplot(
    data=df_not_churned[df_not_churned['feature_name'].isin(top_k_features_not_churned)],
    x='feature_name',
    order=top_k_features_not_churned,
    ax=ax[1],
)

ax[0].set_title('Most used features, churned=1')
ax[1].set_title('Most used features, churned=0')

In [None]:
# Per-user, per-feature averages of the numeric columns.
# numeric_only=True is needed because the joined frame also carries string columns
# (e.g. country, initial_device), which GroupBy.mean() refuses to average in pandas >= 2.0.
groupedDf = joined_df.groupby(['id', 'churned', 'feature_name'])
groupedDf.mean(numeric_only=True).head()

In [None]:
joined_df.groupby(['id', 'churned'])[['accepted']].mean().boxplot(by='churned')

# TODO: Filter outliers

In [None]:
#end_of_time_series = np.log(joined_df['usage_duration']).hist()
# Keep only usage events whose duration is below 200 (seconds, per the field table);
# rows with a missing duration fail the comparison and are dropped as well.
# NOTE(review): this rebinds joined_df, so every later cell works on the filtered frame.
joined_df = joined_df[joined_df['usage_duration']<200]

# Creating our input data:

In [None]:
# Target-encoding lookups: mean of `churned` for every value of a categorical column.
# NOTE(review): joined_df holds one row per usage event, so these means are weighted by
# how many events each user logged rather than one vote per user - confirm this is intended.
app_ver_map = joined_df.groupby(['initial_app_version'])['churned'].mean()
ios_ver_map = joined_df.groupby(['initial_ios_version'])['churned'].mean()
device_map = joined_df.groupby(['initial_device'])['churned'].mean()
country_map = joined_df.groupby(['country'])['churned'].mean()
# Fallback used by create_features for categories that are missing from a lookup.
global_mean = joined_df['churned'].mean()


# The test users are read here only so the label encoders below see every category
# that occurs in either data set.
df_users_test_temp = pd.read_csv(path + "test_users_data.csv",parse_dates = [ 'installation_date','subscripiton_date'])

# NOTE(review): create_features references these encoders only in commented-out lines;
# the active code path uses the *_map lookups above.
app_label = preprocessing.LabelEncoder()
app_label.fit(pd.concat([joined_df.initial_app_version, df_users_test_temp.initial_app_version]))

ios_label = preprocessing.LabelEncoder()
ios_label.fit(pd.concat([joined_df.initial_ios_version, df_users_test_temp.initial_ios_version]))

# Fitted once on train + test (a preceding train-only fit was redundant: fit() replaces
# whatever classes the encoder had learned before).
device_label = preprocessing.LabelEncoder()
device_label.fit(pd.concat([joined_df.initial_device, df_users_test_temp.initial_device]))

country_label = preprocessing.LabelEncoder()
country_label.fit(pd.concat([joined_df.country, df_users_test_temp.country]).astype(str))

In [None]:
app_ver = joined_df.groupby(['id'])['initial_app_version'].first().to_frame()
app_ver.head()

In [None]:
app_ver_map['1.0.4']

In [None]:
print(global_mean)
app_ver_map.head()
app_ver = joined_df.groupby(['id'])['initial_app_version'].first().to_frame()
# app_ver = app_ver.apply(lambda x: app_ver_map.initial_app_version[5,1])
app_ver = app_ver.applymap(lambda x: app_ver_map[x] if x in app_ver_map.index else global_mean)
app_ver.head()

In [None]:
# Add time statistics
def create_features(joined_df):
    """Build one row of model features per user id from a joined usage/users frame.

    The result joins, in this order: a per-feature pivot aggregated with ``len``
    (usage counts), per-feature mean usage duration (``*_mean_time``), per-feature
    mean acceptance (``*_mean_acceptance``), four day-count features, the user's
    overall acceptance rate, and churn-rate encodings of country, device,
    iOS version and app version.

    Reads the module-level lookups ``app_ver_map``, ``ios_ver_map``, ``device_map``
    and ``country_map`` plus the fallback value ``global_mean``. Prints the start
    and end of the observed time range and the shape of the result.

    Parameters
    ----------
    joined_df : pandas.DataFrame
        Usage events joined with user attributes. Columns read: id, feature_name,
        usage_duration, accepted, installation_date, subscripiton_date,
        end_use_date, country, initial_device, initial_ios_version,
        initial_app_version.

    Returns
    -------
    pandas.DataFrame
        Indexed by id; missing values are replaced by 0.
    """
    start_of_time_series = joined_df['subscripiton_date'].min()
    print(start_of_time_series)
    end_of_time_series = joined_df['end_use_date'].max()
    print(end_of_time_series)
    series_start = pd.to_datetime(start_of_time_series)
    series_end = pd.to_datetime(end_of_time_series)

    # Latest value of each date column per user.
    date_features = joined_df[['id','installation_date','subscripiton_date','end_use_date']].groupby('id').max()
    last_use = pd.to_datetime(date_features['end_use_date'])
    date_features['days_installed'] = (series_end - date_features['installation_date']).dt.days
    date_features['days_installed_not_subscribed'] = (date_features['subscripiton_date'] - date_features['installation_date']).dt.days
    date_features['days_since_last_use'] = (series_end - last_use).dt.days
    # Days between the later of (installation, start of the series) and the last use.
    date_features['days_used'] = np.minimum((last_use - date_features['installation_date']).dt.days,
                                            (last_use - series_start).dt.days)
    date_features = date_features[['days_installed','days_installed_not_subscribed','days_since_last_use','days_used']]

    users_mean_usage_time = pd.pivot_table(joined_df[['id', 'feature_name', 'usage_duration']], values='usage_duration', index=['id'], columns=['feature_name'], aggfunc='mean', fill_value=0)
    users_mean_usage_time = users_mean_usage_time.add_suffix('_mean_time')

    users_mean_acceptance_rate = pd.pivot_table(joined_df[['id', 'feature_name', 'accepted']], values='accepted', index=['id'], columns=['feature_name'], aggfunc='mean', fill_value=0)
    users_mean_acceptance_rate = users_mean_acceptance_rate.add_suffix('_mean_acceptance')

    users_usage_summaries = pd.pivot_table(joined_df[['id', 'feature_name']], index=['id'], columns=['feature_name'], aggfunc=len, fill_value=0)

    accepted_rate = joined_df.groupby(['id'])['accepted'].mean().to_frame()

    def _churn_rate_encoding(column, rate_by_category):
        # One value per user (the first non-null one), replaced by that category's
        # churn rate; categories absent from the lookup fall back to global_mean.
        first_value = joined_df.groupby(['id'])[column].first()
        encoded = first_value.map(
            lambda x: rate_by_category[x] if x in rate_by_category.index else global_mean
        )
        return encoded.to_frame()

    app_ver = _churn_rate_encoding('initial_app_version', app_ver_map)
    ios_ver = _churn_rate_encoding('initial_ios_version', ios_ver_map)
    device = _churn_rate_encoding('initial_device', device_map)
    country = _churn_rate_encoding('country', country_map)

    total_features = users_usage_summaries
    for part in (users_mean_usage_time, users_mean_acceptance_rate, date_features,
                 accepted_rate, country, device, ios_ver, app_ver):
        total_features = total_features.join(part, how='left')

    # fillna returns a new frame; the result has to be kept, otherwise the NaNs survive.
    total_features = total_features.fillna(0)
    print(total_features.shape)
    return total_features

In [None]:
joined_df.head()

In [None]:
# Training table: engineered features plus the label column.
total_features = create_features(joined_df)
# Per-user mean of `churned` over that user's usage rows, kept as a one-column frame.
churned = joined_df.groupby('id')[['churned']].mean()
total_features = total_features.join(churned, how='left')
print(total_features.shape)
total_features.head()

In [None]:
# number_of_churned = total_features[total_features['churned']==True].shape[0]
# churned_samples = total_features[total_features['churned']==True]
# not_churned_samples = total_features[total_features['churned']==False].sample(n=number_of_churned)
# total_features = pd.concat([churned_samples, not_churned_samples]).sample(frac=1)
#This is how our df looks like:
#total_features.shape

# Optimizing

### Importing main functionalities for setup

In [None]:
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

### Creating scaler

In [None]:
scaler = preprocessing.MinMaxScaler()

## Creating grid search

### parameters grid

In [None]:
# Hyper-parameter grid for the MLP. Every key carries the nested step path
# 'classification' -> 'mlpclassifier' so the grid search can route it to the estimator.
mlp_prefix = 'classification__mlpclassifier__'
mlp_grid = {
    'solver': ['sgd'],
    'max_iter': [1500],
    'alpha': 10.0 ** -np.arange(1, 7),
    'hidden_layer_sizes': [[30, 30, 30, 30]],
    'momentum': [0, 0.3, 0.6, 1],
}
parameters = {mlp_prefix + name: values for name, values in mlp_grid.items()}
parameters

## Creating grid object

### algorithm instance

In [None]:
from sklearn.neural_network import MLPClassifier
algorithm = MLPClassifier()

### creating pipeline

In [None]:
# Inner pipeline: the MinMax scaler followed by the MLP classifier; make_pipeline names
# the steps after their classes, which is where the 'mlpclassifier' grid prefix comes from.
pipedAlgo = make_pipeline(scaler, algorithm)
# NOTE(review): despite its name, this pipeline has no resampling step (SMOTE is imported
# but not used); it only wraps pipedAlgo under the step name 'classification', the outer
# prefix of the parameter-grid keys.
modelWithOverSampling = Pipeline([
        ('classification', pipedAlgo)
    ])

### Grid search instance

In [None]:
gridCV = GridSearchCV(modelWithOverSampling, parameters, n_jobs=-1, cv=5, refit=True, scoring='f1')

In [None]:
# Hold out 30% of the users for validation. A fixed seed makes the split, the
# undersampling below and every score computed from them reproducible across restarts.
SPLIT_SEED = 0
train_part, val_part = train_test_split(total_features, test_size=0.3, random_state=SPLIT_SEED)
X_val = val_part.drop(columns='churned').values
y_val = val_part['churned'].values

# Balance the training classes by undersampling the non-churned users down to the
# number of churned users (capped at the pool size so sample() cannot raise).
churned_samples = train_part[train_part['churned']==True]
not_churned_pool = train_part[train_part['churned']==False]
number_of_churned = churned_samples.shape[0]
not_churned_samples = not_churned_pool.sample(n=min(number_of_churned, len(not_churned_pool)),
                                              random_state=SPLIT_SEED)

# Shuffle so the two classes are interleaved before cross-validation folds are cut.
train_data = pd.concat([churned_samples, not_churned_samples]).sample(frac=1, random_state=SPLIT_SEED)
X_train = train_data.drop(columns='churned').values
y_train = train_data['churned'].values

In [None]:
X_val

### Cross validating on the undersampled dataset

In [None]:
gridCV.fit(X_train, y_train)

In [None]:
print(gridCV.best_params_)
print(gridCV.best_score_)

In [None]:
# Last-column class probability from the grid-search model, cut at 0.45:
# values above the cut become 1, everything else 0.
mlp_probability = gridCV.predict_proba(X_val)[:,-1]
res_val = (mlp_probability > 0.45).astype(mlp_probability.dtype)
print(res_val)

In [None]:
cm = metrics.confusion_matrix(y_val, res_val)
print(cm)
print(classification_report(y_pred=res_val,y_true=y_val))
# print only f1 score for positive
print(np.round(f1_score(y_pred=res_val,y_true=y_val),3))

In [None]:
# Baseline: MinMax-scaled linear SVM with balanced class weights, 5-fold CV on F1.
# Scored on X_train / y_train from the split cell; no `X` / `y` exist in this notebook,
# so referring to them fails with NameError on a fresh kernel.
algo = make_pipeline(preprocessing.MinMaxScaler(), svm.LinearSVC(class_weight='balanced'))
scores = cross_val_score(algo, X_train, y_train, cv=5, scoring='f1')
print(np.round(scores.mean(),3))

In [None]:
from sklearn.ensemble import RandomForestClassifier
algo = make_pipeline(preprocessing.MinMaxScaler(), RandomForestClassifier(max_features=3, n_estimators=1000))
scores = cross_val_score(algo, X_train, y_train, cv=5, scoring='f1')
print(scores.mean())
algo.fit(X_train,y_train)

In [None]:
# Validation report for the model held in `algo`: last-column class probability,
# turned into hard 0/1 labels at forest_cut_off.
forest_cut_off = 0.48
forest_probability = algo.predict_proba(X_val)[:,-1]
res_val = (forest_probability > forest_cut_off).astype(forest_probability.dtype)
print(res_val)
cm = metrics.confusion_matrix(y_val, res_val)
print(cm)
print(classification_report(y_pred=res_val,y_true=y_val))
print(np.round(f1_score(y_pred=res_val,y_true=y_val),3))

In [None]:
print(X_val)

## Result report

In [None]:
# Linear SVM fitted by hand on MinMax-scaled data, for the train/validation reports below.
# It uses its own names (svc_scaler / svc_model) so that `algo` and `scaler` are not
# rebound: the submission cell calls algo.predict_proba on unscaled features, which the
# fitted scaler+forest pipeline supports and a bare LinearSVC does not.
svc_scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = svc_scaler.transform(X_train)
svc_model = svm.LinearSVC(class_weight='balanced')
train = svc_model.fit(X_train_scaled, y_train)
# The validation data is transformed with the scaler fitted on the training data only.
X_val_scaled = svc_scaler.transform(X_val)
res_train = train.predict(X_train_scaled)
res = train.predict(X_val_scaled)
print(res)

In [None]:
#Let's check the confusion matrix:
cm = metrics.confusion_matrix(y_train, res_train)
print(cm)
print(classification_report(y_pred=res_train,y_true=y_train))
# print only f1 score for positive
print(np.round(f1_score(y_pred=res_train,y_true=y_train),3))

In [None]:
#Let's check the confusion matrix:
cm = metrics.confusion_matrix(y_val, res)
print(cm)
print(classification_report(y_pred=res,y_true=y_val))
# print only f1 score for positive
print(np.round(f1_score(y_pred=res,y_true=y_val),3))

In [None]:
# Read the test usage log from the same data directory (`path`) as every other file.
df_usage_test = pd.read_csv(path + "test_usage_data.csv")
df_usage_test.head()

In [None]:
#test your predictor:

#1.Prepare your test-set (in case you created new features/transformed the input data):
# Both test files are read from `path`, like the training files.
df_usage_test = pd.read_csv(path + "test_usage_data.csv")
df_users_test = pd.read_csv(path + "test_users_data.csv",parse_dates = [ 'installation_date','subscripiton_date'])
# errors='ignore' tolerates a file that does not contain the 'Unnamed: 0' column.
df_users_test = df_users_test.drop(columns='Unnamed: 0', errors='ignore')
print(df_usage_test.shape)
print(df_users_test.shape)
# Same join as for the training data: attach the user attributes to every usage row.
joined_df_test = df_usage_test.join(df_users_test.set_index('id'),on='id')
print(joined_df_test.columns)

total_test_features = create_features(joined_df_test)
total_test_features.head()


In [None]:

# Align the test matrix with the training feature columns (same set, same order).
# The per-feature pivot columns depend on which feature names occur in each data set,
# so a feature that is never used in the test data gets a zero column and a feature
# unknown to the training data is dropped; otherwise the model would receive columns
# in a different layout than it was fitted on.
train_feature_columns = total_features.columns.drop('churned')
X_test = total_test_features.reindex(columns=train_feature_columns, fill_value=0).values
print(X_test)

In [None]:
X_test.shape

In [None]:
#submit result:
# Hard 0/1 predictions from `algo` at the forest cut-off, written as strings to the
# submission file with one row per test user id.
test_probability = algo.predict_proba(X_test)[:,-1]
pred = (test_probability > forest_cut_off).astype(test_probability.dtype)
df = pd.DataFrame(pred, index=total_test_features.index.astype(str), columns=['churned'], dtype=str)
df.to_csv(out_name, header=True, quoting=csv.QUOTE_NONNUMERIC)
print(df.shape)
df.head()