# Lightricks Challenge - DataHack 2018

In [None]:
import os
import sys
import operator
import numpy as np
import pandas as pd
import scipy
import dateutil
from sklearn.model_selection import train_test_split,KFold
# from sklearn.cross_validation import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import csv

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to the plots drawn below.
sns.set()

In [None]:
# Directory holding the input CSVs; the submission file is written next to them.
path = './'
# Team identifier used as the prefix of the submission file name.
teamname = 'team_foo'
# Full path of the submission file: <path><teamname>_submission.csv
out_name = '{}{}_submission.csv'.format(path, teamname)

In [None]:
# Load the per-event feature usage log for the training users.
usage_csv = path + "train_usage_data.csv"
df_usage = pd.read_csv(usage_csv)

In [None]:
# Inspect the usage table: column names, dtypes and non-null counts.
df_usage.info()

The variables in dataframe "df_usage" are:

| Field name   | Description | 
|----------|-------------|
| id | User ID |
| feature_name | Name of feature used |
| usage_duration | Duration in seconds between entering the feature and accepting or canceling it |
| use_date | Date and time when the feature was entered |
| accepted | True if the user accepted the changes made by the feature, False if the user did not accept them |

In [None]:
# Preview the first rows of the usage table.
df_usage.head()

In [None]:
# Load the per-user table, parsing the two date columns as datetimes.
# NOTE(review): 'subscripiton_date' is spelled differently from the
# 'subscription_date' field described in this notebook - confirm which
# spelling the CSV header actually uses.
df_users = pd.read_csv(path + "train_users_data.csv",parse_dates = [ 'installation_date','subscripiton_date'])

In [None]:
# Inspect the users table: column names, dtypes and non-null counts.
df_users.info()

In [None]:
# Preview the first rows of the users table.
df_users.head()

The variables in dataframe "df_users" are:

| Field name   | Description | 
|----------|-------------|
| id | User ID |
| installation_date | Date and time when the application was first installed |
| subscription_date | Date and time when the user joined as a subscriber |
| country | Country where the user is based |
| days_until_churned | Days before the user decided to leave the subscription (churn). NaN if the user has not churned |
| churned | False if the user is still a subscriber and True if the user stopped his subscription. |

In [None]:
# Remove the 'Unnamed: 0' column from the users table, then preview the result.
df_users = df_users.drop('Unnamed: 0', axis=1)
df_users.head()

# Data exploration:

In [None]:
# Class balance: number of churned vs. non-churned users.
# `kind` must be passed by keyword; current pandas raises a TypeError when
# Series.plot() receives positional arguments.
df_users['churned'].value_counts().plot(kind='bar')

In [None]:
# Attach each user's attributes to every usage row, matching on the user id.
users_by_id = df_users.set_index('id')
joined_df = df_usage.join(users_by_id, on='id')

In [None]:
# Preview the joined usage + user table.
joined_df.head()

In [None]:
# Preview the usage_duration column as a one-column DataFrame.
joined_df[['usage_duration']].head()

In [None]:
# Number of usage rows per feature; the shape gives the count of distinct features.
feature_names = df_usage['feature_name']
total_number_of_usage_per_feature = feature_names.value_counts()
total_number_of_usage_per_feature.shape

In [None]:
# The k most frequently used features, computed separately for usage rows of
# churned and of non-churned users (most used first).
k = 10
is_churned_row = joined_df['churned'] == True
is_retained_row = joined_df['churned'] == False
top_k_features_churned = joined_df.loc[is_churned_row, 'feature_name'].value_counts().index[:k]
top_k_features_not_churned = joined_df.loc[is_retained_row, 'feature_name'].value_counts().index[:k]

In [None]:
# Display the most used features among non-churned users.
top_k_features_not_churned

In [None]:
# What are the most popular features within each class?
# Two stacked count plots: the top-k features of churned users on the upper
# axis and the top-k features of non-churned users on the lower axis.
fig, ax = plt.subplots(2, 1, figsize=(17, 12))

# Usage rows of churned users, restricted to their top-k features.
df_churned = joined_df[joined_df['churned'] == True]
churned_top_rows = df_churned[df_churned['feature_name'].isin(top_k_features_churned)]
p = sns.countplot(
    data=churned_top_rows,
    x='feature_name',
    order=top_k_features_churned,
    ax=ax[0],
)

# Usage rows of non-churned users, restricted to their top-k features.
df_not_churned = joined_df[joined_df['churned'] == False]
not_churned_top_rows = df_not_churned[df_not_churned['feature_name'].isin(top_k_features_not_churned)]
q = sns.countplot(
    data=not_churned_top_rows,
    x='feature_name',
    order=top_k_features_not_churned,
    ax=ax[1],
)

ax[0].set_title('Most used features, churned=1')
ax[1].set_title('Most used features, churned=0')

In [None]:
# Mean usage duration per (user id, churned) pair; preview the first rows.
groupedDf = joined_df.groupby(['id', 'churned'])
mean_duration = groupedDf[['usage_duration']].mean()
mean_duration.head()

In [None]:
# Per-user mean of 'accepted', drawn as one box per churned value.
accepted_by_user = joined_df.groupby(['id', 'churned'])[['accepted']].mean()
accepted_by_user.boxplot(by='churned')

# Creating our input data:

In [None]:
# Build the per-user feature table: one row per user id, one column per statistic.

# Usage count of every feature per user. crosstab tallies the
# (id, feature_name) pairs directly, so the result does not depend on how
# pivot_table aggregates a frame that has no value column left.
users_usage_summaries = pd.crosstab(df_usage['id'], df_usage['feature_name'])

# Mean of 'accepted' over each user's usage rows.
accepted_rate = df_usage.groupby(['id'])['accepted'].mean().to_frame()
# Mean of 'churned' over each user's rows in joined_df; used as the target column.
churned = joined_df.groupby(['id'])['churned'].mean().to_frame()
users_usage_summaries = users_usage_summaries.join(accepted_rate, how='left').join(churned, how='left')

# Preview the resulting table:
users_usage_summaries.head(10)


In [None]:
# Average value of the 'Darkroom' column across users.
users_usage_summaries.Darkroom.mean()

# Optimizing

### Importing main functionalities for setup

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

### Creating scaler

In [None]:
# Feature standardizer, used as the first step of the pipeline built below.
scaler = StandardScaler()

## Creating grid search

### parameters grid

In [None]:
# Hyper-parameter grid for the MLP search. This is a deliberately small grid;
# a wider one tried earlier was:
#   max_iter [500, 1000, 1500], alpha 10.0 ** -np.arange(1, 7),
#   hidden_layer_sizes np.arange(5, 12), random_state 0..9
parameters = dict(
    solver=['lbfgs'],
    max_iter=[500],
    alpha=np.power(10.0, -np.arange(1, 3)),   # 0.1, 0.01
    hidden_layer_sizes=np.arange(5, 7),       # 5, 6
    random_state=[0, 1],
)
parameters

## Creating grid object

### algorithm instance

In [None]:
from sklearn.neural_network import MLPClassifier
# Base estimator with default settings; the grid search supplies the hyper-parameters.
algorithm = MLPClassifier()

### Grid search instance

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation on all
# available cores; the best configuration is refit afterwards.
gridCV = GridSearchCV(
    estimator=algorithm,
    param_grid=parameters,
    n_jobs=-1,
    cv=5,
    refit=True,
)

In [None]:
# Pipeline: standardize the features, then run the grid search on the scaled data.
# NOTE(review): the scaler sits outside the grid search, so it is fitted on
# all rows before the CV folds are formed - confirm this is intended.
clf = make_pipeline(scaler, gridCV)

In [None]:
# Feature matrix: every column of the summary table except the target.
X = users_usage_summaries.drop(columns='churned').values
# Target vector: the per-user 'churned' column.
y = users_usage_summaries.loc[:, 'churned'].values

# Hold out 5% of the users for validation. The decision-tree and metrics
# cells below read X_train / X_val / y_train / y_val, so these names must be
# defined; random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=0)

In [None]:
# Fit the scaler and run the grid search on the full feature matrix.
clf.fit(X, y)

In [None]:
# Hyper-parameter combination selected by the grid search.
gridCV.best_params_

In [None]:
# Second pipeline step; `steps` entries are (name, estimator) tuples, so this
# is the tuple for the grid-search step rather than the estimator alone.
gridRes = clf.steps[1]

In [None]:
# Display the grid-search pipeline step.
gridRes

In [None]:
# Decision tree that considers 4 features at each split, trained on the
# training split and evaluated on the validation split.
algo = tree.DecisionTreeClassifier(max_features=4)
algo.fit(X_train, y_train)

# fit() returns the estimator itself, so `train` is the fitted tree.
train = algo
res = algo.predict(X_val)

In [None]:
# Evaluate the validation predictions: confusion matrix, per-class report,
# and finally the F1 score of the positive class rounded to 3 decimals.
cm = confusion_matrix(y_val, res)
print(cm)
print(classification_report(y_val, res))
positive_f1 = f1_score(y_val, res)
print(np.round(positive_f1, 3))

In [None]:
#test your predictor:

#1.Prepare the test set with exactly the same feature layout as the training table.
# Read from the same directory as the training files.
df_usage_test = pd.read_csv(path + "test_usage_data.csv")

# Usage count of every feature per test user (rows: user id, columns: feature name).
users_usage_summaries_test = pd.crosstab(df_usage_test['id'], df_usage_test['feature_name'])

# Mean of 'accepted' over each test user's usage rows.
accepted_rate_test = df_usage_test.groupby(['id'])['accepted'].mean().to_frame()

#Let's merge the two:
users_usage_summaries_test = users_usage_summaries_test.join(accepted_rate_test, how='left')

# Align the columns with the training features (all training columns except
# the target): features absent from the test data become all-zero columns,
# features unseen in training are dropped, and the column order matches X.
# Without this, a differing feature set gives predict() a wrong-shaped or
# mis-ordered matrix.
train_feature_columns = users_usage_summaries.columns[users_usage_summaries.columns != 'churned']
users_usage_summaries_test = users_usage_summaries_test.reindex(columns=train_feature_columns, fill_value=0)
X_test = users_usage_summaries_test.values

In [None]:
# (number of users, number of columns) of the training summary table.
users_usage_summaries.shape

In [None]:
# Write the submission: one row per test user id with its predicted
# 'churned' value, everything stored as strings and quoted in the CSV.
pred = train.predict(X_test)
test_user_ids = users_usage_summaries_test.index.astype(str)
df = pd.DataFrame(pred, index=test_user_ids, columns=['churned'], dtype=str)
df.to_csv(out_name, header=True, quoting=csv.QUOTE_NONNUMERIC)