<h1>imports</h1>

In [22]:
import pandas as pd
pd.set_option('display.max_columns', 500)
from urllib import request
import json
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, Normalizer, Binarizer, 
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

<h1>load data</h1>

In [23]:
data_path = './data/'

In [24]:
train_accounts = pd.read_csv(data_path + 'train_accounts.csv')
# train_users = pd.read_csv(data_path + 'train_users.csv')
# train_events = pd.read_csv(data_path + 'train_events.csv')
# train_subscriptions = pd.read_csv(data_path + 'train_subscriptions.csv')
test_accounts = pd.read_csv(data_path + 'test_accounts.csv')
# test_users = pd.read_csv(data_path + 'test_users.csv')
# test_events = pd.read_csv(data_path + 'test_events.csv')
# test_subscriptions = pd.read_csv(data_path + 'test_subscriptions.csv')

<h1>feature engineering</h1>

In [25]:
def remove_outlier(df_in, col_name):
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    return df_out

In [26]:
# transform plan_id & utm_cluster_id to str since its categorical
train_accounts['plan_id'] = train_accounts['plan_id'].astype(str)
train_accounts['utm_cluster_id'] = train_accounts['utm_cluster_id'].astype(str)
test_accounts['plan_id'] = test_accounts['plan_id'].astype(str)
test_accounts['utm_cluster_id'] = test_accounts['utm_cluster_id'].astype(str)

In [27]:
# creating time features
time_between_created_trial = pd.to_datetime(train_accounts['trial_start']) - pd.to_datetime(train_accounts['created_at'])
time_between_created_subscription = pd.to_datetime(train_accounts['subscription_started_at']) - pd.to_datetime(train_accounts['created_at'])
time_between_trial_subscription = pd.to_datetime(train_accounts['subscription_started_at']) - pd.to_datetime(train_accounts['trial_start'])
time_between_now_trial = datetime.now() - pd.to_datetime(train_accounts['trial_start'])
time_between_now_subscription = datetime.now() - pd.to_datetime(train_accounts['subscription_started_at'])
time_between_now_created = datetime.now() - pd.to_datetime(train_accounts['created_at'])
train_accounts = train_accounts.assign(created_trial_delta=time_between_created_trial.apply(lambda x: (x.seconds//3600)))
train_accounts = train_accounts.assign(created_subscription_delta=time_between_created_subscription.apply(lambda x: (x.seconds//3600)))
train_accounts = train_accounts.assign(trial_subscription_delta=time_between_trial_subscription.apply(lambda x: (x.seconds//3600)))
train_accounts = train_accounts.assign(now_trial_delta=time_between_now_trial.apply(lambda x: (x.seconds//3600)))
train_accounts = train_accounts.assign(now_subscription_delta=time_between_now_subscription.apply(lambda x: (x.seconds//3600)))
train_accounts = train_accounts.assign(now_created_delta=time_between_now_created.apply(lambda x: (x.seconds//3600)))
train_accounts['is_subscription'] = (train_accounts.subscription_started_at.isna()).astype(int)

time_between_created_trial = pd.to_datetime(test_accounts['trial_start']) - pd.to_datetime(test_accounts['created_at'])
time_between_created_subscription = pd.to_datetime(test_accounts['subscription_started_at']) - pd.to_datetime(test_accounts['created_at'])
time_between_trial_subscription = pd.to_datetime(test_accounts['subscription_started_at']) - pd.to_datetime(test_accounts['trial_start'])
time_between_now_trial = datetime.now() - pd.to_datetime(test_accounts['trial_start'])
time_between_now_subscription = datetime.now() - pd.to_datetime(test_accounts['subscription_started_at'])
time_between_now_created = datetime.now() - pd.to_datetime(test_accounts['created_at'])
test_accounts = test_accounts.assign(created_trial_delta=time_between_created_trial.apply(lambda x: (x.seconds//3600)))
test_accounts = test_accounts.assign(created_subscription_delta=time_between_created_subscription.apply(lambda x: (x.seconds//3600)))
test_accounts = test_accounts.assign(trial_subscription_delta=time_between_trial_subscription.apply(lambda x: (x.seconds//3600)))
test_accounts = test_accounts.assign(now_trial_delta=time_between_now_trial.apply(lambda x: (x.seconds//3600)))
test_accounts = test_accounts.assign(now_subscription_delta=time_between_now_subscription.apply(lambda x: (x.seconds//3600)))
test_accounts = test_accounts.assign(now_created_delta=time_between_now_created.apply(lambda x: (x.seconds//3600)))
test_accounts['is_subscription'] = (test_accounts.subscription_started_at.isna()).astype(int)

In [28]:
# creating size & survey features

train_accounts = remove_outlier(train_accounts,'company_size')
train_accounts.loc[:,'avg_team_size'] = train_accounts[["min_team_size", "max_team_size"]].mean(axis=1)
train_accounts['avg_team_size'].fillna(-1, inplace=True)
train_accounts['survey_answers'] = train_accounts[['company_size','max_team_size','min_team_size','user_goal','user_description','team_size']].isna().sum(axis=1)
train_accounts['survey_did_answer'] = train_accounts['survey_answers']

test_accounts = remove_outlier(test_accounts,'company_size')
test_accounts.loc[:,'avg_team_size'] = test_accounts[["min_team_size", "max_team_size"]].mean(axis=1)
test_accounts['avg_team_size'].fillna(-1, inplace=True)
test_accounts['survey_answers'] = test_accounts[['company_size','max_team_size','min_team_size','user_goal','user_description','team_size']].isna().sum(axis=1)
test_accounts['survey_did_answer'] = test_accounts['survey_answers']

<h1>preprocessing</h1>

In [None]:
bins = sorted((list(train_accounts["max_team_size"].value_counts().index) + [-1.1, -1, ]))
bins_labels = [str(b) for b in bins[1:]]
train_accounts['avg_team_cat'] = pd.cut(train_accounts['avg_team_size'], bins=bins, labels=bins_labels)
bins = sorted((list(test_accounts["max_team_size"].value_counts().index) + [-1.1, -1, ]))
bins_labels = [str(b) for b in bins[1:]]
test_accounts['avg_team_cat'] = pd.cut(test_accounts['avg_team_size'], bins=bins, labels=bins_labels)

train_accounts['avg_team_cat'] = train_accounts['avg_team_cat'].astype(str)
test_accounts['avg_team_cat'] = test_accounts['avg_team_cat'].astype(str)

In [29]:
# We map our features into different types
categorical_features = ['os', 'browser', 'payment_currency', 'device', 'country', 'industry', 'utm_cluster_id',
                        'plan_id', 'avg_team_cat']
normalized_features = ['collection_21_days', 'mrr', 'created_trial_delta', 'created_subscription_delta',
                       'trial_subscription_delta', 'now_trial_delta', 'now_subscription_delta', 'now_created_delta',
                       'company_size', 'survey_answers']
binary_features = ['survey_did_answer']
untouched_features = ['paying', 'is_subscription']
KBinsDiscretized_features = []
target = ['lead_score']

# And create a column transformer to handle the manipulation for us
preprocess = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (Normalizer(), normalized_features),
    (Binarizer(), binary_features),
    (, KBinsDiscretized_features)
)

In [30]:
train_accounts.set_index('account_id', inplace=True)
test_accounts.set_index('account_id', inplace=True)

# Getting only the relevant features from the dataset
dataset_train = train_accounts[categorical_features + normalized_features + binary_features + untouched_features + target]
dataset_test = test_accounts[categorical_features + normalized_features + binary_features + untouched_features]

# Filling empty values with default values 
def fill_empty_values(dataset):
    dataset.loc[:,categorical_features] = dataset[categorical_features].fillna('')
    dataset.loc[:,normalized_features + binary_features + untouched_features] = dataset[normalized_features + binary_features + untouched_features].fillna(0)
    return dataset

dataset_train = fill_empty_values(dataset_train)
dataset_test = fill_empty_values(dataset_test)

ValueError: fill value must be in categories

In [8]:
# Seperating the label
y = dataset_train.pop('lead_score')
# We fit our column transformer on both the train and the test sets
concatenated = pd.concat([dataset_train, dataset_test])
preprocess.fit(concatenated)

# We use transform to finally manipulate the features of our training set
dataset_train = dataset_train[concatenated.columns]
x = preprocess.transform(dataset_train)

<h1>train model</h1>

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
model = LogisticRegression(class_weight='balanced', penalty='l2') # 'penalty': ['l1', 'l2'], 'C': [1, 10, 100, 1000]
model.fit(x_train,y_train)
y_pred = model.predict(x_test)



In [27]:
print(classification_report(y_test, y_pred, target_names=['not lead','lead']))
print('Acc:  {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('MCC: {}'.format(metrics.matthews_corrcoef(y_test, y_pred)))
print('F1:  {}'.format(metrics.f1_score(y_test, y_pred)))


              precision    recall  f1-score   support

    not lead       0.99      0.71      0.83     66427
        lead       0.07      0.81      0.12      1672

    accuracy                           0.71     68099
   macro avg       0.53      0.76      0.47     68099
weighted avg       0.97      0.71      0.81     68099

Acc:  0.7094964683769218
MCC: 0.17473807696080024
F1:  0.12087277251921966


<h1>submit</h1>

In [19]:
dataset_test = dataset_test[concatenated.columns]
x_submission = preprocess.transform(dataset_test)
y_pred_submission = model.predict(x_submission)
# Creating a dictionary where the keys are the account_ids
# and the values are your predictions
submission_account_ids = test_accounts.index
predictions = dict(zip(submission_account_ids, map(int, y_pred_submission)))

In [20]:
group_name = 'fRidaY'

In [21]:
# We validate first that we actually send all the test accounts expected to be sent
if y_pred_submission.shape[0] != 71683 or submission_account_ids.shape[0] != 71683:
  raise Exception("You have to send all of the accounts! Expected: (71683, 71683), Got: ({}, {})".format(y_pred_submission.shape[0], submission_account_ids.shape[0]))

if "group_name" not in vars() or group_name == "":
  group_name = input("Please enter your group's name:")

data = json.dumps({'submitter': group_name, 'predictions': predictions}).encode('utf-8')

req = request.Request("https://leaderboard.datahack.org.il/monday/api/",
                      headers={'Content-Type': 'application/json'},
                      data=data)

res = request.urlopen(req)
print(json.load(res))

HTTPError: HTTP Error 500: INTERNAL SERVER ERROR

In [16]:
set(predictions.values())

{0, 1}