In [7]:
# import section

import pandas as pd
pd.set_option('display.max_columns', 500)
from urllib import request
import json
from sklearn.preprocessing import OneHotEncoder, Normalizer, Binarizer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [8]:
data_path = './data/'

In [9]:
train_accounts = pd.read_csv(data_path + 'train_accounts.csv')
train_users = pd.read_csv(data_path + 'train_users.csv')
train_events = pd.read_csv(data_path + 'train_events.csv')
train_subscriptions = pd.read_csv(data_path + 'train_subscriptions.csv')
test_accounts = pd.read_csv(data_path + 'test_accounts.csv')
test_users = pd.read_csv(data_path + 'test_users.csv')
test_events = pd.read_csv(data_path + 'test_events.csv')
test_subscriptions = pd.read_csv(data_path + 'test_subscriptions.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
# We map our features into different types
categorical_features = ['os']
normalized_features = ['collection_21_days']
binary_features = ['plan_id']
untouched_features = ['paying']
target = ['lead_score']

# And create a column transformer to handle the manipulation for us
preprocess = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (Normalizer(), normalized_features),
    (Binarizer(), binary_features)
)

In [60]:
train_accounts.set_index('account_id', inplace=True)
test_accounts.set_index('account_id', inplace=True)

# Getting only the relevant features from the dataset
dataset_train = train_accounts[categorical_features + normalized_features + binary_features + untouched_features + target]
dataset_test = test_accounts[categorical_features + normalized_features + binary_features + untouched_features]

# Filling empty values with default values 
def fill_empty_values(dataset):
    dataset.loc[:,categorical_features] = dataset[categorical_features].fillna('')
    dataset.loc[:,normalized_features + binary_features + untouched_features] = dataset[normalized_features + binary_features + untouched_features].fillna(0)
    return dataset

dataset_train = fill_empty_values(dataset_train)
dataset_test = fill_empty_values(dataset_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [61]:
# Seperating the label
y = dataset_train.pop('lead_score')
# We fit our column transformer on both the train and the test sets
concatenated = pd.concat([dataset_train, dataset_test])
preprocess.fit(concatenated)

# We use transform to finally manipulate the features of our training set
dataset_train = dataset_train[concatenated.columns]
x = preprocess.transform(dataset_train)

In [62]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)



In [63]:
print(classification_report(y_test, y_pred, target_names=['not lead','lead']))
print('Acc:  {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('MCC: {}'.format(metrics.matthews_corrcoef(y_test, y_pred)))
print('F1:  {}'.format(metrics.f1_score(y_test, y_pred)))

              precision    recall  f1-score   support

    not lead       0.98      1.00      0.99     66427
        lead       0.00      0.00      0.00      1672

    accuracy                           0.98     68099
   macro avg       0.49      0.50      0.49     68099
weighted avg       0.95      0.98      0.96     68099

Acc:  0.9754475102424411
MCC: 0.0
F1:  0.0


  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  'precision', 'predicted', average, warn_for)


In [79]:
dataset_test = dataset_test[concatenated.columns]
x_submission = preprocess.transform(dataset_test)
y_pred_submission = model.predict(x_submission)
# Creating a dictionary where the keys are the account_ids
# and the values are your predictions
submission_account_ids = test_accounts.index
predictions = dict(zip(submission_account_ids, map(int, y_pred_submission)))

In [81]:
group_name = 'fRidaY'

In [82]:
# We validate first that we actually send all the test accounts expected to be sent
if y_pred_submission.shape[0] != 71683 or submission_account_ids.shape[0] != 71683:
  raise Exception("You have to send all of the accounts! Expected: (71683, 71683), Got: ({}, {})".format(y_pred_submission.shape[0], submission_account_ids.shape[0]))

if "group_name" not in vars() or group_name == "":
  group_name = input("Please enter your group's name:")

data = json.dumps({'submitter': group_name, 'predictions': predictions}).encode('utf-8')

req = request.Request("https://leaderboard.datahack.org.il/monday/api/",
                      headers={'Content-Type': 'application/json'},
                      data=data)

res = request.urlopen(req)
print(json.load(res))

{'member': 'fRidaY', 'rank': 1, 'score': 194854.59630942973}
