In [1]:
import pandas as pd
import os
import boto3 
from io import StringIO
import csv
import numpy as np
from datetime import datetime

# Reading Clean Sessions

In [3]:
sessions = pd.read_csv("../clean_sessions.csv")

In [6]:
sessions_df = sessions.drop(columns=["Unnamed: 0"])

# Cleaning Sessions (do not need to do again)

In [42]:
sessions = sessions_file.split("\n")
events = events_file.split("\n")

In [43]:
session_header = sessions[0].split(",")
event_header = events[0].split(",")

In [44]:
sessions = sessions[1:]
events = events[1:]

In [50]:
sessions = [row.split(",") for row in sessions]

In [64]:
session_header = session_header[1], session_header[2], session_header[5], session_header[6]\
, session_header[7], session_header[8], session_header[9], session_header[10], session_header[11], \
session_header[12], session_header[19], session_header[21]

In [68]:
new_sessions = []
for row in sessions:
    if len(row) == 22:
        new_sessions.append([row[1], row[2], row[5], row[6], row[7], row[8], row[9], row[10], row[11], 
                            row[12], row[19], row[21]])

In [134]:
sessions_df = pd.DataFrame(new_sessions, columns=session_header)

## Filtering

In [135]:
sessions_df = sessions_df.drop(sessions_df[(sessions_df.is_session == "false") | 
                                           (sessions_df.is_developer == "true") |
                                           (sessions_df.is_wau == "true") | 
                                           (sessions_df.is_mau == "true")].index)

In [136]:
sessions_df = sessions_df.drop(columns=["is_session", "is_developer", "is_wau", "is_mau"])

In [137]:
sessions_df.start_timestamp = sessions_df.start_timestamp.astype(float)
sessions_df.previous_sessions_duration = sessions_df.previous_sessions_duration.astype(float)
sessions_df.user_created_timestamp = sessions_df.user_created_timestamp.astype(float)

In [138]:
sessions_df.start_timestamp = pd.to_datetime(sessions_df.start_timestamp, unit='ms')
sessions_df.user_created_timestamp = pd.to_datetime(sessions_df.user_created_timestamp, unit='ms')

In [140]:
sessions_df = sessions_df.sort_values(by=["start_timestamp", "user_id_hash", "session_id"])

In [143]:
sessions_df.to_csv("clean_sessions.csv")

# Read Events

In [11]:
bucket_name = 'ml2-group8' # Add your bucket name
events = "events.csv"
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name) 
events_obj = bucket.Object(key=events) # S3 uses key-value structure where key is your file name
events_file = events_obj.get()["Body"].read().decode('utf-8') # Read the Body which is the contents of the file.

In [13]:
event_str = StringIO(events_file)

In [15]:
events_df = pd.read_csv(event_str, sep=",", error_bad_lines=False)

## Clean Events

In [18]:
events_df = events_df.drop(columns=["event_value"])

In [21]:
events_df.event_timestamp = pd.to_datetime(events_df.event_timestamp, unit='ms')

### Get purchase column

In [31]:
purchases = [1 if x == "8" else 0 for x in events_df.event]

In [33]:
events_df["purchases"] = purchases

In [318]:
events_df.to_csv("../clean_events.csv")

In [40]:
dec_events = events_df[events_df.event_timestamp >= "2018-12-01 00:00:00.000"].sort_values(by="event_timestamp")

### Split data 

In [43]:
week1 = dec_events[dec_events.event_timestamp < "2018-12-08 00:00:00.000"]
week2 = dec_events[dec_events.event_timestamp < "2018-12-15 00:00:00.000"]

In [66]:
week1_buyers = week1[["user_id_hash", "purchases"]].groupby("user_id_hash").sum()
week1_buyers["week1_purchaser"] = [1 if x > 0 else 0 for x in week1_buyers.purchases]
week1_buyers = week1_buyers.drop(columns=["purchases"])

In [67]:
week2_buyers = week2[["user_id_hash", "purchases"]].groupby("user_id_hash").sum()
week2_buyers["week2_purchaser"] = [1 if x > 0 else 0 for x in week2_buyers.purchases]
week2_buyers = week2_buyers.drop(columns=["purchases"])

In [68]:
dec_df = week1_buyers.join(week2_buyers, on="user_id_hash", how="outer")

### Create training set

In [71]:
train_df = sessions_df[sessions_df.start_timestamp < "2018-12-01 00:00:00.000"]

In [89]:
unique_users = pd.DataFrame(train_df["user_id_hash"].unique(), columns=["user_id_hash"])

### Get count of sesssions per user

In [91]:
session_counts = train_df[["user_id_hash", "session_index"]].groupby("user_id_hash").max()

In [92]:
unique_users = unique_users.join(session_counts, on="user_id_hash", how="outer")

### Get total duration of sessions per user

In [140]:
duration_sum = train_df[["user_id_hash", "previous_sessions_duration"]].groupby("user_id_hash").sum()

### Get max session

In [142]:
session_max = train_df[["user_id_hash", "session_index"]].groupby("user_id_hash").max()

### Duration / Session Ratio

In [145]:
dur_ses = duration_sum.join(session_max, on="user_id_hash", how="outer")

In [147]:
dur_ses["duration_session_ratio"] = dur_ses.previous_sessions_duration / dur_ses.session_index

In [149]:
dur_ses = dur_ses.drop(columns=["previous_sessions_duration", "session_index"])

In [151]:
unique_users = unique_users.join(dur_ses, on="user_id_hash", how="outer")

In [155]:
unique_users = unique_users.join(dec_df, on="user_id_hash", how="left").fillna(value=0)

In [158]:
unique_users.to_csv("../train.csv")

# Model Creation

In [262]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [228]:
train, test = unique_users.iloc[:495095], unique_users.iloc[495095:]

In [247]:
X_train = train.drop(columns=["user_id_hash", "week1_purchaser", "week2_purchaser", "user_enc", "hashed"])
X_test = test.drop(columns=["user_id_hash", "week1_purchaser", "week2_purchaser", "user_enc", "hashed"])
y1_train = train.week1_purchaser
y2_train = train.week2_purchaser
y1_test = test.week1_purchaser
y2_test = test.week2_purchaser

In [230]:
lr = LogisticRegression()

In [243]:
for col in X_train:
    X_train[col] = X_train[col].values

In [249]:
lr_week1 = lr.fit(X_train, y1_train.values)
lr_week2 = lr.fit(X_train, y2_train.values)



In [293]:
week1_pred = lr_week1.predict_proba(X_test)[:,0]
week1_pred2 = lr_week1.predict_proba(X_train)[:,0]
week2_pred = lr_week2.predict_proba(X_test)[:,0]
week2_pred2 = lr_week2.predict_proba(X_train)[:,0]

In [267]:
roc_auc_score(y1_test, week1_pred)

0.8503998173169813

In [268]:
roc_auc_score(y2_test, week2_pred)

0.8438595898785237

In [294]:
roc_auc_score(y1_train, week1_pred2)

0.9336167337075638

In [295]:
roc_auc_score(y2_train, week2_pred2)

0.9217193156098248

# Predictions

In [269]:
bucket_name = 'ml2-group8' # Add your bucket name
events = "sample_submission_2.csv"
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name) 
prediction_obj = bucket.Object(key=events) # S3 uses key-value structure where key is your file name
prediction_file = prediction_obj.get()["Body"].read().decode('utf-8') # Read the Body which is the contents of the file.

In [309]:
X = unique_users[["session_index", "duration_session_ratio"]]
y1 = unique_users.week1_purchaser
y2 = unique_users.week2_purchaser

In [310]:
full_week1 = lr.fit(X, y1)
full_week2 = lr.fit(X, y2)



In [313]:
full_week1_pred = full_week1.predict_proba(X)[:,0]
full_week2_pred = full_week2.predict_proba(X)[:,0]

In [277]:
predictions = prediction_file.split("\n")

In [278]:
prediction_header = predictions[0].split(",")
predictions = predictions[1:]

In [279]:
prediction_header

['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days']

In [280]:
for row in range(len(predictions)):
    predictions[row] = predictions[row].split(",")

In [282]:
pred_df = pd.DataFrame(predictions, columns=prediction_header)

In [319]:
pred_df.to_csv("../sample_submission.csv")

In [287]:
prediction_users = np.array(pred_df.user_id_hash.unique())

In [288]:
our_users = np.array(unique_users.user_id_hash)