In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
import plotly.express as px

In [2]:
# Load the labelled training data from the project's src folder.
train_csv_path = "../src/conversion_data_train.csv"
df = pd.read_csv(train_csv_path)

## EDA

In [3]:
# Size of the training frame as (rows, columns).
df.shape

(284580, 6)

In [4]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [5]:
# Share of missing values per column, shown as a whole-number percentage.
missing_ratio = df.isna().sum() / len(df)
missing_ratio.map(lambda ratio: f"{round(ratio * 100)} %")

country                0 %
age                    0 %
new_user               0 %
source                 0 %
total_pages_visited    0 %
converted              0 %
dtype: object

In [6]:
# Work on a fixed-size random subset so the EDA plots stay responsive.
# The sample is seeded (same seed as the train/test split) so the plots are
# reproducible across "Restart & Run All", and the size is capped so the cell
# does not raise if the frame has fewer than 10,000 rows.
df_sample = df.sample(n=min(10_000, len(df)), random_state=0)

In [8]:
# One histogram per column of the sampled frame (features and target alike).
for col_name in df_sample.columns:
    px.histogram(df_sample[col_name]).show()

## Pipeline

In [16]:
# Column groups available in the dataset.
numeric_cols = ["age", "new_user", "total_pages_visited"]
categorical_cols = ["country", "source"]

# Baseline: the model only sees the last numeric column
# ("total_pages_visited"); everything else is ignored for now.
target = "converted"
features = numeric_cols[-1:]

# Feature matrix (kept as a DataFrame) and label vector.
Y = df[target]
X = df[features]

In [32]:
# Hold out 10 % of the rows for evaluation; the seed makes the split repeatable.
split_options = {"test_size": 0.1, "random_state": 0}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [33]:
# Scaler for the numeric feature matrix. It is only instantiated here;
# fitting happens on the training split in the Training section.
numeric_encoder = StandardScaler()

## Training

In [34]:
# Fit the scaler on the training split only, then scale that same split.
# NOTE: X_train is overwritten, so re-running this cell without re-running
# the split cell would re-fit the scaler on already-scaled data.
numeric_encoder.fit(X_train)
X_train = numeric_encoder.transform(X_train)

In [35]:
# Baseline classifier with library-default hyper-parameters,
# trained on the scaled training features.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

In [36]:
# Predictions on the (scaled) training features, used later for the
# train-set f1-score and confusion matrix.
Y_train_pred = model.predict(X_train)

## Testing

In [37]:
# Scale the test features with the scaler already fitted on the training
# split (transform only, no re-fit).
# NOTE: X_test is overwritten, so this cell must run exactly once per split.
X_test = numeric_encoder.transform(X_test)

In [38]:
# Predictions on the scaled held-out features, used later for the
# test-set f1-score and confusion matrix.
Y_test_pred = model.predict(X_test)

## Performance

In [39]:
# Compare f1 on both splits to spot over- or under-fitting.
train_f1 = f1_score(Y_train, Y_train_pred)
test_f1 = f1_score(Y_test, Y_test_pred)
print("f1-score on train set :", train_f1)
print("f1-score on test set :", test_f1)

f1-score on train set : 0.6938517686692869
f1-score on test set : 0.7060240963855422


In [40]:
# Confusion matrices for both splits (rows: true class, columns: predicted).
train_confusion = confusion_matrix(Y_train, Y_train_pred)
test_confusion = confusion_matrix(Y_test, Y_test_pred)
print("Confusion matrix on train set : ")
print(train_confusion)
print("Confusion matrix on test set : ")
print(test_confusion)

Confusion matrix on train set : 
[[246817   1082]
 [  3280   4943]]
Confusion matrix on test set : 
[[27384   117]
 [  371   586]]


## In production

In [None]:
# Score the unlabelled production file with the fitted scaler + model.
df_prod = pd.read_csv("../src/conversion_data_test.csv")

# The scaler returns a new array rather than modifying its input, so the
# result must be kept: the model was trained on scaled features and would
# give wrong predictions on raw page counts. The DataFrame (not `.values`)
# is passed so the column names match those seen when the scaler was fitted.
X_prod = numeric_encoder.transform(df_prod[features])
Y_prod_pred = model.predict(X_prod)

# Sanity check: exactly one prediction per production row.
assert len(Y_prod_pred) == len(df_prod)

In [49]:
# Write the production predictions as a single-column CSV (no index column).
predictions_path = "../deliveries/conversion_data_test_predictions_TRISTAN-logregbaseline.csv"
df_predictions = pd.DataFrame({"Prediction": Y_prod_pred})
df_predictions.to_csv(predictions_path, index=False)