In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV

from sklearn.tree import DecisionTreeRegressor

In [2]:
pd.options.mode.chained_assignment = None

In [27]:
campaigns = pd.read_csv('data/campaigns.csv')

In [28]:
campaigns = campaigns.drop(['UTCDATE', 'CAMPAIGNID', 'CREATIVEID', 'SDK'], axis=1)

In [29]:
campaigns = pd.get_dummies(campaigns, columns=['PLATFORM'])

In [30]:
t = round(0.8*campaigns.shape[0])
c = campaigns.iloc[:t, :]
cX = c.loc[:,c.columns != 'SESSIONSWITHINTERACTION'].to_numpy()
cy = c.loc[:,'SESSIONSWITHINTERACTION'].to_numpy()
c_test = campaigns.iloc[t:, :]
cX_test = c_test.loc[:,c_test.columns != 'SESSIONSWITHINTERACTION'].to_numpy()
cy_test = c_test.loc[:,'SESSIONSWITHINTERACTION'].to_numpy()

In [38]:
lr = LinearRegression().fit(cX, cy)
y_predlr = lr.predict(cX_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_predlr))}")

RMSE: 286.691817707513


In [38]:
LASSO_LR_CONFIG = dict(
    max_iter=1e7,
    alphas=[0.0001, 0.001, 0.01, 0.1, 0.5, 1, 3, 5, 10, 20, 50],
    random_state=42,
    cv=KFold(n_splits=10)
)

lasso_reg = LassoCV(**LASSO_LR_CONFIG)
lasso_reg.fit(cX, cy)

y_lasso = lasso_reg.predict(cX_test)

In [39]:
print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_lasso))}")

RMSE: 286.69181373431667


In [34]:
RIDGE_LR_CONFIG = dict(
    alphas=[1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 20, 30, 50],
    cv=KFold(n_splits=10)
)
ridge_reg = RidgeCV(**RIDGE_LR_CONFIG)
ridge_reg.fit(cX, cy)


In [40]:
y_ridge = ridge_reg.predict(cX_test)

print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_ridge))}")

RMSE: 286.76473151119797


Mean value predictor:

In [41]:
print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, np.repeat(np.mean(cy), len(cy_test))))}")

RMSE: 295.3052617977782


Median value predictor:

In [42]:
print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, np.repeat(np.median(cy), len(cy_test))))}")


RMSE: 296.34772337260983


In [43]:
dt = DecisionTreeRegressor(criterion='friedman_mse').fit(cX, cy)
y_dt = dt.predict(cX_test)

print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_dt))}")

RMSE: 380.4075988733357


In [44]:
dtmse = DecisionTreeRegressor(criterion='mse').fit(cX, cy)
y_dt = dtmse.predict(cX_test)
np.sqrt(mean_squared_error(cy_test, y_dt))
print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_dt))}")

RMSE: 381.1859649797657


In [54]:
sample_index = np.random.choice(range(cX.shape[0]), 20000, replace=False)
samp = cX[sample_index, :]
y_sample = cy[sample_index]
dtmae = DecisionTreeRegressor(criterion='mae', max_depth=100).fit(samp, y_sample)
y_dt = dtmae.predict(cX_test)

print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_dt))}")

RMSE: 355.8351330527831


In [31]:
knr = KNeighborsRegressor().fit(cX, cy)

In [55]:
y_knr = dt.predict(cX_test)

print(f"RMSE: {np.sqrt(mean_squared_error(cy_test, y_knr))}")

RMSE: 380.4075988733357


Models trained only where there is 10 or more requested sessions.

In [47]:
campaigns_big = campaigns[campaigns['REQUESTEDSESSIONS'] >= 10]
print(f"Percentage of rows with less than 10 requested sessions: {round(campaigns.shape[0] - campaigns_big.shape[0]) / campaigns.shape[0] * 100}")

Percentage of rows with less than 10 requested sessions: 31.614593718253193


In [48]:
tb = round(0.8*campaigns_big.shape[0])
cb = campaigns_big.iloc[:tb, :]
cXb = cb.loc[:,cb.columns != 'SESSIONSWITHINTERACTION'].to_numpy()
cyb = cb.loc[:,'SESSIONSWITHINTERACTION'].to_numpy()
cb_test = campaigns_big.iloc[tb:, :]
cXb_test = cb_test.loc[:,cb_test.columns != 'SESSIONSWITHINTERACTION'].to_numpy()
cyb_test = cb_test.loc[:,'SESSIONSWITHINTERACTION'].to_numpy()

In [49]:
lr = LinearRegression().fit(cXb, cyb)
yb_predlr = lr.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, yb_predlr))}")

RMSE: 344.0679552089317


In [50]:
LASSO_LR_CONFIG = dict(
    max_iter=1e7,
    alphas=[0.0001, 0.001, 0.01, 0.1, 0.5, 1, 3, 5, 10, 20, 50],
    random_state=42,
    cv=KFold(n_splits=10)
)

lasso_reg = LassoCV(**LASSO_LR_CONFIG)
lasso_reg.fit(cXb, cyb)

y_lasso = lasso_reg.predict(cXb_test)

print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_lasso))}")

RMSE: 344.0679510056945


In [55]:
RIDGE_LR_CONFIG = dict(
    alphas=[1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 20, 30, 50],
    cv=KFold(n_splits=10)
)
ridge_reg = RidgeCV(**RIDGE_LR_CONFIG)
ridge_reg.fit(cXb, cyb)

In [52]:
y_ridge = ridge_reg.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_ridge))}")



RMSE: 344.06795520862636


Mean value predictor:

In [53]:
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, np.repeat(np.mean(cyb), len(cyb_test))))}")

RMSE: 353.7579901949412


Median value predictor:

In [46]:
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, np.repeat(np.median(cyb), len(cyb_test))))}")

RMSE: 355.3753394750715


In [56]:
dt = DecisionTreeRegressor(criterion='friedman_mse').fit(cXb, cyb)
y_dt = dt.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_dt))}")

RMSE: 455.8194439655283


In [57]:
dtmse = DecisionTreeRegressor(criterion='mse').fit(cXb, cyb)
y_dt = dtmse.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_dt))}")

RMSE: 454.08468710995095


In [58]:
sample_index = np.random.choice(range(cXb.shape[0]), 20000, replace=False)
samp = cXb[sample_index, :]
y_sample = cyb[sample_index]
dtmae = DecisionTreeRegressor(criterion='mae', max_depth=100).fit(samp, y_sample)
y_dt = dtmae.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_dt))}")

RMSE: 427.48432121180736


In [59]:
knr = KNeighborsRegressor().fit(cXb, cyb)

In [60]:
y_knr = dt.predict(cXb_test)
print(f"RMSE: {np.sqrt(mean_squared_error(cyb_test, y_knr))}")

RMSE: 455.8194439655283
