In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, accuracy_score
import joblib

In [63]:
RANDOM_SEED = 42

In [64]:
df = pd.read_parquet('../data/processed/1_train_processed.parquet')

df

Unnamed: 0,id,sales,onpromotion,transactions,dcoilwtico,pay_day,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,0,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,0.000,0.000000,0.202538,0.492910,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,438.133,0.000000,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054344,3000884,154.553,0.001350,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054345,3000885,2419.729,0.199730,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3054346,3000886,121.000,0.010796,0.257362,0.253228,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [65]:
X = df.drop(columns=['id', 'sales'])
y = df['sales']

X.shape, y.shape

((3054348, 189), (3054348,))

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2443478, 189), (610870, 189), (2443478,), (610870,))

In [67]:
len(df[df.sales == 0]) / len(df)

0.31194709967560996

In [68]:
%%time

from sklearn.ensemble import GradientBoostingClassifier

df_chunk = df.sample(10000, random_state=RANDOM_SEED)

df_chunk['is_zero'] = (df_chunk['sales'] == 0).astype(int)

X = df_chunk.drop(columns=['id', 'sales', 'is_zero'])
y = df_chunk['is_zero']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

f"{train_acc = :.2f}, {test_acc = :.2f}"

CPU times: user 1.14 s, sys: 16.4 ms, total: 1.16 s
Wall time: 1.19 s


'train_acc = 0.90, test_acc = 0.89'

In [69]:
from sklearn.ensemble import HistGradientBoostingRegressor

df_non_zero = df_chunk[df_chunk['sales'] > 0]

X_non_zero = df_non_zero.drop(columns=['sales', 'id', 'is_zero'])
y_non_zero = df_non_zero['sales']

X_train, X_test, y_train, y_test = train_test_split(X_non_zero, y_non_zero, test_size=0.2, random_state=42)

reg = HistGradientBoostingRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

f"{mae = :.2f}, {mape = :.2f}%"

'mae = 187.01, mape = 7.70%'

In [70]:
df['is_zero'] = (df['sales'] == 0).astype('bool')

In [71]:
%%time

from sklearn.ensemble import GradientBoostingClassifier

X = df.drop(columns=['id', 'sales', 'is_zero'])
y = df['is_zero']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

f"{train_acc = :.2f}, {test_acc = :.2f}"

CPU times: user 8min 39s, sys: 3.58 s, total: 8min 42s
Wall time: 8min 51s


'train_acc = 0.89, test_acc = 0.89'

In [72]:
%%time

from sklearn.ensemble import HistGradientBoostingRegressor

df_non_zero = df[df.is_zero == False]

X = df_non_zero.drop(columns=['sales', 'id', 'is_zero'])
y = df_non_zero['sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

hgbr = HistGradientBoostingRegressor()
hgbr.fit(X_train, y_train)
y_pred = hgbr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

f"{mae = :.2f}, {mape = :.2f}%"

CPU times: user 2min 55s, sys: 8.79 s, total: 3min 4s
Wall time: 32.9 s


'mae = 132.80, mape = 5.18%'

In [74]:
joblib.dump(clf, '../models/clf_1.pkl')
joblib.dump(hgbr, '../models/hgbr_1.pkl')

['../models/hgbr_1.pkl']

In [75]:
from xgboost import XGBClassifier

X = df.drop(columns=['id', 'sales', 'is_zero'])
y = df['is_zero']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

y_train_pred = xgbc.predict(X_train)
y_test_pred = xgbc.predict(X_test)

train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print(f"{train_acc = :.2f}, {test_acc = :.2f}")

train_acc = 0.92, test_acc = 0.92


In [76]:
from xgboost import XGBRegressor

df_non_zero = df[df.is_zero == False]

X = df_non_zero.drop(columns=['sales', 'id', 'is_zero'])
y = df_non_zero['sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

y_pred = xgbr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"{mae = :.2f}, {mape = :.2f}%")

mae = 112.98, mape = 4.35%


In [77]:
joblib.dump(xgbc, '../models/xgbc_1.pkl')
joblib.dump(xgbr, '../models/xgbr_1.pkl')

['../models/xgbr_1.pkl']