# Training the model

In [2]:
import gzip
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

from my_utilities import build_full_data
from my_utilities import train_for_each_column

In [3]:
# Read the cleaned training and test pickles and stack them into a single
# frame sorted by its 'datetime' column. The 'unique_items' entry is taken
# from the training pickle only.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open('cleaned_data/train_data.pickle', 'rb') as train_file:
    train_payload = pickle.load(train_file)
df_train = train_payload['df']
unique_items = train_payload['unique_items']

with open('cleaned_data/test_data.pickle', 'rb') as test_file:
    test_payload = pickle.load(test_file)
df_test = test_payload['df']

# sort_values returns a new frame, so df_raw is produced in one expression.
df_raw = pd.concat([df_train, df_test]).sort_values(by='datetime')

In [4]:
%%time
# Build the full training data (X is later scaled and used as model input,
# y holds one target column per item), accounting for multiple first items.
# NOTE(review): max_n_first_items presumably caps how many first items are
# considered per sample -- confirm against my_utilities.build_full_data.
max_first_items = 10
X, y = build_full_data(
    df_raw,
    unique_items,
    max_n_first_items=max_first_items,
)

CPU times: user 41.8 s, sys: 4.62 s, total: 46.4 s
Wall time: 47.8 s


In [7]:
%%time
# Train a logistic regression model for each potential purchase item.
# Inputs are rescaled to the [0, 1] range first; the fitted scaler is kept in
# `scaler` because it is saved together with the models in a later cell.
scaler = MinMaxScaler(feature_range=(0, 1))
model = LogisticRegression(max_iter=500, C=1)
print(f"Training {model} for each of the {len(y.columns)} target columns:")
X_scaled = scaler.fit_transform(X)
# NOTE(review): train_for_each_column presumably fits a separate copy of
# `model` for every column of y -- confirm against my_utilities.
models = train_for_each_column(model, X_scaled, y)

Training LogisticRegression(C=1, max_iter=500) for each of the 3000 target columns:


100% (3000 of 3000) |####################| Elapsed Time: 0:05:13 Time:  0:05:13


CPU times: user 33min 12s, sys: 4min 46s, total: 37min 58s
Wall time: 5min 14s


In [8]:
# Persist the trained models together with the fitted scaler in one pickle.
# The bundle is assembled before the file is opened, so a missing variable
# fails before the output file is created or truncated.
trained_bundle = {'scaler': scaler, 'models': models}
with open('trained_model.pickle', 'wb') as model_file:
    pickle.dump(trained_bundle, model_file)

In [9]:
# Also save the full training data (X and y) as a gzip-compressed pickle.
# The payload is assembled before the file is opened, so a missing variable
# fails before the output file is created or truncated.
full_train_data = {'X': X, 'y': y}
with gzip.open('full_train_data.pickle.gz', 'wb') as gz_file:
    pickle.dump(full_train_data, gz_file)