In [22]:
import pandas as pd
from xgboost import XGBClassifier
import pickle

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Load the training table from its parquet file (path is relative to the notebook's working directory).
train_path = 'sets/train_data.parquet'
train_data = pd.read_parquet(train_path)

In [8]:
# Split the frame into target and model inputs.
# The label column and transaction_id are both excluded from the inputs.
y_train = train_data['y']
X_train = train_data.drop(['y', "transaction_id"], axis=1)

In [18]:
# Summary statistics of the input columns; the bare last expression is what the cell displays.
feature_summary = X_train.describe()
feature_summary

Unnamed: 0,category,alley_id,order_rate,reorder_rate,n_transactions,avg_items_in_transaction,last_transaction,n_transactions_since_last,n_transactions__client_item,reorder_rate__client_item,day_of_week,time_of_day
count,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0,5632630.0
mean,10.206122,70.932405,0.006904775,0.539777,25.607568,11.10331,15.994724,9.612844,2.353513,0.299311,2.769439,13.580566
std,6.235037,38.076018,0.01715602,0.16176,22.350113,5.606028,17.329369,13.514316,3.246146,0.371861,2.163596,4.233456
min,1.0,1.0,3.354166e-07,0.0,1.0,0.25,1.0,0.0,1.0,0.0,0.0,0.0
25%,4.0,32.0,0.000315627,0.447738,9.0,7.15,4.0,1.0,1.0,0.0,1.0,10.0
50%,11.0,81.0,0.001300075,0.564538,18.0,10.035714,10.0,4.0,1.0,0.0,3.0,14.0
75%,16.0,106.0,0.005029572,0.65514,35.0,13.923077,21.0,12.0,2.0,0.666667,5.0,17.0
max,21.0,134.0,0.1361879,1.0,99.0,74.0,99.0,98.0,89.0,1.0,6.0,23.0


In [12]:
# Columns that get one-hot encoded; every other input column is handled as numeric.
cat_features = ["category", "alley_id", "day_of_week", "time_of_day"]
_categorical = set(cat_features)
# Keep the original column order of X_train for the numeric list.
num_features = [col for col in X_train.columns if col not in _categorical]

In [19]:
# Column-wise preprocessing:
#   * 'num' - min-max scales the columns listed in num_features
#   * 'cat' - one-hot encodes the columns listed in cat_features; categories that
#             were not seen during fit are encoded as all zeros (handle_unknown='ignore')
# Any column in neither list is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            # Imputation step currently disabled:
            # ("imputer", SimpleImputer(strategy='median')),
            ("min_max_scaler", MinMaxScaler())
        ]), num_features),
        ('cat', Pipeline([
            # Imputation step currently disabled:
            # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            # The encoder returns a sparse matrix by default. The explicit
            # `sparse=True` keyword was dropped because it was renamed to
            # `sparse_output` in scikit-learn 1.2 and removed in 1.4; relying on
            # the default keeps the same output on old and new versions.
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_features),
    ],
    remainder='passthrough'
)

In [20]:
# Fit the column transformer on the training inputs and encode them in the same pass.
# The resulting matrix is what the XGBoost model is trained on further down.
X_train_processed = preprocessor.fit_transform(X_train)



In [23]:
# Class frequencies of the target, counted once (the original scanned y_train twice).
# Assumes the labels are 0 (negative) and 1 (positive) - TODO confirm against the data.
class_counts = y_train.value_counts()

xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    # negative/positive ratio, so the positive class is re-weighted accordingly
    scale_pos_weight=class_counts[0] / class_counts[1]
)

# Train on the already-encoded training matrix.
xgb.fit(X_train_processed, y_train)

# Bundle the preprocessor and the trained model so the pickled object accepts
# raw feature frames. Both steps are expected to be fitted at this point
# (the preprocessor when X_train_processed was produced), so the pipeline
# itself is not fitted again here.
classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb),
])

# Persist the whole pipeline; pickle needs a binary file handle.
with open('xgboost.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# Reload the persisted pipeline.
# NOTE: unpickling can execute arbitrary code - only load files you created or trust.
with open('xgboost.pkl', 'rb') as f:  # pickle data is bytes, so the file must be opened in binary mode
    clf = pickle.load(f)