In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [7]:
train = pd.read_csv('data/train.csv')

In [10]:
train.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


In [15]:
train.isna().sum() / len(train)

id          0.000000
date        0.000000
country     0.000000
store       0.000000
product     0.000000
num_sold    0.038548
dtype: float64

In [28]:
[(i, train[i].unique()) if train[i].nunique() <= 10 else (i, train[i].nunique()) for i in train.columns]

[('id', 230130),
 ('date', 2557),
 ('country',
  array(['Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore'],
        dtype=object)),
 ('store',
  array(['Discount Stickers', 'Stickers for Less', 'Premium Sticker Mart'],
        dtype=object)),
 ('product',
  array(['Holographic Goose', 'Kaggle', 'Kaggle Tiers', 'Kerneler',
         'Kerneler Dark Mode'], dtype=object)),
 ('num_sold', 4037)]

In [55]:
X_train = train.drop(['id', 'num_sold'], axis=1)
y_train = train.num_sold

In [61]:
class AddDateFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_new = X.copy()
        X_new['date'] = pd.to_datetime(X_new.date)
        X_new['year'] = X_new.date.dt.year
        X_new['month'] = X_new.date.dt.month
        X_new['day'] = X_new.date.dt.day
        X_new.drop('date', inplace=True)

        return X_new

In [67]:
X_train.columns

Index(['date', 'country', 'store', 'product'], dtype='object')

In [None]:
categorical_columns = ['country', 'store', 'product']
categorical_columns_order = ['country', 'store', 'product']

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns)
    ])

In [62]:
pipeline = Pipeline([
    ('date', AddDateFeatures()),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

In [63]:
pipeline.fit(X_train, y_train)

In [64]:
pipeline.named_steps['date'].transform(X_train)

Unnamed: 0,date,country,store,product,year,month,day
0,2010-01-01,Canada,Discount Stickers,Holographic Goose,2010,1,1
1,2010-01-01,Canada,Discount Stickers,Kaggle,2010,1,1
2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,2010,1,1
3,2010-01-01,Canada,Discount Stickers,Kerneler,2010,1,1
4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,2010,1,1
...,...,...,...,...,...,...,...
230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,2016,12,31
230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2016,12,31
230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2016,12,31
230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,2016,12,31
