In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import holidays

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Loading Data

In [2]:
# Read the training and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [4]:
# Preview the last rows of the training frame
train.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold
70123,70123,2020-12-31,Spain,KaggleMart,Kaggle for Kids: One Smart Goose,614
70124,70124,2020-12-31,Spain,KaggleRama,Kaggle Advanced Techniques,215
70125,70125,2020-12-31,Spain,KaggleRama,Kaggle Getting Started,158
70126,70126,2020-12-31,Spain,KaggleRama,Kaggle Recipe Book,135
70127,70127,2020-12-31,Spain,KaggleRama,Kaggle for Kids: One Smart Goose,202


In [5]:
# Preview the first rows of the test frame (it has no num_sold column)
test.head()

Unnamed: 0,row_id,date,country,store,product
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques


# EDA

In [6]:
# (rows, columns) of the training frame
train.shape

(70128, 6)

In [7]:
# (rows, columns) of the test frame
test.shape

(17520, 5)

In [8]:
# Distinct store names present in the training data
train['store'].unique()

array(['KaggleMart', 'KaggleRama'], dtype=object)

In [9]:
# Distinct countries present in the training data
train['country'].unique()

array(['Belgium', 'France', 'Germany', 'Italy', 'Poland', 'Spain'],
      dtype=object)

In [10]:
# Distinct product titles present in the training data
train['product'].unique()

array(['Kaggle Advanced Techniques', 'Kaggle Getting Started',
       'Kaggle Recipe Book', 'Kaggle for Kids: One Smart Goose'],
      dtype=object)

The data contains every combination of country, store, and product for each date. The training data covers 2017 through 2020; the test data covers 2021.

# Preprocessing

In [20]:
# Build the feature frames and the target: row_id is dropped from both
# feature frames, and num_sold is pulled out of the training features as y
y_train = train['num_sold']
X_train = train.drop(columns = ['row_id', 'num_sold'])
X_test = test.drop(columns = ['row_id'])

In [21]:
### Covid feature seems to hurt results
# Function that adds the covid feature
def covid(date):
    """Return 1 when a 'YYYY-MM-DD' date string is on or after 2020-03-19, else 0."""
    year, month, day = date.split('-')
    # Walk year -> month -> day; the first component that differs from the
    # cutoff decides the answer, so later components are only parsed on a tie.
    for component, cutoff in zip((year, month, day), (2020, 3, 19)):
        value = int(component)
        if value != cutoff:
            return int(value > cutoff)
    # Exactly 2020-03-19: the cutoff day itself counts as covid time
    return 1

In [22]:
# Flags the days after Christmas (Dec 26-31) plus New Year's Day
def christmas_new_year(date):
    """Return 1 for 'YYYY-MM-DD' dates on Dec 26-31 or on Jan 1, otherwise 0."""
    _year, month, day = date.split('-')
    month_number = int(month)
    if month_number == 12:
        # Only days strictly after the 25th count
        return int(int(day) > 25)
    if month_number == 1:
        return int(int(day) == 1)
    return 0

#### Adding Day of Week

Weekdays are encoded as integers: 0 = Monday, 1 = Tuesday, …, 6 = Sunday.

In [23]:
def year_month_day(df):
    """Replace the 'date' column with model-ready date features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column of 'YYYY-MM-DD' strings.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without 'date' and with these
        columns appended, in this order:

        * year              - calendar year as an integer
        * day_of_week_sin   - sine of the weekday angle (Monday = 0, period 7)
        * day_of_week_cos   - cosine of the weekday angle
        * day_of_month_sin  - sine of the month angle (period 12); despite the
                              name this encodes the month of the year
        * day_of_month_cos  - cosine of the month angle
        * christmas_time    - 1 for Dec 26-31 and Jan 1, else 0
    """
    # Parse the dates once instead of re-splitting the string for every feature
    dates = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    month = dates.dt.month
    day = dates.dt.day

    # Cyclical encodings keep Sunday next to Monday and December next to January
    weekday_angle = dates.dt.dayofweek * (2 * np.pi / 7)
    month_angle = month * (2 * np.pi / 12)

    # Same rule as christmas_new_year(), evaluated column-wise
    is_christmas_time = ((month == 12) & (day > 25)) | ((month == 1) & (day == 1))

    return df.drop(columns = ['date']).assign(
        year = dates.dt.year,
        day_of_week_sin = np.sin(weekday_angle),
        day_of_week_cos = np.cos(weekday_angle),
        day_of_month_sin = np.sin(month_angle),
        day_of_month_cos = np.cos(month_angle),
        #covid = df['date'].map(covid),
        christmas_time = is_christmas_time.astype(int),
    )

In [24]:
# Exporting to csv to explore
def process_and_export(df,name):
    """Add date, covid and holiday columns to a copy of df and write it to '<name>.csv'.

    Unlike year_month_day, no columns are dropped: the exported file keeps the
    raw 'date' next to every derived column so the features can be explored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column of 'YYYY-MM-DD' strings and a 'country'
        column whose values are class names exported by the `holidays`
        package (for example 'Belgium').
    name : str
        Output path without the '.csv' extension.

    Returns
    -------
    None. The input frame is not modified.

    Raises
    ------
    AttributeError
        If a country value is not a name exported by the `holidays` package.
    """
    df = df.copy()

    # Split each date once; year and day stay strings, month becomes an int
    date_parts = df['date'].str.split('-', expand = True)
    df['year'] = date_parts[0]
    df['month'] = date_parts[1].astype(int)
    df['day_num'] = date_parts[2]
    df['day_name'] = pd.to_datetime(df['date'], format = '%Y-%m-%d').dt.dayofweek

    df['day_of_week_sin'] = np.sin(df['day_name'] * (2 * np.pi / 7))
    df['day_of_week_cos'] = np.cos(df['day_name'] * (2 * np.pi / 7))
    df['day_of_month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))
    df['day_of_month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))
    df['covid'] = df['date'].map(covid)

    # The original cell called a holiday(country, date) helper that is not
    # defined anywhere in this notebook. The flag is computed here from the
    # `holidays` package instead: 1 when the date is a public holiday in the
    # row's country, else 0. One calendar is built per country and reused.
    calendars = {country: getattr(holidays, country)() for country in df['country'].unique()}
    df['holiday'] = [int(date in calendars[country])
                     for country, date in zip(df['country'], df['date'])]

    #df = df[df.columns.drop(['date', 'day_name', 'month','day_num'])]
    df.to_csv(name + '.csv', index = False)

#X_train['y'] = y_train
#process_and_export(X_train, 'training_for_eda')

In [25]:
# Run the same date feature engineering on both splits
X_train, X_test = (year_month_day(frame) for frame in (X_train, X_test))

In [26]:
# Display the engineered training features (the notebook elides the middle rows)
X_train

Unnamed: 0,country,store,product,year,day_of_week_sin,day_of_week_cos,day_of_month_sin,day_of_month_cos,christmas_time
0,Belgium,KaggleMart,Kaggle Advanced Techniques,2017,-0.781831,0.623490,5.000000e-01,0.866025,1
1,Belgium,KaggleMart,Kaggle Getting Started,2017,-0.781831,0.623490,5.000000e-01,0.866025,1
2,Belgium,KaggleMart,Kaggle Recipe Book,2017,-0.781831,0.623490,5.000000e-01,0.866025,1
3,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,2017,-0.781831,0.623490,5.000000e-01,0.866025,1
4,Belgium,KaggleRama,Kaggle Advanced Techniques,2017,-0.781831,0.623490,5.000000e-01,0.866025,1
...,...,...,...,...,...,...,...,...,...
70123,Spain,KaggleMart,Kaggle for Kids: One Smart Goose,2020,0.433884,-0.900969,-2.449294e-16,1.000000,1
70124,Spain,KaggleRama,Kaggle Advanced Techniques,2020,0.433884,-0.900969,-2.449294e-16,1.000000,1
70125,Spain,KaggleRama,Kaggle Getting Started,2020,0.433884,-0.900969,-2.449294e-16,1.000000,1
70126,Spain,KaggleRama,Kaggle Recipe Book,2020,0.433884,-0.900969,-2.449294e-16,1.000000,1


In [28]:
# Column groups: string-valued columns to one-hot encode vs. engineered date features
categoricals = ['country', 'store', 'product']
numerics = [
    'year',
    'day_of_week_sin',
    'day_of_week_cos',
    'day_of_month_sin',
    'day_of_month_cos',
    'christmas_time',
]

# Slice each split into its categorical and numeric parts
X_train_cat, X_train_num = X_train[categoricals], X_train[numerics]
X_test_cat, X_test_num = X_test[categoricals], X_test[numerics]

In [29]:
# One hot encoding the categorical columns
enc = OneHotEncoder()

# Fit the category-to-column mapping on the training data only, then reuse
# that fitted mapping for the test data. Re-fitting on the test set would
# derive the columns from whatever categories the test set contains, so the
# two matrices would not be guaranteed to share a layout.
X_train_cat = enc.fit_transform(X_train_cat)
X_test_cat = enc.transform(X_test_cat)

In [30]:
# Densify the one-hot matrices and join them column-wise with the numeric features.
# - The one-hot frames reuse the numeric frames' index so rows are paired
#   explicitly rather than relying on both sides having a default index.
# - The integer column labels are converted to strings, because recent
#   scikit-learn versions reject frames whose column names mix int and str.
X_train = pd.concat(
    [pd.DataFrame(X_train_cat.toarray(), index = X_train_num.index).rename(columns = str),
     X_train_num],
    axis = 1,
)
X_test = pd.concat(
    [pd.DataFrame(X_test_cat.toarray(), index = X_test_num.index).rename(columns = str),
     X_test_num],
    axis = 1,
)

# Model

In [31]:
# Defining Classifier
# A fixed random_state makes the forest, and therefore the submission,
# reproducible from one run to the next.
# NOTE(review): num_sold is a count, so every distinct value is treated as its
# own class here; a regressor may be a better fit - confirm before changing.
clf = RandomForestClassifier(random_state = 42)

In [32]:
# Fitting and Predicting
# Train the forest on the combined one-hot + numeric training features
clf.fit(X_train, y_train)

RandomForestClassifier()

In [33]:
# Predict num_sold for every row of the test features
num_sold = clf.predict(X_test)

In [34]:
# Take row_id from the raw test frame (it was dropped from X_test) for the submission file
row_id = test['row_id']

# Submission File

In [35]:
def write_submission(row_id, num_sold, path = 'submission.csv'):
    """Write the predictions to a CSV file with one column per input Series.

    Parameters
    ----------
    row_id : pandas.Series
        Row identifiers; the Series name becomes the first column header.
    num_sold : pandas.Series
        Predictions; the Series name becomes the second column header.
        Rows are paired with row_id by index label.
    path : str, optional
        Output file path. Defaults to 'submission.csv' in the working directory.

    Raises
    ------
    ValueError
        If the two Series differ in length, which would otherwise silently
        produce rows with missing values.
    """
    if len(row_id) != len(num_sold):
        raise ValueError(
            'row_id and num_sold must have the same length, got '
            + str(len(row_id)) + ' and ' + str(len(num_sold))
        )
    ans = pd.concat([row_id, num_sold], axis = 1)
    ans.to_csv(path, index = False)

In [36]:
# Wrap the prediction array in a named Series so the CSV column is headed 'num_sold'
write_submission(row_id, pd.Series(num_sold, name = 'num_sold'))