In [63]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas.plotting import autocorrelation_plot
from sklearn.preprocessing import StandardScaler
from prophet import Prophet
from datetime import datetime

In [64]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

sub_file = pd.read_csv('data/sample_submission.csv')


In [65]:
def transform_data(data):
    if 'num_sold' in data.columns: 
        data.fillna({'num_sold':0}, inplace=True)

    data['month'] = pd.to_datetime(data['date']).dt.month
    data['year'] = pd.to_datetime(data['date']).dt.year
    data['season'] = data['month'].map({1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Autumn', 10: 'Autumn', 11: 'Autumn', 12: 'Winter'})

    # change season to categorical
    data['season'] = data['season'].astype('category')
    data['season'] = data['season'].cat.codes

    # change product to categorical
    data['product'] = data['product'].astype('category')
    data['product'] = data['product'].cat.codes

    # change country to categorical
    data['country'] = data['country'].astype('category')
    data['country'] = data['country'].cat.codes

    # change store to categorical
    data['store'] = data['store'].astype('category')
    data['store'] = data['store'].cat.codes

    # change store to categorical
    data['year'] = data['year'].astype('category')
    data['year'] = data['year'].cat.codes

    data[['country', 'store', 'product',  'month', 'year', 'season']] = data[['country', 'store', 'product',  'month', 'year', 'season']].astype(int)

    return data

train = transform_data(train)
test = transform_data(test)

In [66]:
train

Unnamed: 0,id,date,country,store,product,num_sold,month,year,season
0,0,2010-01-01,0,0,0,0.0,1,0,3
1,1,2010-01-01,0,0,1,973.0,1,0,3
2,2,2010-01-01,0,0,2,906.0,1,0,3
3,3,2010-01-01,0,0,3,423.0,1,0,3
4,4,2010-01-01,0,0,4,491.0,1,0,3
...,...,...,...,...,...,...,...,...,...
230125,230125,2016-12-31,5,1,0,466.0,12,6,3
230126,230126,2016-12-31,5,1,1,2907.0,12,6,3
230127,230127,2016-12-31,5,1,2,2299.0,12,6,3
230128,230128,2016-12-31,5,1,3,1242.0,12,6,3


# Scaling

In [67]:
# log scale train and test
train_log = train.copy()
test_log = test.copy()

train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']] = train_log[['country', 'store', 'product',  'month', 'year', 'season', 'num_sold']] + 1
test_log[['country', 'store', 'product',  'month', 'year', 'season']] = test_log[['country', 'store', 'product',  'month', 'year', 'season']] + 1

train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']] = np.log(train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']])
test_log[['country', 'store', 'product',  'month', 'year', 'season']] = np.log(test_log[['country', 'store', 'product',  'month', 'year', 'season']])


In [68]:
# start scaling 
scaler = StandardScaler()

train_log[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.fit_transform(train_log[['country', 'store', 'product',  'month', 'year', 'season']])
test_log[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.transform(test_log[['country', 'store', 'product',  'month', 'year', 'season']])

train[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.fit_transform(train[['country', 'store', 'product',  'month', 'year', 'season']])
test[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.transform(test[['country', 'store', 'product',  'month', 'year', 'season']])

# Prophet model

In [69]:
# rename date and target column 
train_log = train_log.rename(columns={'date': 'ds', 'num_sold': 'y'})
test_log = test_log.rename(columns={'date': 'ds', 'num_sold': 'y'})

train = train.rename(columns={'date': 'ds', 'num_sold': 'y'})
test = test.rename(columns={'date': 'ds', 'num_sold': 'y'})

In [70]:
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [71]:
# Initialize and fit the Prophet model
model = Prophet()

for each in ['country', 'store', 'product',  'month', 'year', 'season']:
    model.add_regressor(each)

model.fit(train)



11:53:09 - cmdstanpy - INFO - Chain [1] start processing
11:53:29 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2880a8bf0>

In [72]:

# predict y given a df of x values
test_pred = model.predict(test)
test_pred

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,country,...,year,year_lower,year_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-01-01,775.063851,-207.989887,1530.479869,775.063851,775.063851,-122.136828,-122.136828,-122.136828,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,652.927023
1,2017-01-01,775.063851,-178.881652,1576.192522,775.063851,775.063851,-91.495778,-91.495778,-91.495778,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,683.568072
2,2017-01-01,775.063851,-129.568713,1559.728066,775.063851,775.063851,-60.854729,-60.854729,-60.854729,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,714.209122
3,2017-01-01,775.063851,-126.828562,1618.499773,775.063851,775.063851,-30.213679,-30.213679,-30.213679,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,744.850172
4,2017-01-01,775.063851,-123.431293,1636.022053,775.063851,775.063851,0.427371,0.427371,0.427371,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,775.491221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98545,2019-12-31,829.942270,170.182935,2040.892710,413.006966,1249.362726,211.032918,211.032918,211.032918,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1040.975188
98546,2019-12-31,829.942270,181.038838,2022.020384,413.000536,1249.376932,241.673967,241.673967,241.673967,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1071.616238
98547,2019-12-31,829.942270,177.204362,2079.265893,412.994107,1249.391138,272.315017,272.315017,272.315017,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1102.257287
98548,2019-12-31,829.942270,165.588790,2053.544006,412.987677,1249.405344,302.956067,302.956067,302.956067,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1132.898337


In [73]:
# non log submission 1 
sub_file['num_sold'] = test_pred['yhat']
# add timestamp to file name 
file_name = datetime.now().strftime('submissions/submission_%Y%m%d_%H%M.csv')
sub_file.to_csv(file_name, index=False)