# A real-world example of predicting sales volume with Random Forest Regression in a Jupyter Notebook

* Ref: [https://medium.com/@oemer.aslantas/forecasting-sales-units-with-random-forest-regression-on-python-a75d92910b46](https://medium.com/@oemer.aslantas/forecasting-sales-units-with-random-forest-regression-on-python-a75d92910b46)
* Data: 沒有提供 dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_squared_error
from pandas.tseries.offsets import *

In [2]:
from german_holidays import get_german_holiday_calendar

ModuleNotFoundError: ignored

## 1. Load data

In [None]:
# Load the historical sales sheet and the sheet that will receive predictions.
# The former `squeeze=True` argument is gone: it was deprecated in pandas 1.4
# and removed in 2.0, and both objects are used as DataFrames (column access
# by name) in the cells below, so squeezing was never wanted.
df = pd.read_excel('SalesData.xlsx', parse_dates=True)
df_test = pd.read_excel('prediction_empty.xlsx', parse_dates=True)

In [None]:
# Preview the first rows of the sales data as loaded from the spreadsheet.
df.head()

## 2. Data preprocessing

In [None]:
# Give both frames a 'Date' column holding plain `datetime.date` values.
# The sales sheet keeps its timestamp in 'From'; the prediction sheet
# already has a 'Date' column, which is overwritten in place.
for frame, source_col in ((df, 'From'), (df_test, 'Date')):
    frame['Date'] = pd.to_datetime(frame[source_col]).dt.date

In [None]:
# Check the frame after the 'Date' column has been added.
df.head()

In [None]:
# Collapse to one row per date, summing the units sold on that date.
# The column name is 'Sold Units' (as used by the plotting and
# outlier-removal cells below); the earlier 'Solid Units' spelling was a typo
# that left those cells without the column they look up.
df = df.groupby('Date')['Sold Units'].sum().reset_index()

In [None]:
# Inspect the per-date aggregate.
df.head()

In [None]:
# Build the list of holiday dates from the German holiday calendar helper
# (called with the 'NW' code) as plain `datetime.date` objects so they compare
# equal to the values in the 'Date' columns.
cal_cls = get_german_holiday_calendar('NW')
cal = cal_cls()
ger_holidays = [d.date() for d in cal.holidays()]

# Flag every row whose date is a holiday.
df['Holiday'] = df['Date'].isin(ger_holidays)
df_test['Holiday'] = df_test['Date'].isin(ger_holidays)

# Drop the holiday rows. `~mask` replaces the `== False` comparison, and
# `.copy()` detaches the result from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df['Holiday']].copy()
df_test = df_test[~df_test['Holiday']].copy()

* 因為只有 Date 和 Sold Units 兩個欄位，而 Sold Units 是要預測的欄位，所以必須要自己建立新的 features
  * 從 Date 欄位來建立新的 features

In [None]:
# Derive calendar features from 'Date' for both the training frame and the
# prediction frame. The date column is parsed once per frame instead of once
# per feature.
for frame in (df, df_test):
    dates = pd.to_datetime(frame['Date'])
    frame['Year'] = dates.dt.year
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `dt.isocalendar().week` is the documented replacement. It returns the
    # nullable UInt32 dtype, so cast back to a plain integer column
    # (assumes no missing dates -- TODO confirm for the prediction sheet).
    frame['Week'] = dates.dt.isocalendar().week.astype(int)
    frame['Day'] = dates.dt.day
    frame['WeekDay'] = dates.dt.dayofweek

## 3. Data visualization

In [None]:
# Box plot of the per-date sold units on a 10x5 figure.
figure_settings = {'figure.figsize': (10, 5)}
sns.set(rc=figure_settings)
sns.boxplot(x=df['Sold Units'])

In [None]:
# Draw the same box plot with matplotlib, which returns its artists, then
# read the y-data of the whisker lines to see where the whiskers end.
# Fixes two syntax errors: the unterminated 'Sold Units' string and `from`
# used in place of `for` in the comprehension.
B = plt.boxplot(df['Sold Units'])
[item.get_ydata() for item in B['whiskers']]

* 所以可以藉由 whiskers 的結果來找出 outlier 並且移除

In [None]:
# Removing outliers: keep only rows strictly between the two hard-coded
# bounds (presumably the whisker ends read off the box plot -- verify if the
# data changes).
sold = df['Sold Units']
df = df[(sold > 681) & (sold < 21357)]

In [None]:
# Weekly trends: sold units against week number.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them positionally.
sns.lineplot(x=df['Week'], y=df['Sold Units'])

In [None]:
# Yearly trends: sold units against year.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them positionally.
sns.lineplot(x=df['Year'], y=df['Sold Units'])

## 4. Compare models

* 用各種 models 來看用哪個 model 會比較好

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the holiday flag, the raw date and the
# target. The target column is 'Sold Units' (the spelling used by the
# plotting and outlier-removal cells); 'Solid Units' was a typo.
X = df.drop(['Holiday', 'Date', 'Sold Units'], axis=1)
y = df['Sold Units']

# Hold out 40% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [None]:
# Comparing algorithms
def scores(i):
    """Score one estimator class on the notebook's train/test split.

    ``i`` is called with no arguments to build the model. The model is fitted
    on the notebook-level ``X_train``/``y_train`` and its predictions for
    ``X_test`` are compared with ``y_test``.

    Returns the value computed by ``r2_score``.
    """
    estimator = i()
    estimator.fit(X_train, y_train)
    return r2_score(y_test, estimator.predict(X_test))

In [None]:
# Estimator classes to compare. `scores` instantiates its argument by calling
# it, so it must receive the classes themselves; the previous list of name
# strings raised "TypeError: 'str' object is not callable".
models = [
    LinearRegression,
    KNeighborsRegressor,
    RandomForestRegressor,
    Lasso,
    ElasticNet,
    DecisionTreeRegressor,
    GradientBoostingRegressor,
]

# `algs` stays a list of name strings and `s` a list of scores, in the same
# order, because the comparison table below is built from both.
algs = [model_cls.__name__ for model_cls in models]
s = [scores(model_cls) for model_cls in models]

In [None]:
# Checking the score: one row per model, best R2 first.
comparison = {'Model': algs, 'R2_score': s}
df_compare = pd.DataFrame(comparison)
df_compare.sort_values('R2_score', ascending=False)

## 5. RandomForest model

In [None]:
# Fit the random forest on the training split and predict the hold-out rows.
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in
# 1.3. For a regressor it meant "use all features", which is spelled 1.0
# (also the current default), so the model itself is unchanged.
rf_model = RandomForestRegressor(oob_score=True,
                                 n_jobs=1,
                                 random_state=7,
                                 max_features=1.0,
                                 min_samples_leaf=4)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [None]:
# Show the container type of the predictions and the predicted values.
print(type(y_pred), y_pred)

In [None]:
# metrics: R2 of the random-forest predictions against the hold-out targets
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

In [None]:
def mean_percentage_error(y_true, y_pred):
    """Return the mean percentage error (MPE) of a forecast, in percent.

    MPE is the average of ``(actual - forecast) / actual``; the error is
    expressed relative to the *actual* value. The previous implementation
    divided by the forecast instead, which is not the MPE and gives a
    different number whenever the forecast is off.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Signed mean percentage error. Positive means the forecast is too low
        on average, negative means too high. Entries where ``y_true`` is 0
        yield inf/nan under NumPy's division rules.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) / y_true) * 100

In [None]:
# Mean percentage error of the random-forest predictions on the hold-out split.
mean_percentage_error(y_test, y_pred)

In [None]:
# Predict sales for the prediction sheet with the fitted random forest and
# store the rounded result in its 'Sales' column.
# - The fitted estimator is `rf_model`; `model` exists only as a local inside
#   `scores`, so `model.predict` raised NameError here.
# - Select exactly the columns the model was trained on. This keeps 'Holiday'
#   (and the empty 'Sales') out of the feature matrix and leaves 'Date' in
#   `df_test`, which the result cell below needs.
y_pred_2 = rf_model.predict(df_test[X_train.columns])
df_test['Sales'] = y_pred_2.round(0)

In [None]:
# Inspect the prediction frame after 'Sales' has been filled in.
df_test.head()

In [None]:
# Keep only the date and the predicted sales for the export. Selecting
# several columns takes a single list of labels; the previous key
# `[['Date'], 'Sales']` was a tuple containing a list and is not a valid key.
df_result = df_test[['Date', 'Sales']]

In [None]:
# Preview the table that will be exported.
df_result.head()

In [None]:
# Write the result table to 'final_result.csv' in the working directory
# (pandas defaults apply, so the row index is written as the first column).
df_result.to_csv('final_result.csv')