# Notebook 06: Classification - Predicting Sales for New Models

This notebook develops a classification model and evaluates predictions for new model sales using attributes for watches in datasets cleaned in [Notebook 1: Data Cleaning](01_data_cleaning.ipynb).

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.metrics import ConfusionMatrixDisplay, recall_score, accuracy_score, balanced_accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

# custom function imports
from custom_functions import *
from regression_custom_functions import *

In [51]:
# Import data
df_all_models = pd.read_csv('../data/cleaned_datasets/df_models.csv')

time_now = datetime.datetime.now()
df_models = df_all_models.dropna(subset=time_now.strftime(f"%Y-%m"))

# Calculate overall sales
df_models = df_models.merge(pd.DataFrame(return_date_col(df_models.set_index('style_id')).T.sum(),columns=['overall_sales']),left_on = 'style_id',right_index=True)

# Create new dataframe with yearly sales
yearly_sales = return_sales(df_models,'style_id').resample('Y',kind='period').sum().T

# Calculate the yearly average sales
avg_yearly_sales = yearly_sales[yearly_sales > 0].mean()

# Calculate the total normalized yearly sales (note this number is the total sales divided by the yearly average)
total_norm_sales = (yearly_sales/avg_yearly_sales).T.sum()

# Add this to the dataframe
df_models = df_models.merge(pd.DataFrame(total_norm_sales,columns=['total_norm_sales']),left_on = 'style_id',right_index=True)

In [57]:
# Specify the last month of sales
last_month_num = 4 # 4 = april

norm_sales = []
for i,x in enumerate(avg_yearly_sales.index):
    # Calculate number of years of sales for all except the last year
    years_of_sales = sum([1 for sales in yearly_sales.loc[x,yearly_sales.iloc[:,0:-1].columns] if sales > 0])
    if yearly_sales.iloc[i,-1] > 0:
        years_of_sales += last_month_num/12
    if years_of_sales == 0:
        norm_sales.append(0)
    else:
        # Use 2019 yearly average to convert normalized sales to yearly sales
        norm_sales.append(avg_yearly_sales['2019-12-31']*df_models.loc[i,'total_norm_sales']/years_of_sales)
yearly_sales['norm_sales'] = norm_sales

KeyError: Period('2015', 'A-DEC')

In [None]:
norm_sales = []
for i,x in enumerate(yearly_sales.index):
    years_of_sales = sum([1 for sales in yearly_sales.loc[x,yearly_sales.iloc[:,0:-1].columns] if sales > 0])
    if yearly_sales.iloc[i,-1] > 0:
        years_of_sales += 4/12
    if years_of_sales == 0:
        norm_sales.append(0)
    else:
        norm_sales.append(df_models.loc[i,'overall_sales']/years_of_sales)
yearly_sales['norm_sales'] = norm_sales

plt.title('Normalized Sales')
plt.hist(norm_sales)
plt.xlabel('Sales/years sold')
plt.ylabel('Num models');

df_models.merge(yearly_sales[['norm_sales']],left_on = 'style_id',right_index=True)

In [4]:
df_models

Unnamed: 0,style_id,max_cart_qty,availability,qty_on_hand,qty_on_order,qty_total_inv,qty_sold_last_6m,qty_avg/mo,months_of_supply,xyz_cost,...,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12,overall_sales,norm_sales
0,am4141,10000,,,,,,,,45.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,736.0,147.200000
1,am4183,10000,,,,,,,,45.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,136.250000
2,am4481,no max quantity reported,,,,,,,,52.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489.0,122.250000
3,am4482,no max quantity reported,,,,,,,,57.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,303.0,101.000000
4,am4483,no max quantity reported,,,,,,,,59.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,302.0,100.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1719,me3214,100,available,17.0,0.0,17.0,3.0,0.5,34.0,73.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6.000000
1720,me3217,100,,,,,,,,98.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1721,me3218,100,,,,,,,,98.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1722,me3219,100,,,,,,,,91.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
