In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import category_encoders as ce

from feature_engine.creation import CyclicalTransformer
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import PowerTransformer

In [None]:
# CSV file that is loaded into `data` in the next cell.
DATA_PATH = 'data/data_after_EDA.csv'
# Bounds of the daily date range built inside fill_zeroes() (passed to pd.date_range).
START_DATE = '01/25/2019'
END_DATE = '04/24/2022'

# Show up to 60 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 60) 

In [None]:
# Load the comma-separated export (',' is the default separator) and parse
# doc_date straight into datetimes.
data = pd.read_csv(
    DATA_PATH,
    parse_dates=['doc_date'],
    low_memory=False,
)

In [None]:
data.head()

In [None]:
data.isna().sum()

# 1. Change non-numeric values to numbers

Machine learning models usually work only with numeric values (integers or floats) - that's why we need to change other formats to numbers. 

In [None]:
data.dtypes

Let's start by breaking the date down into five separate columns: day of the month, day of the week, week, month and year. We keep the original datetime column as well, because it gives easier access to the full date than rebuilding it from the separate columns.

In [None]:
# Split the timestamp into calendar features with the vectorised `.dt`
# accessor instead of a Python-level loop over every row.
doc_dates = data['doc_date'].dt

data['doc_day'] = doc_dates.day
data['doc_month'] = doc_dates.month
data['doc_year'] = doc_dates.year
data['doc_weekday'] = doc_dates.weekday  # Monday == 0 ... Sunday == 6

# ISO-8601 week number (same value as Timestamp.week). isocalendar() returns a
# nullable UInt32 column, so cast it to a plain numpy dtype: int64 normally,
# float64 (NaN) when missing dates are present, mirroring the other accessors.
# NOTE: the ISO week can belong to a different year than doc_year around New Year.
iso_week = doc_dates.isocalendar().week
data['doc_week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

**The next part is to find columns that already have a natural numeric representation - e.g. product_name_parameterize is not a necessary column, as we have product_id (numeric product identification)**

In [None]:
# Strip the `catalog_`, `setting_` and `shop_` prefixes so that every id column
# is named plainly after the thing it identifies.
id_column_renames = {
    'catalog_category_id': 'category_id',
    'catalog_segment_id': 'segment_id',
    'catalog_brand_id': 'brand_id',
    'setting_currency_id': 'currency_id',
    'shop_basket_id': 'basket_id',
}
data.rename(columns=id_column_renames, inplace=True)

In [None]:
def leave_only_id_column(df: pd.DataFrame, id_column: str, other_columns: list, inplace: bool = False) -> 'pd.DataFrame | None':
    """
    Check whether id_column is a one-to-one representation of other_columns and,
    if it is, drop other_columns so that only the id remains.

    The number of unique ids is compared with the number of unique
    (other_columns..., id_column) combinations; any difference means at least
    one id is paired with more than one combination of the other columns.
    Both numbers are computed on `df` itself.

    Args
        df - pandas DataFrame containing desired columns
        id_column - main column containing identificator, this column will be the only one remaining
        other_columns - list of other columns, those will be compared and possibly dropped
        inplace - If False, return a copy. Otherwise, do operation inplace and return None.
    Returns
        pd.DataFrame - DataFrame with other_columns removed, or None if inplace
        is True or if mismatches were found (nothing is dropped in that case)
    """
    other_columns = list(other_columns)

    # Count on the frame that was passed in, not on a notebook-level global.
    id_col_len = len(df[id_column].unique())
    unique_combinations = len(df[other_columns + [id_column]].drop_duplicates().index)

    # Trailing ', ' is kept so the printed report stays the same as before.
    other_cols_string = ''.join(name + ', ' for name in other_columns)

    print(f"{id_col_len} - Unique {id_column} amount.")
    print(f"{unique_combinations} - Amount of unique combinations of {id_column} and {other_cols_string}")

    missmatches_amount = abs(id_col_len - unique_combinations)
    print(f"{missmatches_amount} - How many missmatches between {id_column} and other columns.")

    if missmatches_amount != 0:
        print('There were missmatches, not dropping any columns.')
        return None

    if inplace:
        df.drop(labels=other_columns, inplace=True, axis=1)
        return None
    return df.drop(labels=other_columns, axis=1)

Because each product id represents one product correctly, we can drop product name as well as parameterized product name. \
We can drop product_code as well for the same reason - product id represents same products as product_code but in different encodings.

In [None]:
# Drop product_name and product_code if product_id identifies them without mismatches.
leave_only_id_column(data, 'product_id', ['product_name', 'product_code'], inplace=True)
print('\n')

# Same check for the parameterized name; if mismatches are reported the column is kept.
leave_only_id_column(data, 'product_id', ['product_name_parameterize'], inplace=True)

In [None]:
# Which product ids are paired with more than one parameterized name (top 10)?
name_pairs = data[['product_id' , 'product_name_parameterize']].drop_duplicates()
print(name_pairs['product_id'].value_counts().head(10))

# Show the distinct parameterized names recorded for one example product.
data.loc[data['product_id'] == 147573, ['product_name_parameterize']].drop_duplicates()

In [None]:
# Remove the parameterized product name by hand (the check above did not drop it).
data.drop(columns='product_name_parameterize', inplace=True)

The check for product_name_parameterize reports 7 mismatches against product_id. If we look deeper into it, we can see that this is only because `-set` is appended to the end of some parameterized names. This is a deprecated way of marking sets, since there is now a column (item_type) that differentiates between sets and standard items. That's why we can drop product_name_parameterize as well.

Similarly to products, there are many redundant columns in the data (columns that are fully represented by another column); we can take care of all of them. 

In [None]:
# Keep only category_id if it identifies both category name columns without mismatches.
leave_only_id_column(data, 'category_id', ['category', 'category_name_parameterized'], inplace=True)

In [None]:
# Keep only brand_id if it identifies both brand name columns without mismatches.
leave_only_id_column(data, 'brand_id', ['brand_name', 'brand_parameterized'], inplace=True)

In [None]:
# Keep only currency_id if it identifies original_currency_code without mismatches.
leave_only_id_column(data, 'currency_id', ['original_currency_code'], inplace=True)

In [None]:
# Keep only segment_id if it identifies both segment name columns without mismatches.
leave_only_id_column(data, 'segment_id', ['segment_parameterized', 'segment_name'], inplace=True)

In [None]:
# tree_path plays the role of the id here: drop category_full_name_path if tree_path identifies it without mismatches.
leave_only_id_column(data, 'tree_path', ['category_full_name_path'], inplace=True)

In [None]:
# tree_path is kept only in its split form: categories_descendant_ids and categories_ancestor_ids.
# The call below is used just for its printed mismatch report; inplace is left at False and
# the returned frame is not stored, so `data` is not changed by it.
leave_only_id_column(data, 'tree_path', ['categories_descendant_ids', 'categories_ancestor_ids'])

# NOTE(review): tree_path is dropped unconditionally, even if mismatches were reported above — confirm this is intended.
data.drop(labels='tree_path', axis=1, inplace=True)

**Coding remaining string and boolean values to numerics** \
With the usage of replace (booleans) and OrdinalEncoder (strings) we will change values to their representation in numbers.

In [None]:
# Turn boolean values into 1/0 across the whole frame, in place.
data.replace([True, False], [1, 0], inplace=True)

In [None]:
# String-valued columns that are replaced by integer codes.
columns_to_change = ['bill_country',
 'basket_type',
 'item_type',
 'product_status',
 'category_status']

ce_ordinal = ce.OrdinalEncoder(cols=columns_to_change)
data = ce_ordinal.fit_transform(data)

# Report the mapping learned by the fit above. The encoder must NOT be fitted
# again here: `data` already holds the integer codes, so a second fit would
# replace the category -> code mapping with a code -> code one.
for mapped in ce_ordinal.mapping:
    print(f"Column {mapped['col']} has mapping of:")
    print(f"{mapped['mapping']} \n\n")

Instead of saving full path to category we just want to know how deep given category is and how many subcategories it has. We will convert arrays of ancestors/ descendants into numbers representing amounts of ids in given lists.

In [None]:
# Replace each comma-separated id list by the number of entries it holds.
# NOTE(review): a value without any comma counts as 1, so an empty list would
# also be counted as 1 — confirm against the actual format of these columns.
data['ancestor_count'] = data['categories_ancestor_ids'].map(lambda ids: len(ids.split(',')))
data['descendant_count'] = data['categories_descendant_ids'].map(lambda ids: len(ids.split(',')))

# Share of rows per count value, one horizontal bar chart per new column.
for count_column, plot_title in (
    ('ancestor_count', 'Ancestor count share'),
    ('descendant_count', 'Descendant count share'),
):
    data[count_column].value_counts(normalize=True).sort_index().plot(kind='barh', title=plot_title)
    plt.show()

# The raw id lists are no longer needed once they are counted.
data.drop(labels=['categories_ancestor_ids', 'categories_descendant_ids'], axis=1, inplace=True)

In [None]:
data.columns

# 2. Dealing with outliers

In this part we want to delete outliers, as they might negatively influence the machine learning algorithm. We therefore delete at least the first iteration of outliers. About 5% of the values are outliers in the first iteration, which we consider a reasonable price to pay for cleaner and more useful data.\
We consider values further than *3 standard deviations ($\sigma$) from the mean ($\mu$)* to be outliers in our preprocessing.

In [None]:
 def delete_outliers(df : pd.DataFrame) -> pd.DataFrame:
     """
     Remove every row that holds an outlier in at least one column.

     Columns are processed one after another, and each column's mean and
     standard deviation are computed on the rows that survived the previous
     columns. A value is an outlier when it lies more than three standard
     deviations away from the mean; missing values are never flagged.

     Args
         df - dataframe containing columns to check for outliers
     Returns
         DataFrame without outlier values
     """
     for column_name in df.columns:
         values = df[column_name]

         # Band of accepted values: mean +- 3 * std
         center, spread = np.mean(values), np.std(values)
         lower, upper = center - 3 * spread, center + 3 * spread

         # Comparisons with NaN are False, so missing values stay in the frame
         is_outlier = (values < lower) | (values > upper)
         outlier_total = int(is_outlier.sum())

         # Tell the user how many rows are removed because of this column
         if outlier_total > 0:
             print(f'Identified outliers: {outlier_total} in column: {column_name}')

         df = df[(~is_outlier).to_numpy()]

     return df

In [None]:
# Columns that are screened for outliers, one at a time, with delete_outliers().
check_outliers_columns = ['basket_total_price_with_vat', 
                          'count_basket_items', 
                          'basket_count_products', 
                          'item_quantity', 
                          'item_unit_price_with_vat', 
                          'item_total_discount_with_vat', 
                          'reviews_count', 
                          'reviews_average_score_price', 
                          'reviews_average_score_quality', 
                          'reviews_average_score_properties', 
                          'reviews_average_score_overall',
                          'reviews_average_score', 
                          'product_purchase_price',
                          'eshop_stock_count', 
                          'ancestor_count', 
                          'descendant_count']

In [None]:
delete_outliers(data[['basket_total_price_with_vat']])

In [None]:
# Screen one column at a time: delete_outliers() returns only the rows that pass, so
# assigning the result back leaves NaN (through index alignment) in the rows it removed,
# and dropna() then deletes those rows from `data`.
# NOTE(review): dropna() is called without `subset`, so rows holding NaN in any other
# column are removed as well — confirm this is intended.
for col in check_outliers_columns:
    data[col] = delete_outliers(data[[col]])
    data.dropna(inplace=True)

# 3. Normalization

In [None]:
# Columns to rescale into the [0, 1] range with MinMaxScaler.
# TODO: list the columns to normalise; the list is empty until they are chosen.
min_max_columns = []

scaler = MinMaxScaler()
if min_max_columns:
    # Reuse data's index so the scaled frame stays row-aligned with `data`
    # (rows were removed during outlier handling, so the index has gaps).
    scaled = pd.DataFrame(
        scaler.fit_transform(data[min_max_columns]),
        columns=min_max_columns,
        index=data.index,
    )
else:
    # Nothing selected yet: keep an empty, row-aligned frame instead of failing.
    scaled = pd.DataFrame(index=data.index)

In [None]:
data

In [None]:
# Columns left out of the to-be-transformed list: the date parts, the raw date
# and the is_* columns.
excluded_from_transform = [
    'doc_day',
    'doc_week',
    'doc_year',
    'doc_weekday',
    'doc_month',
    'doc_date',
    'is_in_stock',
    'is_ended',
    'is_new',
    'is_fifo',
]

TODO_TRANSFORM = list(data.columns)
for column_name in excluded_from_transform:
    # list.remove raises ValueError if an expected column is missing
    TODO_TRANSFORM.remove(column_name)
TODO_TRANSFORM

In [None]:
# Cyclical encoding of the periodic date parts; drop_original=True asks the transformer
# to leave the source columns out of its output.
cyclical = CyclicalTransformer(variables=['doc_day', 'doc_week', 'doc_weekday', 'doc_month'], drop_original=True)
# NOTE(review): the transformed frame is only displayed, not assigned — presumably `data`
# itself is left unchanged (later cells still read data.doc_week). Confirm this is intended.
cyclical.fit_transform(data)

# 4. Clustering

In [None]:
# K-MEANS CLUSTERING
# Importing Modules
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering

In [None]:
list(data.columns)

In [None]:
# Columns used as clustering features. Identical rows are collapsed so that
# each distinct combination of these values appears only once.
clustering_columns = [
    'item_type',
    'product_id',
    'category_id',
    'brand_id',
    'product_status',
    'reviews_count',
    'reviews_average_score_price',
    'reviews_average_score_quality',
    'reviews_average_score_properties',
    'reviews_average_score_overall',
    'reviews_average_score',
    'is_in_stock',
    'is_ended',
    'is_new',
    'product_purchase_price',
    'eshop_stock_count',
    'is_fifo',
    'category_status',
    'segment_id',
    'default_warranty_period',
    'ancestor_count',
    'descendant_count',
]

kmeansable = data[clustering_columns].drop_duplicates()

At first we will use k-means clustering, as it can be run on a large dataset more easily than agglomerative clustering; agglomerative clustering is then applied inside a single k-means cluster.

In [None]:
# Declaring Model
# random_state pins the centroid initialisation so that the cluster labels --
# which later cells reference by number -- are the same after every
# Restart & Run All.
model = KMeans(n_clusters=101, random_state=1)

# Fitting Model
model.fit(kmeansable)

# Prediction on the entire data
all_predictions = model.predict(kmeansable)

# assign() builds a new frame instead of writing a column into a frame derived
# from `data`, which avoids pandas' SettingWithCopyWarning.
kmeansable = kmeansable.assign(kmeans_cluster=all_predictions)

In [None]:
# Number of rows assigned to each k-means cluster.
cluster_labels, cluster_sizes = np.unique(all_predictions, return_counts=True)
plt.bar(cluster_labels, cluster_sizes)

In [None]:
# Rows that k-means put into cluster number 2.
in_cluster_two = kmeansable['kmeans_cluster'] == 2
cluster_tester = kmeansable[in_cluster_two]

In [None]:
# Import the fcluster and linkage functions
from scipy.cluster.hierarchy import fcluster, linkage

# Ward linkage on Euclidean distances. linkage() returns the linkage matrix
# describing the merge tree, not a distance matrix.
linkage_matrix = linkage(cluster_tester, method='ward', metric='euclidean')

# Cut the merge tree into at most 3 flat clusters
hierarchical_labels = fcluster(linkage_matrix, 3, criterion='maxclust')

# Size of every hierarchical cluster
unique, counts = np.unique(hierarchical_labels, return_counts=True)
plt.bar(unique, counts)
plt.show()

# assign() returns a new frame; writing the column into the filtered slice
# directly would trigger pandas' SettingWithCopyWarning.
cluster_tester = cluster_tester.assign(cluster_hierar=hierarchical_labels)

In [None]:
# Narrow the selection down to hierarchical cluster number 1.
in_first_subcluster = cluster_tester['cluster_hierar'] == 1
cluster_tester = cluster_tester[in_first_subcluster]

In [None]:
# All rows of `data` whose product belongs to the selected cluster. The explicit
# copy makes this an independent frame: columns are dropped from it in place
# further down, which on a plain slice raises SettingWithCopyWarning.
test_predict = data[data['product_id'].isin(cluster_tester['product_id'])].copy()

# 5. Prediction

#### Drop unsettable columns
We want to predict how many items of a given type we will sell. That means the predicted column will be 'item_quantity'. We don't know how many items will be in the basket, its type etc. We only know the values of columns we can influence (or which are already recorded) - for example the price of the product, or its brand. The basket columns were great for clustering products, as they can show some patterns in them, but for pure predictions we are dropping those data.

In [None]:
def fill_zeroes(df : pd.DataFrame, start=None, end=None) -> pd.DataFrame:
    """
    Build a gap-free daily sales series, adding days when nothing was sold.

    Args
        df - dataframe with a 'doc_date' column and an 'item_quantity' column
        start - first day of the series; defaults to the notebook-level START_DATE
        end - last day of the series (inclusive); defaults to the notebook-level END_DATE
    Returns
        DataFrame with one row per day: 'invoice_date' and 'quantity_sold', the
        latter being the summed item_quantity of that day, or 0 when the day
        has no rows in df
    """
    date_column = 'doc_date'
    dates = pd.date_range(
        start=START_DATE if start is None else start,
        end=END_DATE if end is None else end,
    )

    # Sum once per day, then align to the full calendar. Days without any row
    # get 0. This replaces re-scanning the whole frame for every single date.
    daily_sales = df.groupby(date_column)['item_quantity'].sum().reindex(dates, fill_value=0)

    return pd.DataFrame({'invoice_date': dates, 'quantity_sold': daily_sales.to_numpy()})

In [None]:
# Daily sales series, zero-filled, for one example product.
example_product_rows = test_predict[test_predict['product_id'] == 197899]
fill_zeroes(example_product_rows)

In [None]:
# Basket-level columns are removed from both frames before modelling.
basket_columns = [
    'basket_id',
    'basket_total_price_with_vat',
    'basket_count_products',
    'basket_type',
    'count_basket_items',
]
for frame in (data, test_predict):
    frame.drop(labels=basket_columns, axis=1, inplace=True)

In [None]:
from catboost import CatBoostRegressor
from catboost import Pool

In [None]:
test_predict

In [None]:
# Positional split at row 9500.
# NOTE(review): the names are inverted relative to how the frames are used below —
# `test_predict` (the first 9500 rows) is what the model is fitted on, while
# `train_predict` (the remaining rows) is only used to evaluate predictions.
train_predict = test_predict[9500:]
test_predict = test_predict[:9500]

In [None]:
# CatBoost regressor trained with RMSE loss and a fixed random seed.
# This rebinds `model`, which held the KMeans estimator in the clustering section.
# NOTE(review): od_type/od_wait configure CatBoost's overfitting detector, which presumably
# needs an eval_set to have any effect; none is passed to fit() below — confirm.
model = CatBoostRegressor(
    iterations=1000, 
    learning_rate=0.02, 
    max_depth=10, 
    l2_leaf_reg=10, 
    loss_function='RMSE',
    random_seed=1,
    od_type='Iter',
    od_wait=25,
    verbose=100,
    use_best_model=False
    )

In [None]:
# Features are every column except the raw date and the target; the target is item_quantity.
# NOTE(review): early_stopping_rounds=3000 is larger than iterations=1000 and no eval_set
# is given, so early stopping presumably never triggers — confirm.
model.fit(test_predict.drop(labels=['doc_date', 'item_quantity'], axis=1), test_predict['item_quantity'],
              early_stopping_rounds=3000,
              verbose=100)

In [None]:
# Score the whole frame in one call instead of one model.predict() call per row.
# .tolist() keeps `predicted` a plain list of floats, as before.
predicted = model.predict(test_predict.drop(labels=['doc_date', 'item_quantity'], axis=1)).tolist()

In [None]:
orig = list(test_predict.item_quantity.values)

In [None]:
sum(orig)

In [None]:
sum(predicted)

In [None]:
# Score the held-out rows in one call instead of one model.predict() call per row.
# .tolist() keeps `pred_test` a plain list of floats, as before.
pred_test = model.predict(train_predict.drop(labels=['doc_date', 'item_quantity'], axis=1)).tolist()

In [None]:
orig_test = list(train_predict.item_quantity.values)

In [None]:
sum(pred_test)

In [None]:
sum(orig_test)

In [None]:
# NOTE(review): hard-coded ratio; presumably a predicted total over an actual total copied
# from an earlier run — TODO replace with the sums computed in the cells above.
1447 / 1450

In [None]:
plt.figure(figsize=(20, 8))
# value_counts() orders its result by frequency; sort by date so that the line
# is drawn chronologically instead of jumping back and forth along the x axis.
test_predict.doc_date.value_counts().sort_index().plot(xlabel='Date of sale', ylabel='Amount of orders', title = 'Amount of orders at any given day')
plt.show()

In [None]:
#indexed_predictions = pd.DataFrame()
#indexed_predictions['quantity_sold'] = predict_sold_test
#indexed_predictions.index = test.index

# IDEAS

In [None]:
"""
TODO 
OPIS TRANSOFMEROV MinMax a Cyclical
OPIS A UPRATANIE CLUSTERINGU


TODO 
preorbit kolacove grafy na boxploty, bar
"""

In [None]:
"""
IDEAS
PRIDAJ ROZBITIE NA TRENDY DO Samotneho vyhodnocovania
NAPOCITANIE PREDAJOV PRODUKTOV ZA TYZDEN????


NEDAVAJ PIE GRAFY DO PRACE!!!!
"""

In [None]:
# Total quantity sold per product for every (year, week) pair.
weekly_keys = ['doc_year', 'doc_week', 'product_id']
tmp = data.groupby(by=weekly_keys, as_index=False)['item_quantity'].sum()

In [None]:
data[['doc_week', 'doc_year']].drop_duplicates()

In [None]:
# Default figure size and resolution for the plots that follow.
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 80

# value_counts() gives, for each product_id, its number of rows in `tmp`;
# np.unique then tallies how many products share each such row count.
unique, counts = np.unique(tmp.product_id.value_counts().values, return_counts=True)

In [None]:
plt.bar(unique, counts)