## Capstone Part 3 - Preprocessing for LGBM Ranker model

In this notebook, I convert the raw CSV files to Parquet and apply other techniques (integer IDs, smaller numeric dtypes, integer category codes) to reduce file sizes and memory use before training the LightGBM Ranker.

In [18]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the three input tables from CSV.
# article_id is read as text in the tables that carry it, so the ids arrive
# unparsed and can be converted explicitly later on.
article_id_dtype = {"article_id": "str"}

transactions = pd.read_csv(
    '../datasets/transactions_train.csv',
    dtype=article_id_dtype,
)
customers = pd.read_csv('../datasets/customers_cleaned.csv')
articles = pd.read_csv(
    '../datasets/articles_cleaned.csv',
    dtype=article_id_dtype,
)

## The steps below are performed to save memory

In [3]:
# helper functions from following link:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal id strings to integers.

    Only the last 16 characters of every string are kept; each one is then
    parsed element-wise with ``hex_id_to_int``.
    """
    id_tails = series.str.slice(-16)
    return id_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    Strings shorter than 16 characters are parsed in full.
    """
    # NOTE: the parameter name shadows the builtin ``str`` inside this
    # function; it is kept so that keyword callers continue to work.
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Return ``series`` cast to 32-bit integers."""
    as_int32 = series.astype(dtype='int32')
    return as_int32

def article_id_int_to_str(series):
    """Return ``series`` as strings, each prefixed with a single '0'."""
    as_text = series.astype('str')
    return as_text.radd('0')

In [4]:
# Replace the hexadecimal customer id strings with their integer form.
transactions['customer_id'] = customer_hex_id_to_int(transactions.customer_id)

In [5]:
# Parse the transaction date strings (YYYY-MM-DD) into datetimes.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

In [6]:
# Derive a week number counted back from the most recent transaction date:
# the last 7 days get week 104, the 7 days before that 103, and so on.
# Floor division keeps the week a whole number instead of a fraction.
last_day = transactions['t_dat'].max()
transactions['week'] = 104 - (last_day - transactions['t_dat']).dt.days.floordiv(7)

In [7]:
# Store article ids as int32 in both tables that carry them.
transactions['article_id'] = article_id_str_to_int(transactions['article_id'])
articles['article_id'] = article_id_str_to_int(articles['article_id'])

# Narrow the remaining transaction columns to smaller numeric dtypes.
for column, narrow_dtype in [
    ('week', 'int8'),
    ('sales_channel_id', 'int8'),
    ('price', 'float32'),
]:
    transactions[column] = transactions[column].astype(narrow_dtype)

In [8]:
# Convert the hexadecimal customer ids to integers, matching the transactions table.
customers.customer_id = customer_hex_id_to_int(customers.customer_id)
for col in ['FN', 'Active', 'age']:
    # Missing values are encoded as -1 so the column can be stored as int8.
    # The filled Series is assigned back explicitly: calling
    # fillna(..., inplace=True) on customers[col] is chained assignment, which
    # under pandas Copy-on-Write only modifies a temporary and leaves
    # `customers` untouched, so the int8 cast would then fail on the NaNs.
    customers[col] = customers[col].fillna(-1).astype('int8')

In [11]:
# helper function
from sklearn.base import BaseEstimator, TransformerMixin
class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` learns, for each column, the list of values that occur more than
    ``min_examples`` times, in the order returned by ``value_counts``.
    ``transform`` replaces each value with its position in that list; values
    that are not in the list (including missing values, which
    ``value_counts`` does not count) are encoded as ``-1``.

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if it occurs strictly more than
        this many times in the fitted column.

    Attributes
    ----------
    categories : list of list
        One list of kept values per fitted column, in column order.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of each column of DataFrame ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        # Build a fresh list on every call. Appending to the existing
        # attribute would make a second fit() stack new entries behind the
        # old ones, and transform() would silently keep using the stale
        # categories at positions 0..n-1.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the columns and index of ``X``.

        Column ``i`` of ``X`` is encoded with the categories learned for
        column ``i`` during ``fit``.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        # Keep the caller's index so that assigning the result back to the
        # source frame aligns row by row even when its index is not 0..n-1.
        return pd.DataFrame(data=data, index=X.index)

In [12]:
# Replace the text-valued customer columns with integer category codes,
# fitting a separate encoder for each column.
for col in ['club_member_status', 'postal_code', 'fashion_news_frequency']:
    customers[col] = Categorize().fit_transform(customers[[col]])[col]

In [13]:
# Encode every object-dtype column of `articles` as integer category codes.
object_columns = [col for col in articles.columns if articles[col].dtype == 'object']
for col in object_columns:
    encoded = Categorize().fit_transform(articles[[col]])
    articles[col] = encoded[col]

In [14]:
# Halve the storage of the remaining 64-bit integer columns of `articles`.
int64_columns = [col for col in articles.columns if articles[col].dtype == 'int64']
for col in int64_columns:
    articles[col] = articles[col].astype('int32')

In [15]:
# Order the transactions by date, then by customer within each date.
transactions = transactions.sort_values(by=['t_dat', 'customer_id'])

In [16]:
# Show the last five rows to check the converted columns after sorting.
transactions.tail(5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104
31780475,2020-09-22,18443633011701112574,914868002,0.033881,1,104


In [17]:
# Keep only the five most recent weeks (week numbers 100 to 104).
is_recent = transactions['week'].gt(99)
transactions_5w = transactions.loc[is_recent]

In [None]:
# Write each processed table to its Parquet file.
parquet_outputs = [
    (transactions, '../datasets/transactions_train.parquet'),
    (transactions_5w, '../datasets/transactions_5w_train.parquet'),
    (customers, '../datasets/customers.parquet'),
    (articles, '../datasets/articles.parquet'),
]
for frame, path in parquet_outputs:
    frame.to_parquet(path)