# Get the dataset ready

## 1. Import libs

In [1]:
import numpy as np
import pandas as pd
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
#import cudf      # RAPIDS cuDF : read df from GPU


import warnings
warnings.filterwarnings("ignore")

## 2. Check Result format
Check what the recommendation system's output looks like
- Recommend 12 items to each customer

In [2]:
# Load the sample submission: customer_id & 12 x recommendations.
submit = pd.read_csv('raw/sample_submission.csv')

# Show the first rows so the expected result format is actually visible
# in the notebook (this section exists to check that format).
submit.head()

## 3. Memory tricks
Deal with the data size first:

1. Resize the images to a lower resolution (512 x 512)  
https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306152
2. Reduce memory  
https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
3. Save to Parquet to speed up reading the files

In [3]:
# List the files in the current working directory.
!ls

Customer Cleaning & EDA.ipynb raw
H&M -Articles EDA.ipynb       script1_Prep_z11.ipynb
articles_20ss.parquet         script2_EDA_z11.ipynb
customers_20ss.parquet        transactions_20ss.parquet
images_512_512


> ### 3.1 Read Data

In [4]:
# Read the three raw CSV files in one pass; the tuple order below matches
# the order of the names on the left-hand side.
articles, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'raw/articles.csv',            #    105,542 x 25
        'raw/customers.csv',           #  1,371,980 x 7
        'raw/transactions_train.csv',  # 31,788,324 x 5
    )
)

> ### 3.2 Reduce memory

##### transactions_train.csv

In [5]:
# customer_id: keep only the last 16 hex characters of the id string and store
# them as a 64-bit integer instead of the full string.
# The conversion is skipped when the column is already an integer, so this cell
# can be re-run without failing on the already-converted values.
# NOTE(review): 16 hex digits can exceed the int64 range; the cast presumably
# wraps such values to negative numbers -- confirm on your pandas version.
if not pd.api.types.is_integer_dtype(transactions['customer_id']):
    transactions['customer_id'] = (
        transactions['customer_id']
        .apply(lambda x: int(x[-16:], 16))
        .astype('int64')
    )

# t_dat: parse to datetime and derive compact int8 date parts
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
transactions['year'] = (transactions['t_dat'].dt.year - 2000).astype('int8')  # 2 digits indicating year
transactions['month'] = transactions['t_dat'].dt.month.astype('int8')
transactions['day'] = transactions['t_dat'].dt.day.astype('int8')
# t_dat itself is kept (not deleted) for later use

# price
transactions['price'] = transactions['price'].astype('float32')

# sales_channel_id
transactions['sales_channel_id'] = transactions['sales_channel_id'].astype('int8')

##### customers.csv

In [6]:
# customer_id / postal_code: keep only the last 16 hex characters of each
# string and store them as 64-bit integers (same conversion as applied to
# transactions.customer_id, so the ids still match across both tables).
# A column that is already an integer is left untouched, so this cell can be
# re-run without failing on the already-converted values.
for hashed_col in ('customer_id', 'postal_code'):
    if not pd.api.types.is_integer_dtype(customers[hashed_col]):
        customers[hashed_col] = (
            customers[hashed_col]
            .apply(lambda x: int(x[-16:], 16))
            .astype('int64')
        )

> ### 3.3 Get 2020ss data

In [7]:
# Transactions from March through August 2020 (year is stored as year - 2000);
# the helper column `year` is constant after filtering, so it is dropped.
transactions_20ss = (
    transactions
    .loc[transactions['year'].eq(20) & transactions['month'].between(3, 8)]
    .drop(columns='year')
)

# Keep only the articles that appear in the selected transactions
mask = articles['article_id'].isin(transactions_20ss['article_id'])
articles_20ss = articles.loc[mask]

# Keep only the customers that appear in the selected transactions
mask = customers['customer_id'].isin(transactions_20ss['customer_id'])
customers_20ss = customers.loc[mask]

> ### 3.4 Save as parquet files

In [8]:
# Write each filtered table to its own parquet file, without the index.
for parquet_path, frame in (
    ('articles_20ss.parquet', articles_20ss),
    ('customers_20ss.parquet', customers_20ss),
    ('transactions_20ss.parquet', transactions_20ss),
):
    frame.to_parquet(parquet_path, index=False)