# data-preprocessing

Preprocessing procedures for raw data.

Requires:
1. `data/raw/raiffeisen_data_anonymized.csv.zip`

Produces:
1. `data/mcc2id.json`
2. `data/x_train.npy`
3. `data/y_train.npy`
4. `data/x_test.npy`
5. `data/y_test.npy`

In [1]:
import sys
# Put the parent directory on the import path; presumably that is where the
# project-local `myutils` and `config` modules live -- TODO confirm.
sys.path.append('..')

In [2]:
from zipfile import ZipFile
from datetime import datetime

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from myutils.json import save_json

from config import raw_data_zip_fpath, x_train_fpath, y_train_fpath, x_test_fpath, y_test_fpath, mcc2id_fpath

In [3]:
# Read the single CSV stored inside the zip archive, keeping only the columns
# needed to count transactions per (customer, month, MCC).
with ZipFile(raw_data_zip_fpath) as zipfile:
    namelist = zipfile.namelist()
    # The archive is expected to contain exactly one member: the raw CSV.
    assert len(namelist) == 1

    with zipfile.open(namelist[0]) as file:
        df = pd.read_csv(
            file,
            usecols=['customer_id', 'mcc', 'transaction_date'],
            dtype={'mcc': str},  # read MCC codes as strings, not numbers
            parse_dates=['transaction_date'],
        )

# Fail fast on missing dates: they would otherwise become NaN month keys.
assert df['transaction_date'].notna().all(), 'transaction_date contains missing values'

# Truncate every date to the first instant of its calendar month and express
# it as float seconds since the Unix epoch. Going through a monthly period
# (rather than only resetting the day to 1) also discards any time-of-day
# component, so all transactions of one month share exactly one key.
month_start = df['transaction_date'].dt.to_period('M').dt.to_timestamp()
df['month'] = (month_start - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
df = df.drop(columns=['transaction_date'])

df.head(2)

Unnamed: 0,customer_id,mcc,month
0,0dc0137d280a2a82d2dc89282450ff1b,5261,1498867000.0
1,0dc0137d280a2a82d2dc89282450ff1b,5261,1504224000.0


In [4]:
# Count the distinct values in each column.
df.nunique()

customer_id    9988
mcc             243
month             9
dtype: int64

In [5]:
# Give every distinct customer, month and MCC a dense 0-based integer id.
# Customers and MCCs are numbered in the order `unique()` returns them;
# months are numbered in ascending order of their timestamp.
customers = df['customer_id'].unique().tolist()
n_customers = len(customers)
customer2id = dict(zip(customers, range(n_customers)))

months = df['month'].unique().tolist()
months.sort()
n_months = len(months)
month2id = dict(zip(months, range(n_months)))

mccs = df['mcc'].unique().tolist()
n_mccs = len(mccs)
mcc2id = dict(zip(mccs, range(n_mccs)))

In [6]:
# Store the MCC -> index mapping at `mcc2id_fpath` via the project's
# `save_json` helper (presumably written as JSON -- helper not shown here).
save_json(mcc2id, mcc2id_fpath)

The data is represented as a rank-3 tensor: `data[i, j, k]` is the number of transactions made by the $i$-th customer during the $j$-th month with the $k$-th MCC.

_Note_: `uint8` is used here to save storage; this is safe because I've previously checked that the largest value in the tensor is 140, which is below the `uint8` maximum of 255.

In [7]:
# Translate every transaction row into its (customer, month, MCC) coordinates.
# `astype(np.intp)` raises if any value was not found in its mapping, because
# `map` leaves NaN for unmapped values.
customer_idx = df['customer_id'].map(customer2id).astype(np.intp).to_numpy()
month_idx = df['month'].map(month2id).astype(np.intp).to_numpy()
mcc_idx = df['mcc'].map(mcc2id).astype(np.intp).to_numpy()

tensor_shape = (n_customers, n_months, n_mccs)

# Count transactions per occupied tensor cell in one vectorized pass instead
# of incrementing cell by cell in a Python loop over all rows.
flat_idx = np.ravel_multi_index((customer_idx, month_idx, mcc_idx), tensor_shape)
cells, counts = np.unique(flat_idx, return_counts=True)

# uint8 arithmetic wraps around silently past 255, so verify the counts fit
# before narrowing them to the storage dtype.
uint8_max = np.iinfo(np.uint8).max
assert counts.size == 0 or counts.max() <= uint8_max, (
    f'largest cell count {counts.max()} does not fit into uint8'
)

data = np.zeros(tensor_shape, dtype=np.uint8)
# `np.put` addresses the flattened array, matching the indices in `cells`.
np.put(data, cells, counts.astype(np.uint8))

data.shape

  0%|          | 0/1047190 [00:00<?, ?it/s]

(9988, 9, 243)

### Constructing sequences

In [8]:
seq = 5  # length of each input window, in consecutive months
ntest = 1  # number of final target months held out for the test split

In [9]:
# Slide a `seq`-month window over every customer's history: the window is the
# input sample and the month immediately after it is the target.
train_inputs, train_targets = [], []
test_inputs, test_targets = [], []

for customer_counts in tqdm(data):  # one (month, MCC) count matrix per customer
    for start in range(n_months - seq):
        target_month = start + seq
        window = customer_counts[start:target_month]
        target = customer_counts[target_month]

        # Skip samples whose input window or target contains no transactions.
        if not (window.any() and target.any()):
            continue

        # Targets within the last `ntest` months form the test split;
        # everything earlier is training data.
        if n_months - target_month <= ntest:
            test_inputs.append(window)
            test_targets.append(target)
        else:
            train_inputs.append(window)
            train_targets.append(target)

x_train = np.array(train_inputs)
y_train = np.array(train_targets)
x_test = np.array(test_inputs)
y_test = np.array(test_targets)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

  0%|          | 0/9988 [00:00<?, ?it/s]

((26038, 5, 243), (26038, 243), (8560, 5, 243), (8560, 243))

In [10]:
# Persist the four arrays with `np.save`, each at its configured path.
for fpath, array in (
    (x_train_fpath, x_train),
    (y_train_fpath, y_train),
    (x_test_fpath, x_test),
    (y_test_fpath, y_test),
):
    np.save(fpath, array)