In [2]:
import os
import torch
import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import coo_matrix
from sklearn.preprocessing import StandardScaler

# ---- Load the raw transaction table and declare the column groups ----
path = '../data/tbrain_cc_training_48tags_hash_final.csv'
df = pd.read_csv(path)

# Raw shop-tag numbers singled out from the rest (16 of them);
# presumably the prediction targets -- confirm against the task definition.
pred_label = [ 2,  6, 10, 12, 13, 15, 18, 19, 21, 22, 25, 26, 36, 37, 39, 48]

# The textual tag 'other' is rewritten to '0' so the whole column can be cast to int.
df['shop_tag'] = df['shop_tag'].str.replace('other', '0').astype(int)
df = df.reset_index(drop=True)
print(df.shape)

# Column groups referenced by the cells below.
edge_cols = ['chid', 'shop_tag']
y_cols = ['txn_amt']
chid_feat_cols = [
    'masts', 'educd', 'trdtp', 'naty', 'poscd', 'cuorg', 'gender_code', 'age',
]
# Positional slice: everything between the first 3 and the last 10 columns of the CSV.
edge_feat_cols = df.columns[3:-10].tolist()

# Scale down the txn_amt stored at row 2417847 by 1e46 (original note: "let max
# value be smaller"). The row label is hard-coded and tied to this exact CSV.
df.loc[2417847, 'txn_amt'] = df.loc[2417847, 'txn_amt'] / 1e46

(32975653, 53)


In [3]:
def _ordinal_index(series):
    """Map every distinct value of `series` to its position in sorted order."""
    return {value: rank for rank, value in enumerate(sorted(series.unique()))}


# Contiguous 0-based ids for card holders and shop tags.
chid2idx = _ordinal_index(df['chid'])
shop2idx = _ordinal_index(df['shop_tag'])

# Per-feature encoding of the card-holder attributes. Missing values first get
# their own category (current max + 1), then every category is re-indexed from 0.
feat2idx = {}
for feat in chid_feat_cols:
    column = df[feat]
    filled = column.fillna(column.max() + 1).astype(int)
    feat2idx[feat] = _ordinal_index(filled)
    df[feat] = filled.map(feat2idx[feat].get)

df['chid'] = df['chid'].map(chid2idx.get)
df['shop_tag'] = df['shop_tag'].map(shop2idx.get)

# Box-Cox transform of the target; `lam` is the fitted lambda, which would be
# needed to invert the transform later.
bcx_target, lam = stats.boxcox(df['txn_amt'])
df['txn_amt'] = bcx_target

In [4]:
# Standardise txn_cnt (zero mean, unit variance); it is the only column scaled here.
scaler = StandardScaler()
txn_cnt_scaled = scaler.fit_transform(df[['txn_cnt']])  # shape (n_rows, 1)
df['txn_cnt'] = txn_cnt_scaled[:, 0]

In [5]:
# Persist the raw-value -> index dictionaries. np.save appends '.npy' and pickles
# a dict as a 0-d object array, so it must be read back with
# np.load(..., allow_pickle=True).item().
for target, mapping in (
    ('../data/chid2idx', chid2idx),
    ('../data/shop2idx', shop2idx),
    ('../data/feat2idx', feat2idx),
):
    np.save(target, mapping)

In [6]:
# The two columns that identify an edge (card holder, shop tag).
edge_cols = ['chid', 'shop_tag']

# Edge feature columns, generated from their naming scheme.
# Layout: 1 overall count, 4 channel counts, 4 channel amount shares,
#         15 per-card counts, 15 per-card amount shares  (1 + 4 + 4 + 15 + 15 = 39).
_channels = [
    region + '_' + mode
    for region in ('domestic', 'overseas')
    for mode in ('offline', 'online')
]
_cards = ['card_{}'.format(k) for k in list(range(1, 15)) + ['other']]

feat_cols = (
    ['txn_cnt']
    + [channel + '_cnt' for channel in _channels]
    + [channel + '_amt_pct' for channel in _channels]
    + [card + '_txn_cnt' for card in _cards]
    + [card + '_txn_amt_pct' for card in _cards]
)

In [7]:
# Shop tags present in the data that are not in pred_label. The position of a tag
# in this list becomes its column index in the exported 'other_XX.pt' tensors, so
# the list is sorted explicitly instead of relying on set iteration order.
# NOTE(review): pred_label holds raw tag numbers while df.shop_tag has already
# been re-indexed through shop2idx; the two only agree if shop2idx is the identity
# mapping (i.e. every tag 0..48 occurs in the data) -- confirm.
other_label = sorted(set(df.shop_tag.unique()) - set(pred_label))

In [8]:
# ---- Export one set of sparse tensors per month (`dt`) ----
# For every month four files are written to `out_path`:
#   x_XX.pt     sparse [n_chid, n_pred]               txn_amt on the pred_label tags
#   feat_XX.pt  sparse [n_chid, n_pred, n_feat]       feat_cols on the same edges
#   top3_XX.pt  sparse [n_chid, n_pred]               rank weights of each holder's top-3 tags
#   other_XX.pt sparse [n_chid, n_other]              txn_amt on the remaining tags
out_path = '../data/sparse/'
os.makedirs(out_path, exist_ok=True)


def _edge_index(rows, col_of_tag):
    """Return the 2 x nnz integer (chid, column) index array for `rows`.

    `col_of_tag` maps a shop_tag value to its column position in the output tensor.
    """
    return np.vstack([
        rows['chid'].to_numpy(),
        rows['shop_tag'].map(col_of_tag).to_numpy(),
    ]).astype(np.int64)


n_chid, n_pred, n_other = len(chid2idx), len(pred_label), len(other_label)

# Column position of each tag inside its group (replaces repeated list.index calls).
pred_col = {tag: col for col, tag in enumerate(pred_label)}
other_col = {tag: col for col, tag in enumerate(other_label)}

# Loop-invariant parts of the top-3 tensor: every card holder contributes three
# entries whose weights go, in order, to the 3rd, 2nd and 1st largest amount.
top3_rows = np.repeat(np.arange(n_chid), 3)
top3_weights = np.tile([0.95, 0.975, 1.0], n_chid)

for dt in sorted(df.dt.unique()):
    month = df[df.dt == dt]

    # -- edges on the predicted tags --
    pred_rows = month[month.shop_tag.isin(pred_label)].sort_values(by=['chid', 'shop_tag'])
    indices = _edge_index(pred_rows, pred_col)

    data_amt = pred_rows['txn_amt'].to_numpy()
    torch.save(torch.sparse_coo_tensor(indices, data_amt, [n_chid, n_pred]), out_path+'x_{:02d}.pt'.format(dt))

    data_feat = pred_rows[feat_cols].to_numpy()  # all edge features, one row per edge
    torch.save(torch.sparse_coo_tensor(indices, data_feat, [n_chid, n_pred, len(feat_cols)]), out_path+'feat_{:02d}.pt'.format(dt))

    # -- top-3 tags per card holder --
    x_values = coo_matrix((data_amt, (indices[0], indices[1])), shape=[n_chid, n_pred]).toarray()
    # A full argsort is required here: argpartition with a single kth only fixes the
    # 3rd-largest position and leaves the two largest in undefined order, which would
    # hand the 0.975 / 1.0 weights to the wrong tags.
    top3_cols = np.argsort(x_values, axis=1)[:, -3:].ravel()
    data = top3_weights.copy()
    # Cells with no transaction are 0 in the dense matrix; give them zero weight.
    # NOTE(review): Box-Cox output can be negative for small raw amounts, in which
    # case empty cells (0) would outrank real transactions -- confirm the transformed
    # amounts are strictly positive.
    data[x_values[top3_rows, top3_cols] == 0] = 0
    torch.save(torch.sparse_coo_tensor(np.vstack([top3_rows, top3_cols]), data, [n_chid, n_pred]), out_path+'top3_{:02d}.pt'.format(dt))

    # -- edges on all remaining tags (amount only) --
    other_rows = month[month.shop_tag.isin(other_label)].sort_values(by=['chid', 'shop_tag'])
    indices = _edge_index(other_rows, other_col)

    data = other_rows['txn_amt'].to_numpy()
    torch.save(torch.sparse_coo_tensor(indices, data, [n_chid, n_other]), out_path+'other_{:02d}.pt'.format(dt))

In [9]:
# ---- One row of static attributes per card holder ----
# Columns: chid, the encoded categorical attributes, then slam.
user_cols = ['chid'] + chid_feat_cols + ['slam']
df_user_feat = df[user_cols].drop_duplicates(ignore_index=True)

# A holder can still have several distinct attribute rows; keep the last one.
# The index was reset above, so index labels are valid positions for .iloc.
last_per_chid = df_user_feat[['chid']].drop_duplicates(keep='last').index
df_user_feat = df_user_feat.iloc[last_per_chid].sort_values(by='chid', ignore_index=True)

# Fill missing slam with the median, then log-scale it. Assigning the column
# explicitly (instead of `df_user_feat.slam.fillna(..., inplace=True)`) keeps this
# working under pandas Copy-on-Write, where the chained in-place call would only
# modify a temporary Series.
# NOTE(review): log10 yields -inf / NaN for slam <= 0 -- confirm slam is strictly positive.
df_user_feat['slam'] = df_user_feat['slam'].fillna(df_user_feat['slam'].median())
df_user_feat['slam'] = np.log10(df_user_feat['slam'])

In [10]:
# ---- Save the per-holder attributes as two tensors ----
# Column 0 is chid and the last column is slam; everything in between is the
# encoded categorical attributes.
user_matrix = df_user_feat.values
categorical_part = user_matrix[:, 1:-1]
slam_part = user_matrix[:, [-1]]  # list index keeps this 2-D: (n_chid, 1)

x = torch.LongTensor(categorical_part)
torch.save(x, out_path + 'user_sparse.pt')

x = torch.FloatTensor(slam_part)
torch.save(x, out_path + 'user_dense.pt')