In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

import tqdm
import pathlib

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the people table and the training-activity table from red_hat/,
# then report both shapes as a quick sanity check.
people_csv, act_csv = 'red_hat/people.csv', 'red_hat/act_train.csv'
df_people = pd.read_csv(people_csv)
df_act = pd.read_csv(act_csv)
print(df_people.shape, df_act.shape)

(189118, 41) (2197291, 15)


Filter out people whose activities include both outcomes (1 and 0), keeping only people with a single, consistent outcome.

In [3]:
# Ids of people that appear with outcome 1 / outcome 0 at least once.
outcom_one = set(df_act.loc[df_act['outcome'] == 1, 'people_id'].unique())
outcom_zero = set(df_act.loc[df_act['outcome'] == 0, 'people_id'].unique())

# People seen with both outcomes.
outcom_0_1 = outcom_one & outcom_zero

# Symmetric difference: people seen with exactly one of the two outcomes.
keep_people = outcom_one ^ outcom_zero

# Note: only df_act is filtered; df_people is not restricted to keep_people.
df_act = df_act[df_act['people_id'].isin(keep_people)]
print(df_people.shape, df_act.shape)

(189118, 41) (1889213, 15)


Encode each categorical feature, keeping only its top-k most frequent values.

In [4]:
# Number of most-frequent values kept per categorical column.
k = 10

# For each of the positional columns 3..13, collect the k most frequent
# values; value_counts() sorts by descending frequency and skips NaN.
categories = []
for _, col in df_act.iloc[:, 3:14].items():
    top_values = col.value_counts().index[:k].tolist()
    # Columns with missing entries get an explicit placeholder category.
    if col.isnull().any():
        top_values.append('__NAN__')
    categories.append(top_values)

# Replace every missing value with the same placeholder string.
df_act_filled = df_act.fillna('__NAN__')

In [5]:
# Ordinal-encode positional columns 3..13 against the precomputed category
# lists. Values missing from a column's list are mapped to unknown_value (-1).
encoder = OrdinalEncoder(
    categories=categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=int,
)
cat_block = df_act_filled.iloc[:, 3:14]
encoded_arr = encoder.fit_transform(cat_block)

# Shift every code up by one so the unknown marker (-1) is stored as 0.
df_act_filled.iloc[:, 3:14] = encoded_arr + 1

Encode date

In [6]:
# Express each date as the whole number of days since the earliest date
# in the table.
date = pd.to_datetime(df_act_filled['date'])
delta = date - date.min()
df_act_filled['date'] = delta.dt.days

Drop the activity_id column

In [7]:
# Remove the activity_id column in place; later cells select the remaining
# columns by position.
df_act_filled.drop(columns=['activity_id'], inplace=True)

Groupby and Save

In [8]:
# Output layout: red_hat/red_hat_<k>/ is the output root, and its records/
# sub-directory receives the per-person CSV files.
path = pathlib.Path('red_hat') / f'red_hat_{k}'
record_path = path / 'records'
for out_dir in (path, record_path):
    out_dir.mkdir(exist_ok=True)

In [9]:
# Write one CSV per person and remember that person's outcome label.
outcomes = {}
grouped = df_act_filled.groupby('people_id')
for uid, person_df in tqdm.tqdm(grouped):
    # Use the first row's outcome as the label for this person.
    outcomes[uid] = person_df['outcome'].iloc[0]
    # Strip the first and last columns by position before saving.
    features = person_df.iloc[:, 1:-1]
    features.to_csv(record_path / f'{uid}.csv', header=True, index=False)

100%|█████████████████████████████████████████████████████████████████████████| 144639/144639 [00:54<00:00, 2646.93it/s]


Save outcomes (ground truth)

In [10]:
# Turn the {people_id: outcome} mapping into a Series indexed by people_id.
outcomes = pd.Series(data=outcomes)

In [11]:
# Write the outcomes as headerless CSV rows, with the Series index
# (people_id) in the first field.
groundtruth_file = path / 'groundtruth.csv'
outcomes.to_csv(groundtruth_file, header=None, index=True)

Save feature info

In [15]:
# Per-column maximum, restricted to positions 2..-2 (the first two columns
# and the last one are skipped). Adding one turns the largest value in each
# column into a count that includes 0.
col_max = df_act_filled.max()
nuniques = col_max.iloc[2:-1] + 1

In [16]:
# Build the feature description table: one row per feature column holding its
# name, its kind ('num' or 'cat') and, for categorical columns, the value
# taken from nuniques.
# Series.iteritems() was removed in pandas 2.0, so .items() is used instead.
# The loop variable is named n_codes rather than k so that it cannot be
# confused with the notebook-level top-k constant `k`.
lines = [['date', 'num', '']]
lines += [[col, 'cat', n_codes] for col, n_codes in nuniques.items()]

df_info = pd.DataFrame(lines)
# header=False is the documented way to suppress the header row.
df_info.to_csv(path / 'info.csv', header=False, index=False)

Save categories

In [17]:
# One text line per encoded column: the column name followed by its category
# values, all comma-separated.
lines = []
for name, cats in zip(encoder.feature_names_in_, encoder.categories_):
    lines.append(f"{name},{','.join(cats)}")

In [18]:
# Write the category lines to categories.txt, newline-separated and with no
# trailing newline.
categories_file = path / 'categories.txt'
categories_file.write_text('\n'.join(lines))