In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import gc
from sklearn.preprocessing import LabelEncoder, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler

In [2]:
# Read the train and test splits from their parquet files in one pass.
train, test = map(
    pd.read_parquet,
    ("./data/train.parquet", "./data/test.parquet"),
)

In [3]:
# train.isnull().mean().sort_values(ascending=False).head(60)

D_88     0.998915
D_110    0.994335
B_39     0.993920
D_73     0.989902
B_42     0.987078
D_134    0.964801
B_29     0.931046
D_132    0.901911
D_76     0.887462
D_42     0.856943
D_142    0.829266
D_53     0.738429
D_50     0.568097
B_17     0.567229
D_105    0.546228
D_56     0.540716
S_9      0.530357
D_77     0.454476
D_43     0.299812
S_27     0.253267
D_46     0.219056
S_7      0.184498
S_3      0.184498
D_62     0.137064
D_48     0.129934
D_61     0.108118
P_3      0.054505
D_69     0.035158
D_55     0.033409
D_115    0.031947
D_119    0.031947
D_118    0.031947
D_121    0.031947
R_27     0.023267
D_130    0.018358
D_141    0.018358
D_131    0.018358
D_104    0.018358
D_128    0.018358
B_13     0.008952
P_2      0.008313
D_133    0.007722
D_144    0.007363
D_102    0.007350
D_52     0.005345
B_8      0.004026
S_22     0.003439
S_24     0.003361
S_25     0.002323
B_15     0.001252
B_25     0.001252
D_112    0.000479
D_45     0.000365
B_19     0.000364
B_27     0.000364
B_26     0

In [3]:
# Tag every row with the split it came from so the origin is still known after
# the two frames are stacked together.
train['set'], test['set'] = 'train', 'test'

In [4]:
# Stack train on top of test and give the result a fresh 0..n-1 row index.
df = pd.concat([train, test], ignore_index=True)

In [5]:
# The split frames are no longer needed once `df` exists; drop the names and
# ask the garbage collector to reclaim what it can.
del train
del test
gc.collect()

21

In [6]:
# Sanity check: (rows, columns) of the combined train + test frame.
df.shape

(16895213, 191)

In [7]:
# Model inputs are selected by position: everything except the first two
# columns and the last column (the 'set' tag is the last column added above).
features = df.columns[2:-1]
cat_features = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
# Every selected column that is not listed as categorical is treated as dense.
dense_features = [name for name in features if name not in cat_features]

In [11]:
# One dense column gets its -1 values mapped to 0 instead of NaN.
# NOTE(review): the column is picked by position (index 101), so it depends on
# the column order of the loaded frame — confirm which feature this is and
# consider referring to it by name.
col_101 = dense_features[101]
df[col_101] = df[col_101].replace(-1, 0)

In [9]:
# Replace each categorical column with integer class codes. A fresh encoder is
# fitted per column on the combined frame, so train and test share one mapping.
for col in tqdm(cat_features):
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
# Find the dense columns that contain at least one -1, then turn those -1
# values into NaN and store the affected columns as float32.
has_minus_one = df[dense_features].eq(-1).any()
int_col = has_minus_one[has_minus_one].index
df[int_col] = df[int_col].replace(-1, np.nan).astype(np.float32)

In [None]:
# Store every dense feature as 32-bit floats.
df[dense_features] = df[dense_features].astype("float32")

In [None]:
# Sanity check: (rows, columns) of the frame after the cleaning steps above.
df.shape

In [13]:
# Map every dense feature onto a normal-shaped distribution through its
# empirical quantiles, one column at a time, fitted on the combined frame.
#
# The quantiles are estimated from a random subsample of 1,000,000 rows, so the
# transformer is seeded: without a fixed random_state the transformed values
# (and everything saved from them) change from one run to the next.
for col in tqdm(dense_features):
    QT = QuantileTransformer(
        n_quantiles=1000,
        output_distribution='normal',
        subsample=1_000_000,
        random_state=42,
    )
    df[[col]] = QT.fit_transform(df[[col]])

In [14]:
# for col in tqdm(dense_features):
#     RS = RobustScaler()
#     df[[col]] = RS.fit_transform(df[[col]])

In [None]:
# for col in tqdm(dense_features):
#     SS = StandardScaler()
#     df[[col]] = SS.fit_transform(df[[col]])

  0%|          | 0/177 [00:00<?, ?it/s]

In [9]:
# Final feature order: categorical columns first, then the dense columns.
features = [*cat_features, *dense_features]

In [10]:
# Number of distinct customers in the combined train + test frame.
df['customer_ID'].nunique()

1383534

In [11]:
# Total number of model features (categorical + dense).
len(features)

188

In [13]:
# Largest absolute value of each feature, listed from smallest to largest.
df[features].abs().max().sort_values()

B_8        1.001572
D_104      1.001982
D_56       1.002011
D_77       1.014382
D_105      1.016661
            ...    
S_23      85.829074
B_37      90.192801
B_40      90.629014
R_23      93.443066
R_18     123.825999
Length: 188, dtype: float64

In [None]:
# Load the labels as a Series of 'target' values indexed by the file's first
# column.
train_labels = (
    pd.read_csv("./data/train_labels.csv", index_col=0)
    .loc[:, 'target']
)

In [14]:
# Pre-allocate the output arrays, sized from the data rather than from numbers
# copied by hand out of earlier cell outputs.
#   X: (customers, rows per customer, features), initialised to NaN.
#   y: one int8 label per customer, initialised to -1.
MAX_STATEMENTS = 13  # rows reserved per customer along the second axis
n_customers = df['customer_ID'].nunique()
n_features = len(features)

X = np.full([n_customers, MAX_STATEMENTS, n_features], np.nan, dtype=np.float32)
y = np.full(n_customers, -1, dtype=np.int8)

In [15]:
# Fill X and y one customer at a time, visiting customers in groupby order
# (pandas sorts the group keys by default), and record for each row of X which
# customer it belongs to and whether that customer has a training label.
customer_list = []
is_train = []
n_slots = X.shape[0]  # progress total comes from the allocated array, not a literal
for i, (cus, d) in tqdm(enumerate(df.groupby('customer_ID')), total=n_slots):
    has_label = cus in train_labels.index
    if has_label:
        y[i] = train_labels.loc[cus]
    is_train.append(has_label)
    # Right-align the customer's rows: shorter histories leave the leading
    # positions of X[i] at their initial NaN fill.
    X[i, -len(d):] = d[features].values
    customer_list.append(cus)
customer_list = np.array(customer_list)
is_train = np.array(is_train)

# If the frame holds fewer customers than X has rows, the trailing rows of X / y
# would silently stay unfilled and be saved misaligned with customer_list.
assert len(customer_list) == n_slots, (
    f"filled {len(customer_list)} customers but X has {n_slots} rows"
)

  0%|          | 0/1383534 [00:00<?, ?it/s]

In [16]:
# Write the feature tensor, labels, customer order and train mask to disk.
# np.save("./data/X_freq", X)
for path, array in (
    ("./data/X2", X),
    ("./data/y", y),
    ("./data/customer_list", customer_list),
    ("./data/is_train", is_train),
):
    np.save(path, array)