In [None]:
## Importing libraries

import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive under /content/drive; every CSV path used in this notebook
# starts with that prefix.
drive.mount('/content/drive', force_remount=True)

import warnings
# NOTE(review): this suppresses *all* Python warnings from here on, including
# pandas warnings raised by the feature-engineering cells below.
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
## Importing data

# Raw train / test tables are read from the mounted Drive folder.
df_train = pd.read_csv(
    '/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/train.csv'
)
df_test = pd.read_csv(
    '/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/test.csv'
)

# Feature columns are exactly those whose name begins with "var".
features = list(filter(lambda name: name.startswith("var"), df_train.columns))

In [None]:
## Fake vs. Real data
#
# A test row is treated as "real" when at least one of its feature values occurs
# exactly once in that feature's test column; rows with no such value are
# treated as synthetic.

# Feature matrix of the test set (everything except the ID column).
test = df_test.drop(columns=['ID_code']).values

# unique_count[i, j] becomes 1 when the value at row i, column j is a singleton
# within column j.
unique_count = np.zeros_like(test)

for col in range(test.shape[1]):
    # np.unique returns (values, first-occurrence index, counts) here.
    _, first_pos, occurrences = np.unique(test[:, col], return_index=True, return_counts=True)
    singleton_rows = first_pos[occurrences == 1]
    unique_count[singleton_rows, col] += 1

# Number of singleton values per row, computed once and reused for both splits.
singletons_per_row = unique_count.sum(axis=1)
real_samples = np.flatnonzero(singletons_per_row > 0)
synth_samples = np.flatnonzero(singletons_per_row == 0)

# Reference population for later counting: all train rows plus the real test rows.
df_all = pd.concat([df_train, df_test.iloc[real_samples]])

In [None]:
## Magic Features - Binary
#
# Adds one "<feature>_unique" column per feature:
#   * rows of df_all (train + real test rows): 1 when the value occurs exactly
#     once in df_all for that feature, otherwise 0;
#   * synthetic test rows: always 0.
# The result is split back into df_train / df_test and written to CSV.

# Work on explicit copies. A plain `df_all_copy = df_all` only aliases the frame
# (so df_all would be mutated too), and assigning columns into an un-copied
# `.iloc` slice of df_test is the chained-assignment pattern pandas warns about.
df_all_copy = df_all.copy()
test_fake = df_test.iloc[synth_samples].copy()

for feature in features:
    count = df_all_copy[feature].value_counts()
    # Vectorised lookup of each value's frequency; `.values` sidesteps index
    # alignment because df_all's index contains repeated labels (train and test
    # row numbers overlap).
    df_all_copy[feature + "_unique"] = df_all_copy[feature].map(count).eq(1).astype(int).values
    test_fake[feature + "_unique"] = 0

# ID_code tells train rows and test rows apart.
df_train = df_all_copy[df_all_copy["ID_code"].str.contains("train")].copy()
test_real = df_all_copy[df_all_copy["ID_code"].str.contains("test")].drop(columns=["target"])

# Re-assemble the full test set; sort_index puts the rows back in index order.
df_test = pd.concat([test_real, test_fake], sort=False).sort_index()

print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))


df_test.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-binary/test_fe1_binary.csv", index=False)
df_train.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-binary/train_fe1_binary.csv", index=False)

Training set shape after creating magic features: (200000, 402)
Test set shape after creating magic features: (200000, 401)


In [None]:
## Magic Features - Sum
#
# For every feature this cell adds, to both df_train and df_test:
#   <feature>vc   - how often the row's value occurs in df_all_copy, capped at 10 (uint8)
#   <feature>sum  - (value - mean of the feature in df_all_copy), zeroed unless vc > 1
#   <feature>sum2 - value, zeroed unless vc > 2
#   <feature>sum3 - value, zeroed unless vc > 4
# Columns are appended in the order: all vc, all sum, all sum2, all sum3.

for feature in features:
    temp = df_all_copy[feature].value_counts(dropna=True)
    for frame in (df_train, df_test):
        # A value missing from the reference counts maps to NaN. Count it as 0
        # explicitly: the previous `min(10, x)` turned NaN into 10 by accident.
        frame[feature + 'vc'] = (
            frame[feature].map(temp).fillna(0).clip(upper=10).astype(np.uint8)
        )

# (column suffix, count threshold, subtract the reference mean first?)
for suffix, threshold, centred in (('sum', 1, True), ('sum2', 2, False), ('sum3', 4, False)):
    for feature in features:
        # The reference mean is needed only for the centred variant and is
        # computed once per feature rather than once per frame.
        offset = df_all_copy[feature].mean() if centred else None
        for frame in (df_train, df_test):
            values = frame[feature] if offset is None else frame[feature] - offset
            # 1 where the value is frequent enough, 0 elsewhere. Multiplying
            # (rather than masking) keeps the sign of zero as before.
            keep = frame[feature + 'vc'].gt(threshold).astype(np.uint8)
            frame[feature + suffix] = (values * keep).astype(np.float32)

print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))

df_train.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/train_fe1_sum.csv", index=False)
df_test.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/test_fe1_sum.csv", index=False)

In [None]:
# Show the first rows of the training frame as the cell's output.
df_train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190sum3,var_191sum3,var_192sum3,var_193sum3,var_194sum3,var_195sum3,var_196sum3,var_197sum3,var_198sum3,var_199sum3
0,train_0,0.0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,0.0,3.9642,3.1364,0.0,18.522699,-2.3978,7.8784,8.5635,12.7803,-0.0
1,train_1,0.0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,0.0,0.0,0.0,0.0,8.7889,18.356001,0.0
2,train_2,0.0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,0.0,9.7905,1.6704,0.0,0.0,0.0,-0.0,8.2675,0.0,0.0
3,train_3,0.0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,0.0,0.0,0.7178,0.0,0.0,-1.2706,-2.9275,0.0,0.0,-0.0
4,train_4,0.0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-0.0,9.5214,-0.0,0.0,0.0,-1.5121,3.9267,9.5031,0.0,-0.0


In [None]:
## Magic Features - Categorical

# Read the raw train / test files afresh into their own frames for this section.
data = pd.read_csv('/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/train.csv')
etd = pd.read_csv('/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/test.csv')

not_used, cat_feat = [], []
target = "target"
# Every column of the training frame except the label.
features = [col for col in data.columns if col != target]

# Names of the 200 raw feature columns and of the derived columns built from them.
orig = ['var_%d' % idx for idx in range(200)]
has_one = [name + '_has_one' for name in orig]
has_zero = [name + '_has_zero' for name in orig]
not_u = [name + '_not_unique' for name in orig]

# Per feature, flag the test values that occur exactly once within the test set.
for f in orig:
    occurrences = etd[f].value_counts()
    singletons = occurrences.index[occurrences == 1]
    etd[f + '_u'] = etd[f].isin(singletons)

# A test row is kept as "real" when at least one of its values is a singleton.
singleton_flag_cols = [name + '_u' for name in orig]
etd['has_unique'] = etd[singleton_flag_cols].any(axis=1)
print(etd['has_unique'].sum())

# Reference frame for value counting: train rows plus the real test rows
# (raw feature columns only for the latter).
real_samples = etd.loc[etd['has_unique'], orig]
ref = pd.concat([data, real_samples], axis=0)
print(ref.shape)

# Row masks for the two classes; the target column is not modified below.
pos_rows = data[target] == 1
neg_rows = data[target] == 0

for f in orig:
    one_col = f + '_has_one'
    zero_col = f + '_has_zero'
    data[one_col] = 0
    data[zero_col] = 0

    pos_counts = data.loc[pos_rows, f].value_counts()
    neg_counts = data.loc[neg_rows, f].value_counts()

    # "shared": the value is carried by at least two rows of the class, so a row
    # of that class can find it on some other row of the same class.
    # "any": the value appears in the class at all.
    pos_shared = set(pos_counts.index[pos_counts > 1])
    pos_any = set(pos_counts.index[pos_counts > 0])
    neg_shared = set(neg_counts.index[neg_counts > 1])
    neg_any = set(neg_counts.index[neg_counts > 0])

    # has_one: does the value occur among target == 1 rows? Rows that are
    # themselves target == 1 are checked against the "shared" set.
    data.loc[pos_rows, one_col] = data.loc[pos_rows, f].isin(pos_shared).astype(int)
    data.loc[neg_rows, one_col] = data.loc[neg_rows, f].isin(pos_any).astype(int)

    # has_zero: the same question for target == 0 rows.
    data.loc[pos_rows, zero_col] = data.loc[pos_rows, f].isin(neg_any).astype(int)
    data.loc[neg_rows, zero_col] = data.loc[neg_rows, f].isin(neg_shared).astype(int)

# Fold both flags into one code stored in the *_has_one columns: 2 * has_one + has_zero.
data.loc[:, has_one] = 2 * data.loc[:, has_one].values + data.loc[:, has_zero].values

for f in orig:
    one_col = f + '_has_one'
    zero_col = f + '_has_zero'
    etd[one_col] = 0
    etd[zero_col] = 0

    # Distinct values the training data shows for each class of this feature.
    seen_with_one = data.loc[data[target] == 1, f].unique()
    seen_with_zero = data.loc[data[target] == 0, f].unique()

    # Test rows are simply checked for membership in those value sets.
    etd.loc[:, one_col] = etd[f].isin(seen_with_one).astype(int)
    etd.loc[:, zero_col] = etd[f].isin(seen_with_zero).astype(int)

# Same combined code as for the training frame: 2 * has_one + has_zero.
etd.loc[:, has_one] = 2 * etd.loc[:, has_one].values + etd.loc[:, has_zero].values

for f in orig:
    ref_counts = ref[f].value_counts()

    # Values whose count in the reference frame is anything other than one.
    repeated_values = ref_counts.index[ref_counts != 1]

    # Mean of the feature over the training frame; used as the fill value for
    # both train and test.
    train_mean = data[f].mean()

    # <f>_not_unique keeps the value where it is repeated, else the train mean.
    m_trd = data[f].isin(repeated_values)
    data[f + '_not_unique'] = m_trd * data[f] + (~m_trd) * train_mean

    m_etd = etd[f].isin(repeated_values)
    etd[f + '_not_unique'] = m_etd * etd[f] + (~m_etd) * train_mean

    # Rows whose value is not in the repeated set get the separate code 4.
    data.loc[~m_trd, f + '_has_one'] = 4
    etd.loc[~m_etd, f + '_has_one'] = 4

# Sanity check: distribution of the combined code for the first feature.
# A bare expression in the middle of a cell is evaluated and discarded (only a
# cell's last expression is displayed), so it has to be printed to be seen.
print(data['var_0_has_one'].value_counts())

print('Training set shape after creating magic features: {}'.format(data.shape))
print('Test set shape after creating magic features: {}'.format(etd.shape))

# Persist both frames. Note that etd still carries the helper columns created
# earlier in this cell (the per-feature *_u flags and has_unique).
data.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-categorical/train_fe1_categorical.csv", index=False)
etd.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-categorical/test_fe1_categorical.csv", index=False)

100000
(300000, 202)
Training set shape after creating magic features: (200000, 802)
Test set shape after creating magic features: (200000, 1002)
