In [1]:
## Importing libraries

import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive; every CSV path used below lives under
# /content/drive/MyDrive. force_remount=True asks Colab for a fresh mount.
drive.mount('/content/drive', force_remount=True)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including pandas diagnostics such as SettingWithCopyWarning and
# PerformanceWarning - consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
##### The code below was inspired by the following sources:
######### 1/ Raghuram Nandepu ("https://medium.com/analytics-vidhya/santander-customer-transaction-prediction-an-end-to-end-machine-learning-project-2cb763172f8a") - primarily the "Sum" section
######### 2/ YAG320 ("https://www.kaggle.com/code/yag320/list-of-fake-samples-and-public-private-lb-split/notebook") - primarily the separation of fake and real samples in the test set

### Binary

In [None]:
## Importing data

# Raw train / test CSVs as stored on the mounted Drive.
df_train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/train.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/test.csv"
)

# Every train column whose name begins with "var" is treated as a raw feature.
features = list(filter(lambda name: name.startswith("var"), df_train.columns))

In [None]:
## Fake vs. Real data
#
# A test row is kept as "real" when at least one of its feature values occurs
# exactly once in that feature's column of the test set; rows without any such
# value are collected as synthetic ("fake") samples.

test = df_test.drop(['ID_code'], axis=1).values

# Mask of shape (rows, features): True where the value is unique in its column.
# A bool mask (one byte per cell) is enough here; the original allocated an
# array with the data's own dtype just to hold 0/1 flags.
unique_count = np.zeros(test.shape, dtype=bool)

for feature in range(test.shape[1]):
    # index -> row of the first occurrence of each distinct value,
    # count -> how many times that value appears in the column.
    _, index, count = np.unique(test[:, feature], return_counts=True, return_index=True)
    unique_count[index[count == 1], feature] = True

# Reduce across features once and derive both index arrays from the same mask.
has_unique_value = unique_count.any(axis=1)
real_samples = np.flatnonzero(has_unique_value)
synth_samples = np.flatnonzero(~has_unique_value)

# Reference population for the value counts below: all train rows plus the
# real test rows (synthetic rows are excluded).
df_all = pd.concat([df_train, df_test.iloc[real_samples]])

In [None]:
## Magic Features - Binary
#
# For each raw feature a "<feature>_unique" flag is added:
#   1 -> the row's value occurs exactly once within df_all (train + real test)
#   0 -> otherwise; synthetic test rows always receive 0.

df_all_copy = df_all.copy()
# Explicit copy: columns are added to this frame below, so it must not be a
# slice of df_test (the original relied on chained assignment to a slice).
test_fake = df_test.iloc[synth_samples].copy()

for feature in features:
    column = df_all_copy[feature]
    # Vectorised lookup of each value's frequency instead of a per-element
    # Python lambda with a dict lookup. A NaN value maps to NaN here and is
    # flagged 0, where the dict lookup would have raised KeyError.
    occurrences = column.map(column.value_counts())
    # Assign as a plain array so the write is positional; df_all_copy carries
    # repeated index labels (train and test row numbers overlap).
    df_all_copy[feature + "_unique"] = occurrences.eq(1).astype(np.int64).to_numpy()
    test_fake[feature + "_unique"] = 0

# Split the combined frame back into train / real test via the ID prefix.
df_train = df_all_copy[df_all_copy["ID_code"].str.contains("train")].copy()
# Test rows carry no label, so the (all-missing) target column is dropped.
test_real = df_all_copy[df_all_copy["ID_code"].str.contains("test")].drop(columns=["target"])
# Re-interleave real and synthetic test rows in their original row order.
df_test = pd.concat([test_real, test_fake], sort=False).sort_index()

print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))


df_test.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-binary/test_fe1_binary.csv", index=False)
df_train.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-binary/train_fe1_binary.csv", index=False)

Training set shape after creating magic features: (200000, 402)
Test set shape after creating magic features: (200000, 401)


### Sum

In [None]:
## Importing data

# Reload the untouched raw files so this section starts from clean frames.
df_train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/train.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/test.csv"
)

# Raw feature names: the train columns whose name begins with "var".
features = list(filter(lambda name: name.startswith("var"), df_train.columns))

In [None]:
## Fake vs. Real data
#
# A test row is kept as "real" when at least one of its feature values occurs
# exactly once in that feature's column of the test set; rows without any such
# value are collected as synthetic ("fake") samples.

test = df_test.drop(['ID_code'], axis=1).values

# Mask of shape (rows, features): True where the value is unique in its column.
# A bool mask (one byte per cell) is enough here; the original allocated an
# array with the data's own dtype just to hold 0/1 flags.
unique_count = np.zeros(test.shape, dtype=bool)

for feature in range(test.shape[1]):
    # index -> row of the first occurrence of each distinct value,
    # count -> how many times that value appears in the column.
    _, index, count = np.unique(test[:, feature], return_counts=True, return_index=True)
    unique_count[index[count == 1], feature] = True

# Reduce across features once and derive both index arrays from the same mask.
has_unique_value = unique_count.any(axis=1)
real_samples = np.flatnonzero(has_unique_value)
synth_samples = np.flatnonzero(~has_unique_value)

# Reference population for the value counts below: all train rows plus the
# real test rows (synthetic rows are excluded).
df_all = pd.concat([df_train, df_test.iloc[real_samples]])

In [None]:
## Magic Features - Sum
#
# For every raw feature, four columns are appended to df_train and df_test:
#   <feature>vc   - how often the row's value occurs in df_all, capped at 10
#   <feature>sum  - (value - df_all mean) multiplied by 1 if vc > 1 else 0
#   <feature>sum2 - value multiplied by 1 if vc > 2 else 0
#   <feature>sum3 - value multiplied by 1 if vc > 4 else 0
# Columns are appended grouped by kind: all vc, then all sum, sum2, sum3.
#
# df_all is only read here, so it is used directly instead of a deep copy.

frames = {'train': df_train, 'test': df_test}
suffixes = ('vc', 'sum', 'sum2', 'sum3')
new_columns = {name: {suffix: {} for suffix in suffixes} for name in frames}

for feature in features:
    # Statistics of the reference population, computed once per feature and
    # shared by the train and test frames.
    counts = df_all[feature].value_counts(dropna=True)
    mean = df_all[feature].mean()

    for name, frame in frames.items():
        values = frame[feature]
        # A value absent from df_all maps to NaN; count it as 0 occurrences.
        # (The previous min(10, x) lambda turned NaN into 10, the maximum.)
        vc = values.map(counts).fillna(0).clip(upper=10).astype(np.uint8)

        columns = new_columns[name]
        columns['vc'][feature + 'vc'] = vc
        # Multiplying by a 0/1 integer mask zeroes the rows below the threshold.
        columns['sum'][feature + 'sum'] = ((values - mean) * (vc > 1).astype(np.int64)).astype(np.float32)
        columns['sum2'][feature + 'sum2'] = (values * (vc > 2).astype(np.int64)).astype(np.float32)
        columns['sum3'][feature + 'sum3'] = (values * (vc > 4).astype(np.int64)).astype(np.float32)

# Attach all new columns in a single concat per frame rather than inserting
# 800 columns one at a time, which fragments the DataFrame.
df_train = pd.concat(
    [df_train] + [pd.DataFrame(new_columns['train'][suffix]) for suffix in suffixes], axis=1
)
df_test = pd.concat(
    [df_test] + [pd.DataFrame(new_columns['test'][suffix]) for suffix in suffixes], axis=1
)

print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))

df_train.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/train_fe1_sum.csv", index=False)
df_test.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/test_fe1_sum.csv", index=False)

Training set shape after creating magic features: (200000, 1002)
Test set shape after creating magic features: (200000, 1001)


## Sum for LGBM

In [2]:
# Reload the untouched raw files so this section starts from clean frames.
df_train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/train.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data Science Project - Team D/data/raw-data/test.csv"
)

In [3]:
## Fake vs. Real data
#
# A test row is kept as "real" when at least one of its feature values occurs
# exactly once in that feature's column of the test set; rows without any such
# value are collected as synthetic ("fake") samples.

test = df_test.drop(['ID_code'], axis=1).values

# Mask of shape (rows, features): True where the value is unique in its column.
# A bool mask (one byte per cell) is enough here; the original allocated an
# array with the data's own dtype just to hold 0/1 flags.
unique_count = np.zeros(test.shape, dtype=bool)

for feature in range(test.shape[1]):
    # index -> row of the first occurrence of each distinct value,
    # count -> how many times that value appears in the column.
    _, index, count = np.unique(test[:, feature], return_counts=True, return_index=True)
    unique_count[index[count == 1], feature] = True

# Reduce across features once and derive both index arrays from the same mask.
has_unique_value = unique_count.any(axis=1)
real_samples = np.flatnonzero(has_unique_value)
synth_samples = np.flatnonzero(~has_unique_value)

In [4]:
## Magic Features - Sum (LGBM variant)
#
# This cell only builds and saves features; no model is trained here.
# For every raw feature, four columns are appended to df_train and df_test,
# interleaved per feature (vc, sum, sum2, sum3 for var_0, then var_1, ...):
#   <feature>vc   - how often the row's value occurs in df_all, capped at 10
#   <feature>sum  - (value - df_all mean) multiplied by 1 if vc > 1 else 0
#   <feature>sum2 - value multiplied by 1 if vc > 2 else 0
#   <feature>sum3 - value multiplied by 1 if vc > 4 else 0

features = [col for col in df_train.columns if col.startswith('var')]
# Reference population: all train rows plus the real (non-synthetic) test rows.
df_all = pd.concat([df_train, df_test.iloc[real_samples]])

train_columns, test_columns = {}, {}

for feature in features:
    # Statistics of the reference population, computed once per feature and
    # shared by the train and test frames.
    counts = df_all[feature].value_counts(dropna=True)
    mean = df_all[feature].mean()

    for frame, columns in ((df_train, train_columns), (df_test, test_columns)):
        values = frame[feature]
        # A value absent from df_all maps to NaN; count it as 0 occurrences.
        # (The previous min(10, x) lambda turned NaN into 10, the maximum.)
        vc = values.map(counts).fillna(0).clip(upper=10).astype(np.uint8)

        columns[feature + 'vc'] = vc
        # Multiplying by a 0/1 integer mask zeroes the rows below the threshold.
        columns[feature + 'sum'] = ((values - mean) * (vc > 1).astype(np.int64)).astype(np.float32)
        columns[feature + 'sum2'] = (values * (vc > 2).astype(np.int64)).astype(np.float32)
        columns[feature + 'sum3'] = (values * (vc > 4).astype(np.int64)).astype(np.float32)

# Attach all new columns in a single concat per frame rather than inserting
# 800 columns one at a time, which fragments the DataFrame.
df_train = pd.concat([df_train, pd.DataFrame(train_columns)], axis=1)
df_test = pd.concat([df_test, pd.DataFrame(test_columns)], axis=1)

print('Training set shape after creating magic features: {}'.format(df_train.shape))
print('Test set shape after creating magic features: {}'.format(df_test.shape))

df_train.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/train_fe1_sum_lgbm.csv", index=False)
df_test.to_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/test_fe1_sum_lgbm.csv", index=False)

Training set shape after creating magic features: (200000, 1002)
Test set shape after creating magic features: (200000, 1001)
