## In this notebook, we will create the golden test size to prevent data leakage

In this dataset, there are not much preprocessing that is needed and since the columns are anonymized, we cannot do much feature engineering on the data except for feature crosses/polynomial features

In [24]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
os.chdir("..")

In [25]:
from src.transform import generate_golden_test
import pandas as pd

#sklearn libraries
from sklearn.preprocessing import PolynomialFeatures


In [28]:
generate_golden_test("data/creditcard.csv", "Class", "data")

In [27]:
## Checking split
train_df=pd.read_csv("data/train.csv", index_col=0)
golden_df=pd.read_csv("data/golden.csv", index_col=0)

In [16]:
print(f"Training Set: {len(train_df)} Percentage {len(train_df)/(len(train_df)+len(golden_df))}")
print(train_df["Class"].value_counts())
print(f"Percentage fraud {train_df['Class'].sum()/len(train_df)}")
print()
print(f"Golden Set: {len(train_df)} Percentage {len(golden_df)/(len(train_df)+len(golden_df))}")
print(golden_df["Class"].value_counts())
print(f"Percentage fraud {golden_df['Class'].sum()/len(golden_df)}")


Training Set: 242085 Percentage 0.8499966644078271
0    241667
1       418
Name: Class, dtype: int64
Percentage fraud 0.0017266662535886156

Golden Set: 242085 Percentage 0.15000333559217294
0    42648
1       74
Name: Class, dtype: int64
Percentage fraud 0.0017321286456626562


## Feature cross

In [8]:
train_df

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
152194.0,2.082004,-0.818569,-1.135807,-0.173036,-0.703332,-0.669019,-0.458621,-0.101388,-0.103272,0.864830,...,-0.150079,0.286583,0.051542,0.000384,0.179212,-0.070597,0.009024,-0.067930,11.50,0
163116.0,1.961180,-0.132630,-1.204767,0.517382,-0.064926,-1.186268,0.369890,-0.391319,0.697708,-0.152840,...,-0.182646,-0.393861,0.192813,0.045326,-0.008818,-0.514709,-0.017378,-0.043669,48.62,0
159924.0,-1.748416,1.099935,-0.456043,-0.500196,1.140714,0.133756,0.715703,-0.180735,-0.109877,0.536876,...,0.291384,0.965511,-0.323269,-1.387123,0.421327,0.041559,-1.224216,-0.040818,2.99,0
61817.0,0.794243,-1.688863,1.341493,-0.115494,-2.018830,0.272800,-1.087525,0.160009,0.016777,0.324680,...,0.315605,0.672284,-0.272175,0.636383,0.338395,-0.178151,0.032779,0.063393,250.00,0
76825.0,-0.416285,0.588658,1.185738,-2.091479,0.033349,-1.322596,1.001690,-0.247473,0.724273,-0.949287,...,0.113082,0.615876,-0.270214,0.511572,-0.020916,-0.865917,0.228222,-0.024121,1.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147710.0,1.993864,-0.516866,-0.620118,0.129845,-0.285128,0.395044,-0.822358,0.231591,0.995898,0.212619,...,0.262526,0.884510,0.099141,0.275689,-0.195404,0.623598,-0.032455,-0.058552,5.99,0
60764.0,-1.497933,0.657921,1.581568,-0.024286,0.584698,1.303031,0.609212,0.135561,0.452745,0.108640,...,-0.072452,0.299172,0.110048,-0.615980,-0.425883,0.263968,-0.448445,0.045178,36.99,0
35301.0,1.069777,0.072105,0.496540,1.505318,-0.380277,-0.370243,0.100551,-0.026687,0.319684,-0.131553,...,-0.061991,-0.044629,-0.050485,0.400171,0.593314,-0.335160,0.031014,0.024886,45.42,0
24413.0,1.280465,0.300586,0.333044,0.512720,0.065052,-0.145844,-0.145519,-0.091833,1.111187,-0.268878,...,-0.409934,-0.961351,0.033153,-0.560429,0.278428,0.089546,-0.059835,-0.005887,0.89,0


## Featurecross

In [37]:
generate_golden_test("data/creditcard.csv", "Class", "data", feature_cross=True, train_filename="train_feature_cross.csv", golden_filename="golden_feature_cross.csv")

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
