In [1]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd

In [2]:
# Read the tab-separated training file and discard the 'INDEX' and
# 'IND_BOM_1_2' columns in one chained expression, then preview two rows.
df = (
    pd.read_csv('TRNcod.csv', sep='\t')
    .drop(columns=['INDEX', 'IND_BOM_1_2'])
)
df.head(2)

Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,NIVEL_RELACIONAMENTO_CREDITO01,...,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1
0,1,1,1,0,0,0,0,0.135098,1,0.222222,...,0,0,0,1,1,0,1,1,1,0
1,1,0,1,0,0,1,0,0.273504,1,0.111111,...,0,0,1,0,1,1,0,0,0,1


### **Splitting data**

In [3]:
# Separate the feature columns from the target column 'IND_BOM_1_1'.
X = df.drop(columns=['IND_BOM_1_1'])
y = df['IND_BOM_1_1']

# Stratified 67/33 train/validation split. A fixed random_state makes the
# partition (and every file written from it later in the notebook)
# reproducible under Restart & Run All; 42 matches the seed this notebook
# already uses for resampling and shuffling.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.67,
    stratify=y,
    random_state=42,
)

In [4]:
# Re-attach the target column to each feature split, side by side, so that
# the training and validation frames each carry features and label together.
train_df = pd.concat(
    [X_train, y_train],
    axis='columns',
)
valid_df = pd.concat(
    [X_valid, y_valid],
    axis='columns',
)

In [5]:
# Report what fraction of the full dataset landed in each split
# (rounded to two decimals).
train_share = round(len(train_df) / len(df), 2)
valid_share = round(len(valid_df) / len(df), 2)
print(f'training data: {train_share}')
print(f'valid data: {valid_share}')

training data: 0.67
valid data: 0.33


### **Class Balance**

In [6]:
# Relative frequency of each target value in the training split.
train_df.loc[:, 'IND_BOM_1_1'].value_counts(normalize=True)

1    0.655447
0    0.344553
Name: IND_BOM_1_1, dtype: float64

In [7]:
# Partition the training rows by target value (0 vs. 1).
target = train_df['IND_BOM_1_1']
train_class0 = train_df.loc[target.eq(0)]
train_class1 = train_df.loc[target.eq(1)]

In [8]:
# Draw, without replacement, as many class-1 rows as there are class-0 rows,
# then stack both groups into a single frame with equal class counts.
train_class1_downsampled = resample(
    train_class1,
    replace=False,
    n_samples=len(train_class0),
    random_state=42,
)
train_df_downsampled = pd.concat([train_class0, train_class1_downsampled])

In [9]:
# Shuffle the rows: sampling with frac=1 returns every row in random order,
# and the fixed seed keeps that order repeatable.
train_df_downsampled = train_df_downsampled.sample(
    frac=1,
    random_state=42,
)

In [10]:
# Relative frequency of each target value after downsampling.
train_df_downsampled.loc[:, 'IND_BOM_1_1'].value_counts(normalize=True)

1    0.5
0    0.5
Name: IND_BOM_1_1, dtype: float64

In [11]:
# Number of rows in the downsampled training frame.
train_df_downsampled.shape[0]

179692

In [12]:
# Number of rows in the validation frame.
valid_df.shape[0]

128435

In [13]:
# Write the downsampled training frame and the validation frame to disk,
# omitting the DataFrame index from both files.
for frame, path in (
    (train_df_downsampled, 'training.csv'),
    (valid_df, 'validation.csv'),
):
    frame.to_csv(path, index=False)

### **Handling test data**

In [19]:
# Read the test file and discard the 'INDEX' and 'IND_BOM_1_2' columns
# (the same two columns dropped from the training data), then preview two rows.
df_test = (
    pd.read_csv('TST.csv')
    .drop(columns=['INDEX', 'IND_BOM_1_2'])
)
df_test.head(2)

Unnamed: 0,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,NIVEL_RELACIONAMENTO_CREDITO01,...,CEP4_6,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1
0,1,1,1,0,0,0,0,0.898745,1,0.111111,...,1,0,1,1,1,0,0,0,0,1
1,1,1,0,1,0,0,0,0.847404,1,0.111111,...,0,0,0,0,1,1,1,0,0,1


In [20]:
# Persist the held-out test set loaded from 'TST.csv'. The validation frame is
# already saved as 'validation.csv'; writing it here again would make
# 'test.csv' a copy of the validation split and leave the real test data
# unsaved.
df_test.to_csv('test.csv', index=False)