In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# read data
data = pd.read_csv('Input/known_behaviors2.csv')
data

Unnamed: 0,customer_id,age,age_youngest_child,debt_equity,gender,bad_payment,gold_card,pension_plan,household_debt_to_equity_ratio,income,...,call_center_contacts,loan_accounts,number_products,number_transactions,non_worker_percentage,white_collar_percentage,rfm_score,Mortgage,Pension,Savings
0,15,45,12,45,0,0,0,0,65,13453,...,0,4,2,1,14,19,7.602,0,0,0
1,16,43,12,43,0,0,0,0,65,13453,...,0,0,3,2,14,19,10.143,0,0,0
2,30,23,0,23,0,0,0,0,65,13453,...,0,1,0,0,14,19,0.000,0,0,0
3,42,35,8,35,1,0,0,0,65,13453,...,0,1,0,0,14,19,0.000,0,1,0
4,52,43,12,43,1,0,0,0,47,14124,...,3,1,0,0,16,35,0.000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11018,53170,36,3,36,1,0,0,0,53,51854,...,2,0,0,0,18,27,0.000,0,1,0
11019,53173,42,9,42,0,0,0,0,55,51941,...,2,3,0,0,11,32,0.000,0,1,1
11020,53179,39,11,39,1,0,0,0,66,51858,...,0,0,0,13,11,21,19.369,0,0,1
11021,53181,55,29,55,0,0,0,0,60,51943,...,1,0,0,0,10,28,0.000,0,0,0


In [3]:
# create a new column to combine Mortgage, Pension, and Savings
data['concatenated'] = data['Mortgage'].astype(str) + data['Pension'].astype(str) + data['Savings'].astype(str)
data

Unnamed: 0,customer_id,age,age_youngest_child,debt_equity,gender,bad_payment,gold_card,pension_plan,household_debt_to_equity_ratio,income,...,loan_accounts,number_products,number_transactions,non_worker_percentage,white_collar_percentage,rfm_score,Mortgage,Pension,Savings,concatenated
0,15,45,12,45,0,0,0,0,65,13453,...,4,2,1,14,19,7.602,0,0,0,000
1,16,43,12,43,0,0,0,0,65,13453,...,0,3,2,14,19,10.143,0,0,0,000
2,30,23,0,23,0,0,0,0,65,13453,...,1,0,0,14,19,0.000,0,0,0,000
3,42,35,8,35,1,0,0,0,65,13453,...,1,0,0,14,19,0.000,0,1,0,010
4,52,43,12,43,1,0,0,0,47,14124,...,1,0,0,16,35,0.000,0,1,0,010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11018,53170,36,3,36,1,0,0,0,53,51854,...,0,0,0,18,27,0.000,0,1,0,010
11019,53173,42,9,42,0,0,0,0,55,51941,...,3,0,0,11,32,0.000,0,1,1,011
11020,53179,39,11,39,1,0,0,0,66,51858,...,0,0,13,11,21,19.369,0,0,1,001
11021,53181,55,29,55,0,0,0,0,60,51943,...,0,0,0,10,28,0.000,0,0,0,000


In [4]:
data.concatenated.value_counts(normalize=True)

concatenated
000    0.404064
001    0.252654
010    0.118298
100    0.075297
011    0.074027
101    0.045178
110    0.019323
111    0.011158
Name: proportion, dtype: float64

In [5]:
#N = 1000
#data = data.groupby('concatenated', group_keys=False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(data))))).sample(frac=1).reset_index(drop=True)
#data.concatenated.value_counts(normalize=True)

In [6]:
# Perform stratified split to split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["concatenated"]),  # Use the features without the "concatenated" column
    data["concatenated"],  # The target variable
    test_size=0.2,
    stratify=data["concatenated"]
)

In [7]:
# check the the distribution of the target variable in the train and test sets
print("Train set distribution:")
print(y_train.value_counts(normalize=True))
print()
print("Test set distribution:")
print(y_test.value_counts(normalize=True))

Train set distribution:
concatenated
000    0.404060
001    0.252665
010    0.118281
100    0.075301
011    0.074053
101    0.045135
110    0.019279
111    0.011227
Name: proportion, dtype: float64

Test set distribution:
concatenated
000    0.404082
001    0.252608
010    0.118367
100    0.075283
011    0.073923
101    0.045351
110    0.019501
111    0.010884
Name: proportion, dtype: float64


In [8]:
# reset the value of y
y_train = X_train.loc[:,['Mortgage', 'Pension', 'Savings']]
y_test = X_test.loc[:,['Mortgage', 'Pension', 'Savings']]
# drop the columns from X_train and X_test
X_train = X_train.drop(columns=['customer_id', 'Mortgage', 'Pension', 'Savings'])
X_test = X_test.drop(columns=['customer_id', 'Mortgage', 'Pension', 'Savings'])
# get the column names of the features
X_columns = X_train.columns

In [9]:
# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# turn the data into a dataframe with the column names
X_train = pd.DataFrame(X_train, columns=X_columns)
X_test = pd.DataFrame(X_test, columns=X_columns)

In [10]:
X_train

Unnamed: 0,age,age_youngest_child,debt_equity,gender,bad_payment,gold_card,pension_plan,household_debt_to_equity_ratio,income,members_in_household,months_current_account,months_customer,call_center_contacts,loan_accounts,number_products,number_transactions,non_worker_percentage,white_collar_percentage,rfm_score
0,-0.209025,0.088242,-0.209025,1.021784,-0.05438,-0.177725,-0.07702,-1.191930,-0.708545,0.398425,-1.100205,-1.064396,1.117583,0.656470,0.222585,0.008422,0.011183,1.239813,0.326410
1,0.656889,0.969800,0.656889,-0.978680,-0.05438,-0.177725,-0.07702,0.259330,-1.048210,-1.311494,-0.201499,-0.237377,-0.158208,2.111478,1.348663,0.008422,0.274139,-0.458193,0.699145
2,0.137340,0.480046,0.137340,1.021784,-0.05438,-0.177725,-0.07702,-0.673623,0.251026,1.538372,-0.426175,-0.237377,0.479688,1.383974,0.785624,-0.208086,-0.514730,0.900211,0.780277
3,-0.641982,-0.891266,-0.641982,-0.978680,-0.05438,-0.177725,-0.07702,1.606928,0.368440,-1.311494,1.221452,1.416663,-1.433999,-0.798538,-0.903493,-0.424593,-0.251774,-1.590197,-0.867649
4,-0.815165,-0.695364,-0.815165,1.021784,-0.05438,-0.177725,-0.07702,-0.777284,0.160493,-0.741521,-1.998911,-1.891416,0.479688,-0.798538,-0.903493,-0.424593,-0.514730,1.013412,-0.867649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8813,-0.122434,-0.009708,-0.122434,-0.978680,-0.05438,5.626656,-0.07702,0.155668,-0.506550,0.398425,0.622315,0.589643,-0.158208,-0.071034,1.348663,-0.208086,0.800052,-0.458193,0.289872
8814,0.137340,-0.205610,0.137340,1.021784,-0.05438,-0.177725,-0.07702,0.570314,0.817551,0.398425,1.521021,1.416663,-0.796104,-0.798538,1.911702,0.224929,0.537096,-0.910995,0.500204
8815,-0.555391,-0.499463,-0.555391,1.021784,-0.05438,-0.177725,-0.07702,-0.673623,0.876690,-0.171548,1.371237,1.416663,0.479688,0.656470,0.785624,0.008422,3.429617,-0.797794,0.306127
8816,-0.555391,-0.695364,-0.555391,-0.978680,-0.05438,-0.177725,-0.07702,0.052007,-1.458536,0.968399,-1.025313,-1.064396,-0.158208,-0.071034,-0.903493,-0.424593,-1.040643,0.334209,-0.867649


In [11]:
# save the data
X_train.to_csv('Preprocessed/X_train_scaled.csv', index=False)
X_test.to_csv('Preprocessed/X_test_scaled.csv', index=False)
y_train.to_csv('Preprocessed/y_train.csv', index=False)
y_test.to_csv('Preprocessed/y_test.csv', index=False)