## CONVERTENDO ATRIBUTOS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Locations of the cleaned credit-score CSV files (relative to the notebook).
train_path = 'data/credit-score/clean_train.csv'
test_path  = 'data/credit-score/clean_test.csv'

# Read both splits into DataFrames in one pass.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [3]:
# Convert both DataFrames to NumPy arrays so that later cells can address
# columns by position.
train_data, test_data = train_df.to_numpy(), test_df.to_numpy()

# Last expression of the cell: display the training matrix.
train_data

array([[0, 'CUS_0xd40', 23, ..., 'High_spent_Small_value_payments',
        312.49408867943663, 'Good'],
       [1, 'CUS_0xd40', 23, ..., 'Low_spent_Large_value_payments',
        284.62916249607184, 'Good'],
       [2, 'CUS_0xd40', 23, ..., 'Low_spent_Medium_value_payments',
        331.2098628537912, 'Good'],
       ...,
       [99997, 'CUS_0x942c', 25, ..., 'High_spent_Large_value_payments',
        395.684889347144, 'Poor'],
       [99998, 'CUS_0x942c', 25, ..., 'Low_spent_Large_value_payments',
        395.684889347144, 'Standard'],
       [99999, 'CUS_0x942c', 25, ..., 'High_spent_Large_value_payments',
        395.684889347144, 'Poor']], dtype=object)

In [4]:
# Remove the columns that will not be used for classification.
# Positional indices: 0 = unnamed index column, 1 = Customer_ID, 9 = Type_of_Loan.
drop_cols = [0, 1, 9]

# NOTE: both arrays are overwritten with their reduced versions, so running
# this cell a second time would delete three more columns.
train_data, test_data = (
    np.delete(split, drop_cols, axis=1) for split in (train_data, test_data)
)

In [5]:
# Split the training matrix: the last column is the target, every other
# column is a feature.
train_x, train_y = train_data[:, :-1], train_data[:, -1]

# For the test matrix only the features are kept; no test target is extracted.
# NOTE(review): this discards the last column of test_data. Confirm that the
# test file really carries a trailing target column; if it does not, a feature
# column is being dropped here.
test_x = test_data[:, :-1]

In [6]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# --- Target -----------------------------------------------------------------
# Encode the categorical target as one numeric column. OrdinalEncoder expects
# a 2-D input, hence the reshape to (n_samples, 1); the result stays 2-D.
train_y = OrdinalEncoder().fit_transform(np.reshape(train_y, (-1, 1)))
print(train_y)

# --- Features ---------------------------------------------------------------
# Positions (within train_x / test_x) of the categorical features to one-hot
# encode: Occupation, Credit_Mix and Payment_Behaviour, in that order.
code_cols = [1, 11, 17]
print(train_x[:, code_cols])

# Idea: it might be worth grouping Occupation into broader areas (STEM,
# humanities, etc.) to reduce the number of columns the encoding creates.

# Fit ONE encoder on the training split and reuse it for the test split, so
# both splits share exactly the same category -> column mapping and the same
# number of encoded columns. Fitting a separate encoder on the test split
# would derive the categories (and the one removed by drop='first') from the
# test data, which can silently misalign the columns between the two files.
# With the default settings, transform() raises if the test split contains a
# category that was never seen during fit.
feature_encoder = OneHotEncoder(drop='first')
train_onehot = feature_encoder.fit_transform(train_x[:, code_cols]).toarray()
test_onehot = feature_encoder.transform(test_x[:, code_cols]).toarray()

# Replace the raw categorical columns by their one-hot expansion, appended
# after the remaining columns (Occupation, then Credit_Mix, then
# Payment_Behaviour).
train_x = np.append(np.delete(train_x, obj=code_cols, axis=1), train_onehot, axis=1)
test_x = np.append(np.delete(test_x, obj=code_cols, axis=1), test_onehot, axis=1)

print(train_x)
print(test_x)

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [2.]
 [1.]]
[['Scientist' 'Good' 'High_spent_Small_value_payments']
 ['Scientist' 'Good' 'Low_spent_Large_value_payments']
 ['Scientist' 'Good' 'Low_spent_Medium_value_payments']
 ...
 ['Mechanic' 'Good' 'High_spent_Large_value_payments']
 ['Mechanic' 'Good' 'Low_spent_Large_value_payments']
 ['Mechanic' 'Good' 'High_spent_Large_value_payments']]
[[23 1824.8433333333328 3 ... 0.0 0.0 0.0]
 [23 1824.8433333333328 3 ... 1.0 0.0 0.0]
 [23 1824.8433333333328 3 ... 0.0 1.0 0.0]
 ...
 [25 3359.415833333333 4 ... 0.0 0.0 0.0]
 [25 3359.415833333333 4 ... 1.0 0.0 0.0]
 [25 3359.415833333333 4 ... 0.0 0.0 0.0]]
[[23 1824.8433333333328 3 ... 0.0 0.0 1.0]
 [23 1824.8433333333328 3 ... 0.0 0.0 0.0]
 [23 1824.8433333333328 3 ... 0.0 1.0 0.0]
 ...
 [25 3359.415833333333 4 ... 1.0 0.0 0.0]
 [25 3359.415833333333 4 ... 0.0 0.0 0.0]
 [25 3359.415833333333 4 ... 0.0 1.0 0.0]]


In [7]:
# Persist the prepared train and test matrices as CSV files.

save_dir = 'data/credit-score'

# Training file layout: feature columns followed by the target as the last
# column. Joining along axis=1 requires train_y to be 2-D, like train_x.
train_save = np.concatenate((train_x, train_y), axis=1)

train_out = pd.DataFrame(train_save)
test_out = pd.DataFrame(test_x)

# Both files are written without a header row and without an index column.
train_out.to_csv(f'{save_dir}/ready_train.csv', header=None, index=None)
test_out.to_csv(f'{save_dir}/ready_test.csv', header=None, index=None)