# Exploratory Data Analysis Part 1: Data Selection

In [55]:
# Libraries importing
import pandas as pd

In [56]:
# Load the PAKDD 2010 modeling (training) data from its tab-separated .txt file.
# The file is read without a header row, so pandas assigns integer column
# labels; the first column of the file becomes the dataframe index.
# NOTE(review): the recorded run of this cell emitted a warning on the
# read_csv call -- presumably pandas' DtypeWarning about mixed-type columns
# (confirm). low_memory=False makes pandas parse the file in a single pass so
# every column gets one dtype inferred from all of its rows.
data = pd.read_csv('../data/PAKDD2010_Modeling_Data.txt',
                   header=None,
                   delimiter='\t',
                   encoding='ISO-8859-1',
                   index_col=0,
                   low_memory=False
                   )

  data = pd.read_csv('../data/PAKDD2010_Modeling_Data.txt',


In [57]:
# Load the variables-list spreadsheet; its Var_Title column is what later
# cells use to name the unlabeled columns of the data files.
var_names = pd.read_excel(io='../data/PAKDD2010_VariablesList.XLS')

In [58]:
# Tally repeated titles: True marks an entry whose Var_Title already appeared
# earlier in the list, False marks a first occurrence.
var_names.loc[:, 'Var_Title'].duplicated().value_counts()

False    53
True      1
Name: Var_Title, dtype: int64

In [59]:
# Build the list of column names for the data files from Var_Title, making
# every name unique: the first occurrence keeps its title and each later
# repeat gets a numeric suffix (_2, _3, ...).
# The first title (ID_CLIENT) is skipped because that column is used as the
# dataframe index.
# Occurrences are tallied per original title; counting matches inside the
# already-renamed list would hand a 3rd repeat the same "_2" suffix as the
# 2nd, because renamed entries no longer equal the bare title.
title_counts = {}
cols = []
for column in var_names['Var_Title'][1:]:
    title_counts[column] = title_counts.get(column, 0) + 1
    occurrence = title_counts[column]
    cols.append(column if occurrence == 1 else f'{column}_{occurrence}')

In [60]:
# Set the de-duplicated Var_Title names as the column names of our training
# dataset (replacing the integer labels produced by header=None).
data.columns = cols

In [61]:
# Number of rows (samples) in the training data
len(data)

50000

In [62]:
# Missing values per column (isnull is pandas' alias of isna)
data.isnull().sum()

CLERK_TYPE                            0
PAYMENT_DAY                           0
APPLICATION_SUBMISSION_TYPE           0
QUANT_ADDITIONAL_CARDS                0
POSTAL_ADDRESS_TYPE                   0
SEX                                   0
MARITAL_STATUS                        0
QUANT_DEPENDANTS                      0
EDUCATION_LEVEL                       0
STATE_OF_BIRTH                        0
CITY_OF_BIRTH                         0
NACIONALITY                           0
RESIDENCIAL_STATE                     0
RESIDENCIAL_CITY                      0
RESIDENCIAL_BOROUGH                   0
FLAG_RESIDENCIAL_PHONE                0
RESIDENCIAL_PHONE_AREA_CODE           0
RESIDENCE_TYPE                     1349
MONTHS_IN_RESIDENCE                3777
FLAG_MOBILE_PHONE                     0
FLAG_EMAIL                            0
PERSONAL_MONTHLY_INCOME               0
OTHER_INCOMES                         0
FLAG_VISA                             0
FLAG_MASTERCARD                       0


In [63]:
# Collect every column in which more than half of the rows are NaN; these
# are the first candidates to be dropped.
cols_to_drop = (
    data.isna()
        .sum()
        .loc[lambda na_per_col: na_per_col > data.shape[0] // 2]
        .index
        .tolist()
)
cols_to_drop

['PROFESSIONAL_CITY',
 'PROFESSIONAL_BOROUGH',
 'MATE_PROFESSION_CODE',
 'EDUCATION_LEVEL_2']

In [64]:
# Columns holding a single distinct value add nothing to the analysis, so
# append them to cols_to_drop as well (skipping any that are already listed).
for col in data.columns:
    if col in cols_to_drop:
        continue
    if data[col].nunique(dropna=False) == 1:
        cols_to_drop.append(col)

In [65]:
# Report the complete drop list (mostly-NaN columns plus constant columns)
print(
    'The columns to drop before correlation analysis are',
    cols_to_drop,
)

The columns to drop before correlation analysis are ['PROFESSIONAL_CITY', 'PROFESSIONAL_BOROUGH', 'MATE_PROFESSION_CODE', 'EDUCATION_LEVEL_2', 'CLERK_TYPE', 'QUANT_ADDITIONAL_CARDS', 'EDUCATION_LEVEL', 'FLAG_MOBILE_PHONE', 'FLAG_HOME_ADDRESS_DOCUMENT', 'FLAG_RG', 'FLAG_CPF', 'FLAG_INCOME_PROOF', 'FLAG_ACSP_RECORD']


In [66]:
# Remove the flagged columns; `data` itself is left untouched and the reduced
# frame is displayed for inspection.
train_data = data.drop(cols_to_drop, axis='columns')
train_data

Unnamed: 0_level_0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,...,FLAG_PROFESSIONAL_PHONE,PROFESSIONAL_PHONE_AREA_CODE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3,TARGET_LABEL_BAD=1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,Web,1,F,6,1,RN,Assu,1,RN,...,N,,0,9.0,4.0,1,32,595,595,1
2,15,Carga,1,F,2,0,RJ,rio de janeiro,1,RJ,...,N,,0,11.0,4.0,1,34,230,230,1
3,5,Web,1,F,2,0,RN,GARANHUNS,1,RN,...,N,,0,11.0,,1,27,591,591,0
4,20,Web,1,F,2,0,PE,CABO,1,PE,...,N,,0,,,1,61,545,545,0
5,10,Web,1,M,2,0,RJ,RIO DE JANEIRO,1,RJ,...,N,,0,9.0,5.0,1,48,235,235,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49996,10,0,1,F,1,2,RN,NATAL,1,RN,...,N,,0,9.0,4.0,1,36,591,591,1
49997,25,0,1,F,1,0,SP,LENCOIS PAULISTA,1,SP,...,N,,0,,,2,21,186,186,0
49998,5,Web,1,M,2,3,PR,RIO BONITO,1,SP,...,Y,5,0,9.0,2.0,1,41,715,715,0
49999,1,Web,1,F,1,1,SP,SAO PAULO,1,MG,...,Y,29,0,9.0,2.0,1,28,320,320,1


In [67]:
# Write the reduced training set to train_data.csv (relative path, so it
# lands in the current working directory)
train_data.to_csv(path_or_buf="train_data.csv")

### Let's read the test and validation datasets and keep only the variables that remain in the `train_data` dataset.

In [79]:
def _read_pakdd_table(path):
    """Read one PAKDD 2010 tab-separated data file into a DataFrame.

    The file is parsed without a header row, decoded as ISO-8859-1, and its
    first column is used as the index.

    Parameters
    ----------
    path : str
        Location of the .txt file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with integer column labels.
    """
    return pd.read_csv(path,
                       header=None,
                       delimiter='\t',
                       encoding='ISO-8859-1',
                       index_col=0
                       )


# Test set
test_data = _read_pakdd_table('../data/PAKDD2010_Leaderboard_Data.txt')
# Validation set
val_data = _read_pakdd_table('../data/PAKDD2010_Prediction_Data.txt')

In [80]:
# Label the test and validation columns with the Var_Title names. The last
# entry of cols is left out -- presumably the target label, which these
# files do not seem to include (confirm).
test_data.columns = cols[:-1]
val_data.columns = cols[:-1]

# Keep only the columns retained in train_data, minus its last column, and
# give the validation set exactly the same columns in the same order.
# NOTE: this cell cannot be re-run on its own -- after the selection the
# frames have fewer columns than cols[:-1], so re-read the files first.
test_data = test_data.loc[:, train_data.columns[:-1]]
val_data = val_data.loc[:, test_data.columns]

In [81]:
# Write the column-aligned test and validation sets next to train_data.csv
test_data.to_csv(path_or_buf="test_data.csv")
val_data.to_csv(path_or_buf="val_data.csv")

Let's move on to the Correlation_Analysis notebook to continue preprocessing the data.