# Preprocessing part
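This notebook cleans `Data/application_train.csv`, imputes and rescales the numerical features, encodes and imputes the categorical features, and exports the resulting train and test sets to `Data/X_train.csv` and `Data/X_test.csv`.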

### Libraries

In [1]:
# Classic.
import pandas as pd
import numpy as np

# Data Viz.
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing.
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

# Import custom functions.
from Utils.utils_preprocessing import *

### Settings

In [2]:
# Plotting defaults: 16x9 figures drawn with seaborn's dark grid style.
sns.set(rc={'figure.figsize': (16, 9)})
sns.set_style('darkgrid')

# DataFrame display: show up to 100 columns and print floats with 5 decimals.
pd.set_option('display.max_columns', 100)
pd.options.display.float_format = lambda value: '%.5f' % value

# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/scikit-learn warnings.
import warnings
warnings.simplefilter('ignore')

### Import data

In [3]:
# Load the raw application table from the project's Data/ folder.
df_app_train = pd.read_csv(filepath_or_buffer="Data/application_train.csv")

## Start Cleaning and Preprocessing

- Delete observations with too little information (too many missing values)
- Replace outliers with NaN
- Delete features (categorical and numerical) with too many missing values

In [4]:
# Downsampling of the majority class: drop the TARGET == 0 rows that have
# 48 or more missing values; every other row is kept.
# The earlier filter `(Nbr_nan < 48) | (Nbr_nan != np.nan)` never removed a
# row, because `!= np.nan` evaluates to True for every value (NaN included).
is_majority = df_app_train['TARGET'] == 0
nbr_nan = df_app_train.isna().sum(axis=1)
df_app_train = df_app_train[~is_majority | (nbr_nan < 48)]

In [5]:
# Replace outliers by NaN: DAYS_EMPLOYED == 365243 is treated as an outlier.
# The result is assigned back to the column instead of calling an inplace
# method on the selected column: that form is chained assignment and does not
# update the DataFrame under pandas Copy-on-Write.
df_app_train['DAYS_EMPLOYED'] = df_app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

In [6]:
# Build the per-feature missing-value summary with the project helper
# (presumably the helper prints the short report shown under this cell).
df_missing_values = missing_values_table(df_app_train)

# Categorical (object) features: one of them has too many missing values.
# Not the last expression of the cell, so this selection is not displayed.
df_missing_values.loc[df_missing_values["Feature_type"] == object]
df_app_train = df_app_train.drop(columns="FONDKAPREMONT_MODE")

# Numerical (float64) features: one of them has too many missing values.
# Same remark: evaluated but not displayed.
df_missing_values.loc[df_missing_values["Feature_type"] == 'float64']
df_app_train = df_app_train.drop(columns="OWN_CAR_AGE")

Your selected dataframe has 122 columns.
There are 68 columns that have missing values.


## Split Preprocessing for numerical and categorical features

In [7]:
# Separate the features from the TARGET label.
X = df_app_train.drop(columns="TARGET")
y = df_app_train["TARGET"]

# Hold out 33% of the rows as the testing set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Set the client ids aside and remove them from the feature frames before
# preprocessing (pop returns the column and deletes it from the frame).
id_clients_train = X_train.pop("SK_ID_CURR")
id_clients_test = X_test.pop("SK_ID_CURR")

### Numerical

In [8]:
# Imputation on continuous features: keep every non-object column.
X_train_num = X_train.select_dtypes(exclude=["object"])
X_test_num = X_test.select_dtypes(exclude=["object"])

# Imputation: the imputer is fitted on the training set only and then applied
# to both sets.
impute = IterativeImputer(n_nearest_features=15, imputation_order='ascending', random_state=42)
X_train_num = pd.DataFrame(impute.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_num = pd.DataFrame(impute.transform(X_test_num), columns=X_test_num.columns)

# Rescaling: the scaler is likewise fitted on the training set only. The test
# set is transformed with the training min/max (it was previously re-fitted on
# the test data, which leaks test statistics and scales the two sets
# differently), so test values may fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns=X_test_num.columns)

### Categorical

In [9]:
# Encode and impute the categorical features with the project helper.
X_train_categ, X_test_categ = impute_cat_feature(X_train, X_test, y_train)

# Drop, in place, every column that still contains a missing value.
# NOTE(review): train and test are filtered independently, so they could end
# up with different column sets -- confirm against impute_cat_feature.
for categ_frame in (X_train_categ, X_test_categ):
    categ_frame.dropna(axis=1, inplace=True)

In [16]:
# Reset indexes so the numerical and categorical parts line up row by row
# when concatenated side by side.
X_train_num = X_train_num.reset_index(drop=True)
X_train_categ = X_train_categ.reset_index(drop=True)
X_test_num = X_test_num.reset_index(drop=True)
X_test_categ = X_test_categ.reset_index(drop=True)

# Concat.
X_train = pd.concat([X_train_num, X_train_categ], axis=1)
X_test = pd.concat([X_test_num, X_test_categ], axis=1)

# Re-attach the client ids as a real column.
# - Attribute assignment (`X_train.SK_ID_CURR = ...`) does not create a column
#   when none exists, so the ids never reached the exported files.
# - The id Series still carry the index labels from the split, while the frames
#   now have a fresh 0..n-1 index; assigning the raw values keeps the ids in
#   row order instead of aligning them on mismatched labels.
X_train["SK_ID_CURR"] = id_clients_train.to_numpy()
X_test["SK_ID_CURR"] = id_clients_test.to_numpy()

In [18]:
# Write both preprocessed sets to CSV, leaving the row index out of the files.
for frame, destination in ((X_train, "Data/X_train.csv"), (X_test, "Data/X_test.csv")):
    frame.to_csv(destination, index=False)