In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest,SelectPercentile, SelectFpr,SelectFdr,GenericUnivariateSelect,chi2,f_regression
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
%matplotlib inline

In [29]:
# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

In [2]:
p_num = 1
# Read the raw CSV and discard its 'Unnamed: 0' column in one expression
# (presumably a row index written out by an earlier export - confirm).
original_df = pd.read_csv(
    r'C:\Users\james\Documents\Hishtalmut_Kamdan\Targil 1 - Data Visualiztion\My - Home Credit Default Risk.csv'
).drop(columns=['Unnamed: 0'])

In [3]:
def get_object_column_names(df):
    """Return the labels of the columns in ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Labels of the object-dtype columns, in their original column order.
        Empty when no column has that dtype.
    """
    # Iterate the dtypes Series directly instead of going through
    # ``reset_index()``: that call names its label column after
    # ``df.columns.name``, so looking it up as 'index' raised a KeyError for
    # any frame whose column index carries a name.
    return [col for col, dtype in df.dtypes.items() if dtype == 'object']

In [5]:
def preprocess_data(df):
    """Clean the raw frame and one-hot encode its object-dtype columns.

    Steps, in order:

    1. Forward-fill missing values.
    2. For rows whose ``CNT_CHILDREN`` is still missing after the forward
       fill, replace every remaining NaN in that row with 0.
    3. Drop the rows where ``TARGET`` equals 2.
    4. Replace each object-dtype column with its dummy indicator columns
       (prefixed with the column name), placed after the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the ``CNT_CHILDREN`` and ``TARGET``
        columns. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaning and encoding applied.
    """
    # ``fillna(method='ffill')`` is deprecated in recent pandas releases;
    # ``ffill()`` is the direct equivalent and returns a new frame.
    df_preprocessed = df.ffill()
    missing_children = df_preprocessed['CNT_CHILDREN'].isnull()
    df_preprocessed[missing_children] = df_preprocessed[missing_children].fillna(0)

    # Filter with a boolean mask. The previous code fed index *labels* into
    # ``df.index[...]`` as if they were *positions*, which only works for a
    # default 0..n-1 index and otherwise drops the wrong rows or raises.
    df_clean_target = df_preprocessed[df_preprocessed['TARGET'] != 2]

    object_cols = get_object_column_names(df_clean_target)
    if not object_cols:
        # Nothing to encode; still hand back an independent frame.
        return df_clean_target.copy()

    # A single call encodes every listed column: the untouched columns come
    # first, followed by each column's dummies in order, which is the same
    # layout the former drop-and-concat loop produced.
    return pd.get_dummies(df_clean_target, columns=object_cols)

In [6]:
# Run the cleaning / encoding pipeline on the raw frame.
processed_data = preprocess_data(df=original_df)

In [8]:
# Separate the label column from the feature matrix.
y = processed_data['TARGET']
X = processed_data.drop(columns='TARGET')

In [11]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Feature Selection - Wrapper Methods 

### Run model without feature selection

In [35]:
# Baseline: fit a random forest on all features and report the weighted F1.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook rebuilds the same forest and prints the same scores; without it
# every run bootstraps and splits differently.
clf = RandomForestClassifier(n_estimators=500, random_state=42)

# fit() returns the estimator itself, so clf_fit is the same fitted object as clf.
clf_fit = clf.fit(x_train, y_train)
y_pred_train = clf_fit.predict(x_train)
y_pred_test = clf_fit.predict(x_test)

print("train score is: " + str(f1_score(y_train, y_pred_train, average='weighted')))
print("test score is: " + str(f1_score(y_test, y_pred_test, average='weighted')))

train score is: 1.0
test score is: 0.6915750915750918


### Recursive feature elimination

In [36]:
# Recursive feature elimination: repeatedly refit the forest and discard the
# weakest feature (step=1) until 30 remain, then score the reduced model.
# random_state is fixed so the importances, the selected features and the
# printed scores are reproducible from run to run.
estimator = RandomForestClassifier(n_estimators=500, random_state=42)
selector = RFE(estimator, n_features_to_select=30, step=1)
selector = selector.fit(x_train, y_train)

# predict() reduces the input to the selected features before delegating to
# the fitted estimator, so the full feature matrices can be passed in.
y_pred_train = selector.predict(x_train)
y_pred_test = selector.predict(x_test)

print("train score is: " + str(f1_score(y_train, y_pred_train, average='weighted')))
print("test score is: " + str(f1_score(y_test, y_pred_test, average='weighted')))

train score is: 1.0
test score is: 0.6463215060229985
