In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from functools import reduce
import re as re
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import warnings
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures  
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import  OneVsRestClassifier
from sklearn.model_selection import train_test_split
import multilabel_train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [73]:
train = pd.read_csv("DataSet/application_train.csv")
test = pd.read_csv("DataSet/application_test.csv")

In [74]:
def reduce_mem(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [75]:
test = reduce_mem(test)
train = reduce_mem(train)

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%
Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


##### I realize the CODE_GENDER=="XNA" (only 4 observations with TARGET of 0) doesn't show up in the test data, but it's messing up with the label encoder, so I droped them all along.


In [64]:
train = train.drop(index = train.loc[train.CODE_GENDER=="XNA"].index)

In [76]:
def encode(df):
    le = LabelEncoder()
    oe = OneHotEncoder()
    le_col=[]
    for col in df:
        if df[col].dtype =="object":
            if df[col].nunique()==2:
                df[col].replace(np.nan, "NAN", inplace=True)
                le_col.append(col)
                le.fit(df[col])
                df[col] = le.transform(df[col])
    print("These {} columns ({}) are label encoded".format(len(le_col), le_col))
    print("Dimensions before OneHotEncoding : {}".format(df.shape))
    df = pd.get_dummies(df)
    print("Dimensions after OneHotEncoding : {}".format(df.shape))
    
    return df
                
train =encode(train)
train.head()
test = encode(test)
test.head()

These 4 columns (['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']) are label encoded
Dimensions before OneHotEncoding : (307511, 122)
Dimensions after OneHotEncoding : (307511, 242)
These 5 columns (['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']) are label encoded
Dimensions before OneHotEncoding : (48744, 121)
Dimensions after OneHotEncoding : (48744, 237)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,100001,0,0,0,1,0,135000.0,568800.0,20560.5,450000.0,...,1,0,0,0,0,0,0,0,1,0
1,100005,0,1,0,1,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0,0,0,0,0,0
2,100013,0,1,1,1,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0,0,0,0,0,0
3,100028,0,0,0,1,2,315000.0,1575000.0,49018.5,1575000.0,...,1,0,0,0,0,0,0,1,0,0
4,100038,0,1,1,0,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
def align(train, test):
    
    print(" Shape of train data before alinging: {}".format(train.shape),"\n", 
         "Shape of test data before alinging: {}".format(test.shape))
    target = train["TARGET"]
    train, test = train.align(test, join="inner", axis =1)
    train["TARGET"] = target
    print(" Shape of train data after alinging: {}".format(train.shape), "\n", 
     "Shape of test data after alinging: {}".format(test.shape))
    return train, test
train, test = align(train, test)

 Shape of train data before alinging: (307511, 242) 
 Shape of test data before alinging: (48744, 237)
 Shape of train data after alinging: (307511, 237) 
 Shape of test data after alinging: (48744, 236)


In [78]:
def anomaly_correction(df, feature = "DAYS_EMPLOYED", integer=365243):
    df['Birth_ANOMALY'] = (df[feature]==integer).astype(int)

    df[feature].replace({integer: np.nan}, inplace =True )
    return df
train = anomaly_correction(train, feature = "DAYS_EMPLOYED", integer=365243)
test = anomaly_correction(test, feature = "DAYS_EMPLOYED", integer=365243)

In [79]:
# Changing the negative values for some features, which should be positive, into positive.
train[["DAYS_EMPLOYED","DAYS_BIRTH"]] = train[["DAYS_EMPLOYED","DAYS_BIRTH"]].apply(lambda x: x*(-1))
test[["DAYS_EMPLOYED","DAYS_BIRTH"]]= test[["DAYS_EMPLOYED","DAYS_BIRTH"]].apply(lambda x: x*(-1))

In [33]:
corr = train.corr()['TARGET'].sort_values()
print("Negatively correlated features are:\n",corr.head(20))

Negatively correlated features are:
 EXT_SOURCE_3                           -0.178925
EXT_SOURCE_2                           -0.160471
EXT_SOURCE_1                           -0.155318
DAYS_BIRTH                             -0.078242
DAYS_EMPLOYED                          -0.074957
NAME_EDUCATION_TYPE_Higher education   -0.056593
NAME_INCOME_TYPE_Pensioner             -0.046211
ORGANIZATION_TYPE_XNA                  -0.045989
Birth_ANOMALY                          -0.045989
FLOORSMAX_AVG                          -0.044010
FLOORSMAX_MEDI                         -0.043775
FLOORSMAX_MODE                         -0.043233
HOUSETYPE_MODE_block of flats          -0.040592
AMT_GOODS_PRICE                        -0.039647
EMERGENCYSTATE_MODE                    -0.039408
REGION_POPULATION_RELATIVE             -0.037229
ELEVATORS_AVG                          -0.034202
ELEVATORS_MEDI                         -0.033866
FLOORSMIN_AVG                          -0.033620
FLOORSMIN_MEDI                  

In [35]:
print("Positively correlated features are:\n",corr.tail(20))

Positively correlated features are:
 NAME_HOUSING_TYPE_With parents                       0.029965
OCCUPATION_TYPE_Drivers                              0.030303
DEF_60_CNT_SOCIAL_CIRCLE                             0.031295
DEF_30_CNT_SOCIAL_CIRCLE                             0.032261
LIVE_CITY_NOT_WORK_CITY                              0.032517
OWN_CAR_AGE                                          0.037611
DAYS_REGISTRATION                                    0.041976
OCCUPATION_TYPE_Laborers                             0.043017
FLAG_DOCUMENT_3                                      0.044341
REG_CITY_NOT_LIVE_CITY                               0.044394
FLAG_EMP_PHONE                                       0.045984
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049822
REG_CITY_NOT_WORK_CITY                               0.050992
DAYS_ID_PUBLISH                                      0.051457
CODE_GENDER                                          0.054710
DAYS_LAST_PHONE_CHANGE           

In [98]:
def features_engd(features, df_train, df_test, degree):
    """ Features are given as a list."""
    df_train_eng = df_train[features]
    df_test_eng = df_test[features]
    
    # Imputations
    imputer = Imputer(strategy='mean')
    # Instantiating polynomialfeaures
    pf = PolynomialFeatures(degree, include_bias =False)
    
    df_train_eng =imputer.fit_transform(df_train_eng)
    df_train_eng =pf.fit_transform(df_train_eng)
    
    df_test_eng =imputer.transform(df_test_eng)
    df_test_eng =pf.transform(df_test_eng)
    print("Shape of the Engineered Treaining Data: {}".format(df_train_eng.shape))
    print("Shape of the Engineered Test Data: {}".format(df_test_eng.shape))
    print("First 15 Engineered Features: {}".format(pf.get_feature_names(input_features = features)[:15]))
    
    # Converting the engineered features into a data frame
    df_train_eng = pd.DataFrame(df_train_eng, columns=pf.get_feature_names(input_features = features))
    df_test_eng = pd.DataFrame(df_test_eng, columns=pf.get_feature_names(input_features = features))

        
    # Now merging the engineered dataframes
    df_train_eng["SK_ID_CURR"] =df_train["SK_ID_CURR"]
    df_test_eng["SK_ID_CURR"] =df_test["SK_ID_CURR"]
    
    
    df_train = df_train.merge(df_train_eng, how= "left", on = "SK_ID_CURR")
    df_test = df_test.merge(df_test_eng, how= "left", on = "SK_ID_CURR")
        
    # Aligning the engineered train and test data
    if df_train.shape != df_test.shape:
        df_train, df_test = align(df_train, df_test)
    
    # Adding the "TARGET" features to the engineered dataframe 
    df_train_eng["TARGET"] =df_train["TARGET"]
    
    return df_train_eng, df_test_eng, df_train, df_test

features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
            "DAYS_BIRTH","DAYS_EMPLOYED", "NAME_EDUCATION_TYPE_Higher education",
            "ORGANIZATION_TYPE_XNA", "Birth_ANOMALY","FLOORSMAX_AVG"]

train_eng, test_eng, df_eng_train, df_eng_test = features_engd(features, train, test, degree=3)

Shape of the Engineered Treaining Data: (307511, 219)
Shape of the Engineered Test Data: (48744, 219)
First 15 Engineered Features: ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE_Higher education', 'ORGANIZATION_TYPE_XNA', 'Birth_ANOMALY', 'FLOORSMAX_AVG', 'EXT_SOURCE_1^2', 'EXT_SOURCE_1 EXT_SOURCE_2', 'EXT_SOURCE_1 EXT_SOURCE_3', 'EXT_SOURCE_1 DAYS_BIRTH', 'EXT_SOURCE_1 DAYS_EMPLOYED', 'EXT_SOURCE_1 NAME_EDUCATION_TYPE_Higher education']
 Shape of train data before alinging: (307511, 457) 
 Shape of test data before alinging: (48744, 456)
 Shape of train data after alinging: (307511, 457) 
 Shape of test data after alinging: (48744, 456)


In [91]:
eng_fts = ["EXT_SOURCE_2 EXT_SOURCE_3","EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3",
           "EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH", "EXT_SOURCE_2^2 EXT_SOURCE_3", "EXT_SOURCE_2 EXT_SOURCE_3^2"]
df_train_eng2, df_test_eng2, eng_train2, eng_test2 = features_engd(eng_fts, eng_train, eng_test, degree=3)

Shape of the Engineered Treaining Data: (307511, 55)
Shape of the Engineered Test Data: (48744, 55)
First 15 Engineered Features: ['EXT_SOURCE_2 EXT_SOURCE_3', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3', 'EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH', 'EXT_SOURCE_2^2 EXT_SOURCE_3', 'EXT_SOURCE_2 EXT_SOURCE_3^2', 'EXT_SOURCE_2 EXT_SOURCE_3^2', 'EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3', 'EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH', 'EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2^2 EXT_SOURCE_3', 'EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2 EXT_SOURCE_3^2', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3^2', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2^2 EXT_SOURCE_3', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 EXT_SOURCE_2 EXT_SOURCE_3^2', 'EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH^2']
 Shape of train data before alinging: (307511, 512) 
 Shape of test data before alinging: (48744, 511)
 Shape of

In [38]:
def log_reg(train, test, file_name= "log_reg", C=1):
# Here I am building a pipeline
    test = train["TARGET"].values
    train = train.drop("TARGET", axis=1).values

    test = pd.get_dummies(test).values
    pl = Pipeline([
        ("imputaton", Imputer(strategy="median")),
        ("scale", MinMaxScaler(feature_range=(0, 1))),
        ("clf", OneVsRestClassifier(estimator=LogisticRegression(C=C,class_weight=None,
                                                                 dual=False, fit_intercept=True,
                                                                 intercept_scaling=1, max_iter=100,
                                                                 multi_class='ovr', n_jobs=1, 
                                                                 penalty='l2', random_state=4,
                                                                 solver='liblinear', tol=0.0001, 
                                                                 verbose=0, warm_start=False)))
    ])
    pl.fit(trn,trt)
    proba = pl.predict_proba(test)
    submission = test[["SK_ID_CURR"]]
    submission["TARGET"]  =proba[:,1]
    submission = submission.to_csv(file_name + '.csv', index = False)


In [84]:
def log_reg_rocauc(train, C=1):
# Here I am building a pipeline
    test = train["TARGET"].values
    train = train.drop("TARGET", axis=1).values

    X_train, X_test, y_train, y_test = train_test_split(train,test,
                                                        test_size=0.2,
                                                        stratify=test,
                                                        random_state= 42)
    #trt = pd.get_dummies(trt).values
    pl = Pipeline([
        ("imputaton", Imputer(strategy="median")),
        ("scale", MinMaxScaler(feature_range=(0, 1))),
        ("clf", OneVsRestClassifier(estimator=LogisticRegression(C=C,class_weight=None,
                                                                 dual=False, fit_intercept=True,
                                                                 intercept_scaling=1, max_iter=100,
                                                                 multi_class='ovr', n_jobs=1, 
                                                                 penalty='l2', random_state=4,
                                                                 solver='liblinear', tol=0.0001, 
                                                                 verbose=0, warm_start=False)))
    ])
    pl.fit(X_train, y_train)
    y_proba = pl.predict_proba(X_test)[:,1]
    rocauc = roc_auc_score(y_test, y_proba)
    print("ROC_AUC Score is {}".format(rocauc))
    return rocauc
    #submission = test[["SK_ID_CURR"]]
    #submission["TARGET"]  =proba[:,1]
    #submission = submission.to_csv(file_name + '.csv', index = False)

In [72]:
test = train.TARGET

In [90]:
log_reg_rocauc(eng_train, C=1)

ROC_AUC Score is 0.7510089748120818


0.7510089748120818

In [93]:
log_reg_rocauc(eng_train2, C=1)

ROC_AUC Score is 0.7510514312629597


0.7510514312629597

In [136]:
# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier


def decision_tree_rocauc(train):
    test = train["TARGET"].values
    train = train.drop("TARGET", axis=1).values

    X_train, X_test, y_train, y_test = train_test_split(train,test,
                                                        test_size=0.2,
                                                        stratify=test,
                                                        random_state= 42)
    # Instantiating a DecisionTreeClassifier 'dt' with a maximum depth of 6
    dt = DecisionTreeClassifier(max_depth =6,criterion="entropy", random_state=43)
    
    pl = Pipeline([
    ("imputaton", Imputer(strategy="median")),
    ("scale", MinMaxScaler(feature_range=(0, 1))),
    ("clf", OneVsRestClassifier(estimator=dt))
    ])
    
    pl.fit(X_train, y_train)
    y_proba = pl.predict_proba(X_test)[:,1]
    rocauc = roc_auc_score(y_test, y_proba)
    print("ROC_AUC Score is {}".format(rocauc))
    return y_proba

# Diagnosing bias-variance problem: to see how well our model fits data(i.e. if our model underfits or overfits the data)

def diagnose_bias_varaince(train):
    imputer= Imputer(strategy ="mean")

    
    test = train["TARGET"]
    train = train.drop("TARGET", axis=1)
    

    X_train, X_test, y_train, y_test = train_test_split(train,test,
                                                        test_size=0.2,
                                                        stratify=test,
                                                        random_state= 42)
    pl = Pipeline([
    ("imputaton", Imputer(strategy="median")),
    ("scale", MinMaxScaler(feature_range=(0, 1)))
    ])
    
    X_train = pl.fit_transform(X_train)
    X_test = pl.transform(X_test)
    
    # Instantiating a DecisionTreeClassifier 'dt' with a maximum depth of 6
    
    dt = DecisionTreeClassifier(max_depth =19,criterion="gini",
                                 random_state=43, class_weight ="balanced")
    # Calculate cv scores
    ROC_CV_scores = cross_val_score(dt, X_train, y_train, cv=2, 
                                  scoring='roc_auc', 
                                  n_jobs=-1)
    #min_samples_leaf=0.1,
    # Compute the 10-folds ROC_CV
    ROC_CV = ROC_CV_scores
    ROC_CV_mean = ROC_CV_scores.mean()

    # Print ROC_CV
    print('ROC_CV_mean: {:.2f}'.format(ROC_CV_mean))
    
    # Fit dt to the training set
    dt.fit(X_train, y_train)

    # Predict the labels of the training set
    y_pred = dt.predict(X_train)
    
    ROC_train = roc_auc_score(y_train, y_pred)
    print("ROC_train: {:.2f}".format(ROC_train))
    
    #y_proba = pl.predict_proba(X_test)[:,1]

    return ROC_CV, ROC_CV_mean, ROC_train

In [121]:
#max_depth =7,criterion="gini",#min_samples_leaf=0.26,
diagnose_bias_varaince(df_eng_train)

ROC_CV_mean: 0.69
ROC_train: 0.50


(array([0.68965739, 0.68310577, 0.68644907, 0.6740724 , 0.67802664,
        0.69459941, 0.69250055, 0.68221918, 0.69067273, 0.69700731]),
 0.6868310462260004,
 0.5)

It looks like our model overfitted the data!

    Over fitting occurs when the model captures the noise and the outliers in the data along with the underlying pattern. These models usually have high variance and low bias. These models are usually complex like Decision Trees, SVM or Neural Networks which are prone to over fitting.

    Under fitting occurs when the model is unable to capture the underlying pattern of the data. These models usually have a low variance and a high bias. These models are usually simple which are unable to capture the complex patterns in the data like Linear and Logistic Regressions.


In [123]:
# Let's decrease the max_depth
#max_depth =5,criterion="gini",#min_samples_leaf=0.26,
diagnose_bias_varaince(df_eng_train)

ROC_CV_mean: 0.69
ROC_train: 0.50


(array([0.68965739, 0.68310577, 0.68644907, 0.6740724 , 0.67802664,
        0.69459941, 0.69250055, 0.68221918, 0.69067273, 0.69700731]),
 0.6868310462260004,
 0.5)

In [125]:
# Let's decrease the max_depth
#max_depth =12,criterion="gini",#min_samples_leaf=0.26,
diagnose_bias_varaince(df_eng_train)

ROC_CV_mean: 0.69
ROC_train: 0.50


(array([0.68965739, 0.68310577, 0.68644907, 0.6740724 , 0.67802664,
        0.69459941, 0.69250055, 0.68221918, 0.69067273, 0.69700731]),
 0.6868310462260004,
 0.5)

In [None]:
# Let's decrease the max_depth
#max_depth =3,criterion="gini",#min_samples_leaf=0.26,
diagnose_bias_varaince(df_eng_train)