In [4]:
import numpy as np
import pandas as pd
import os
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib

#setting the directory on my computer
os.chdir('/Users/zihan/Desktop')


In [5]:
train = pd.read_csv('financetrain.csv',delimiter =",")
train_new = train.iloc[:, 0:21]

In [6]:
#check the missing values 
missing_data = pd.DataFrame({'Missing Ratio' :((train_new.isnull().sum() / len(train_new)) * 100)})
missing_data

Unnamed: 0,Missing Ratio
SK_ID_CURR,0.0
TARGET,0.0
NAME_CONTRACT_TYPE,0.0
CODE_GENDER,0.0
FLAG_OWN_CAR,0.0
FLAG_OWN_REALTY,0.0
CNT_CHILDREN,0.0
AMT_INCOME_TOTAL,0.0
AMT_CREDIT,0.0
AMT_ANNUITY,0.003902


In [7]:
#filling missing values of features with mode
train_new['AMT_ANNUITY'] = train_new['AMT_ANNUITY'].fillna(train_new['AMT_ANNUITY'].mode()[0])
train_new['AMT_GOODS_PRICE'] = train_new['AMT_GOODS_PRICE'].fillna(train_new['AMT_GOODS_PRICE'].mode()[0])

In [8]:
#check the columns and drop the useless ID columns
print(train_new.columns.values)
train_new.drop(columns = ['SK_ID_CURR'],inplace = True)

#drop the column with too many missing values
train_new.drop(columns = ['NAME_TYPE_SUITE'],inplace = True)

#split independent variables and target variable
x_train = train_new.drop(['TARGET'], axis=1)
y_train = train['TARGET'].reset_index(drop=True)

['SK_ID_CURR' 'TARGET' 'NAME_CONTRACT_TYPE' 'CODE_GENDER' 'FLAG_OWN_CAR'
 'FLAG_OWN_REALTY' 'CNT_CHILDREN' 'AMT_INCOME_TOTAL' 'AMT_CREDIT'
 'AMT_ANNUITY' 'AMT_GOODS_PRICE' 'NAME_TYPE_SUITE' 'NAME_INCOME_TYPE'
 'NAME_EDUCATION_TYPE' 'NAME_FAMILY_STATUS' 'NAME_HOUSING_TYPE'
 'REGION_POPULATION_RELATIVE' 'DAYS_BIRTH' 'DAYS_EMPLOYED'
 'DAYS_REGISTRATION' 'DAYS_ID_PUBLISH']


In [9]:
train_new.dtypes

TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
CNT_CHILDREN                    int64
AMT_INCOME_TOTAL              float64
AMT_CREDIT                    float64
AMT_ANNUITY                   float64
AMT_GOODS_PRICE               float64
NAME_INCOME_TYPE               object
NAME_EDUCATION_TYPE            object
NAME_FAMILY_STATUS             object
NAME_HOUSING_TYPE              object
REGION_POPULATION_RELATIVE    float64
DAYS_BIRTH                      int64
DAYS_EMPLOYED                   int64
DAYS_REGISTRATION             float64
DAYS_ID_PUBLISH                 int64
dtype: object

In [10]:
train_new.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458


In [11]:
missing_data = pd.DataFrame({'Missing Ratio' :((train_new.isnull().sum() / len(train_new)) * 100)})
missing_data

Unnamed: 0,Missing Ratio
TARGET,0.0
NAME_CONTRACT_TYPE,0.0
CODE_GENDER,0.0
FLAG_OWN_CAR,0.0
FLAG_OWN_REALTY,0.0
CNT_CHILDREN,0.0
AMT_INCOME_TOTAL,0.0
AMT_CREDIT,0.0
AMT_ANNUITY,0.0
AMT_GOODS_PRICE,0.0


In [12]:
#log transform the AMT CREDIT, AMT_ANNUITY and AMT_GOODS_PRICE
train["AMT_CREDIT"] = np.log1p(train["AMT_CREDIT"])
train["AMT_ANNUITY"] = np.log1p(train["AMT_ANNUITY"])
train["AMT_GOODS_PRICE "] = np.log1p(train["AMT_GOODS_PRICE"])

In [13]:
missing_data = pd.DataFrame({'Missing Ratio' :((x_train.isnull().sum() / len(train_new)) * 100)})
missing_data

Unnamed: 0,Missing Ratio
NAME_CONTRACT_TYPE,0.0
CODE_GENDER,0.0
FLAG_OWN_CAR,0.0
FLAG_OWN_REALTY,0.0
CNT_CHILDREN,0.0
AMT_INCOME_TOTAL,0.0
AMT_CREDIT,0.0
AMT_ANNUITY,0.0
AMT_GOODS_PRICE,0.0
NAME_INCOME_TYPE,0.0


In [14]:
#get the dummy categorical features
x_dummy = pd.get_dummies(x_train).reset_index(drop=True)

In [15]:
#check the missing values 
missing_data = pd.DataFrame({'Missing Ratio' :((x_dummy.isnull().sum() / len(x_dummy)) * 100)})
missing_data

Unnamed: 0,Missing Ratio
CNT_CHILDREN,0.0
AMT_INCOME_TOTAL,0.0
AMT_CREDIT,0.0
AMT_ANNUITY,0.0
AMT_GOODS_PRICE,0.0
REGION_POPULATION_RELATIVE,0.0
DAYS_BIRTH,0.0
DAYS_EMPLOYED,0.0
DAYS_REGISTRATION,0.0
DAYS_ID_PUBLISH,0.0


In [16]:
# Using random forest to get feature importance
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=7)
forest.fit(x_dummy, y_train) 
importances = pd.Series(forest.feature_importances_, index = x_dummy.columns)

#plot the importance of features
matplotlib.rcParams['figure.figsize'] = (9.0, 9.0)
imp = pd.concat([importances.sort_values()])
imp.plot(kind = "barh")
print("RF picked " + str(sum(importances != 0)) + " variables and assign importances of the other " +  str(sum(importances == 0)) + " variables to zero")

RF picked 40 variables and assign importances of the other 4 variables to zero


In [41]:
fea = importances.sort_values().head(7)
fea.index

Index(['NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Student',
       'NAME_FAMILY_STATUS_Unknown', 'CODE_GENDER_XNA',
       'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_INCOME_TYPE_Maternity leave', 'NAME_INCOME_TYPE_Unemployed'],
      dtype='object')

In [42]:
# feature selection
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(forest, prefit=True, max_features=30)
feature_idx = model.get_support()
feature_names = x_dummy.columns[feature_idx]
x_train_new = model.transform(x_dummy)
pd.DataFrame(x_train_new, columns= feature_names)
print (feature_names)

Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH'],
      dtype='object')


In [43]:
x_dummy.drop(columns = ['NAME_INCOME_TYPE_Businessman', 'NAME_INCOME_TYPE_Student',
       'NAME_FAMILY_STATUS_Unknown', 'CODE_GENDER_XNA',
       'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_INCOME_TYPE_Maternity leave', 'NAME_INCOME_TYPE_Unemployed'],inplace = True)

In [44]:
x_dummy.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,-2120,...,0,0,1,0,0,1,0,0,0,0
1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,-291,...,1,0,0,0,0,1,0,0,0,0
2,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,...,0,0,1,0,0,1,0,0,0,0
3,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,-2437,...,0,0,0,0,0,1,0,0,0,0
4,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,-3458,...,0,0,1,0,0,1,0,0,0,0


In [45]:
missing_data = pd.DataFrame({'Missing Ratio' :((x_dummy.isnull().sum() / len(x_dummy)) * 100)})
missing_data

Unnamed: 0,Missing Ratio
CNT_CHILDREN,0.0
AMT_INCOME_TOTAL,0.0
AMT_CREDIT,0.0
AMT_ANNUITY,0.0
AMT_GOODS_PRICE,0.0
REGION_POPULATION_RELATIVE,0.0
DAYS_BIRTH,0.0
DAYS_EMPLOYED,0.0
DAYS_REGISTRATION,0.0
DAYS_ID_PUBLISH,0.0


In [46]:
x_dummy.describe()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,0.417052,168797.9,599026.0,27107.867258,538316.3,0.020868,-16036.995067,63815.045904,-4986.120328,-2994.202373,...,0.63878,0.06429,0.14778,0.052317,0.003649,0.887344,0.036366,0.00851,0.015873,0.048258
std,0.722121,237123.1,402490.8,14493.89595,369289.0,0.013831,4363.988632,141275.766519,3522.886321,1509.450419,...,0.480355,0.24527,0.354882,0.222666,0.060294,0.316173,0.1872,0.091858,0.124983,0.214312
min,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,-24672.0,-7197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,-7479.5,-4299.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,-4504.0,-3254.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,-2010.0,-1720.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
y_train.describe()


count    307511.000000
mean          0.080729
std           0.272419
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: TARGET, dtype: float64

In [48]:
# k-fold validation
kfolds = KFold(n_splits=10, shuffle=True, random_state=77)

def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, x_dummy, y_train, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

In [49]:
# Lasso regression model
alphas_lasso = [0.0001, 0.0005, 0.001, 0.005, 0.01]
lasso = LassoCV(max_iter=1e5, alphas=alphas_lasso, random_state=77, cv=kfolds)
# calculate the score of lasso
score_lasso = cv_rmse(lasso)

print("LASSO: {:.4f} ({:.4f})\n".format(score_lasso.mean(), score_lasso.std()), )


LASSO: 0.2696 (0.0030)

