In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns #Visualization
import matplotlib.pyplot as plt #Visualization
from sklearn.preprocessing import StandardScaler #Standardization
from sklearn.model_selection import train_test_split #Spliting
from sklearn.metrics import confusion_matrix 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier


# Load Data

In [2]:
# Input files, named once so the two reads below stay easy to retarget.
APPLICATIONS_CSV = '../input/credit-card-approval-prediction/application_record.csv'
CREDIT_RECORD_CSV = '../input/credit-card-approval-prediction/credit_record.csv'

applications = pd.read_csv(APPLICATIONS_CSV)
credit_record = pd.read_csv(CREDIT_RECORD_CSV)

# Info about the data

In [3]:
# Column names, dtypes and non-null counts of the applications frame.
applications.info()

In [4]:
# Preview the first 10 application rows.
applications.head(10)

In [5]:
# Frequency of each FLAG_MOBIL value; the column is dropped in the cleaning cell further down.
applications.FLAG_MOBIL.value_counts()

In [6]:
# Column names, dtypes and non-null counts of the credit_record frame.
credit_record.info()

In [7]:
# Preview the first 10 credit_record rows.
credit_record.head(10)

In [8]:
# Frequency of each raw STATUS code, before the codes are collapsed to 0/1 further down.
credit_record.STATUS.value_counts()

# Drop unwanted data

In [9]:
# Clean the applications frame:
#   * drop the FLAG_MOBIL column,
#   * discard rows with no OCCUPATION_TYPE,
#   * collapse rows that are identical in every column except the first
#     (presumably ID - TODO confirm) to their first occurrence.
# The result is rebound instead of mutated in place, and the column drop
# tolerates an already-missing FLAG_MOBIL, so re-running this cell no longer
# raises KeyError.
applications = (
    applications
    .drop(columns=['FLAG_MOBIL'], errors='ignore')
    .dropna(subset=['OCCUPATION_TYPE'])
    # The subset must be computed after FLAG_MOBIL is gone, hence the pipe.
    .pipe(lambda frame: frame.drop_duplicates(subset=frame.columns[1:]))
)

# Convert status values to binary (2,3,4,5 --> 1 'Bad'; Else 0 'Good')

In [10]:
# Collapse the raw STATUS codes into a binary flag:
# '2', '3', '4', '5' -> 1 ('Bad'); every other code -> 0 ('Good').
BAD_STATUS_CODES = ['2', '3', '4', '5']

# Skip the recode when STATUS is already numeric: a second run would compare
# the 0/1 integers against the string codes and silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(credit_record['STATUS']):
    credit_record['STATUS'] = np.where(
        credit_record['STATUS'].isin(BAD_STATUS_CODES), 1, 0
    )

In [11]:
# Class balance after the binary recode (1 = codes '2'-'5', 0 = everything else).
credit_record.STATUS.value_counts()

# Create a DF with the most recent month in each status for every ID in the credit record

In [12]:
# One row per distinct ID found in credit_record, in order of first appearance.
credit_classified = pd.DataFrame({'ID': credit_record['ID'].unique()})

In [13]:
# Preview the first 10 distinct IDs.
credit_classified.head(10)

In [14]:
# Default 5-row preview; these rows are a subset of the head(10) shown in the previous cell.
credit_classified.head()

In [15]:
# Latest MONTHS_BALANCE per ID within each binary status. One groupby per
# status replaces re-filtering the whole of credit_record for every single ID.
latest_good = credit_record.loc[credit_record.STATUS == 0].groupby('ID').MONTHS_BALANCE.max()
latest_bad = credit_record.loc[credit_record.STATUS == 1].groupby('ID').MONTHS_BALANCE.max()

# IDs with no STATUS == 0 month get NaN here; the previous max() over an empty
# selection raised ValueError for such IDs.
credit_classified['Max_Mnth_Good'] = credit_classified.ID.map(latest_good)

# IDs with no STATUS == 1 month get the sentinel 1, exactly as before.
# NOTE(review): this assumes a real MONTHS_BALANCE is never 1 - confirm.
credit_classified['Max_Mnth_Bad'] = (
    credit_classified.ID.map(latest_bad).fillna(1).astype('int64')
)

# "Good" when the latest good month is more recent than the latest bad month,
# or when the ID never had a bad month; a NaN good month compares False, so an
# ID with only bad months is labelled "Bad".
never_bad = credit_classified.Max_Mnth_Bad == 1
good_is_latest = credit_classified.Max_Mnth_Good > credit_classified.Max_Mnth_Bad
credit_classified['Status'] = np.where(good_is_latest | never_bad, "Good", "Bad")

In [16]:
# How many IDs were labelled "Good" versus "Bad".
credit_classified.Status.value_counts()

In [17]:
# Spot-check the derived Max_Mnth_Good, Max_Mnth_Bad and Status columns.
credit_classified.head(10)

# Merge all data

In [18]:
# Keep only the applicants that have a row in credit_classified (inner join on ID).
merged_data = applications.merge(credit_classified, on='ID', how="inner")

In [19]:
# Summary statistics of the merged frame.
merged_data.describe()

# Handling Outliers

In [20]:
def dropOL(ftr, data=None):
    """Return the 1.5 * IQR outlier fences for one column.

    Despite its name, this helper drops nothing: it only computes the bounds
    outside which a value would be considered an outlier.

    Parameters
    ----------
    ftr : hashable
        Key of the column to analyse (``data[ftr]`` must be numeric).
    data : mapping or DataFrame, optional
        Container holding the column. Defaults to the notebook-level
        ``merged_data`` so existing ``dropOL(ftr)`` calls keep working.

    Returns
    -------
    tuple
        ``(mx, mn)``: upper fence ``q75 + 1.5 * IQR`` first, then lower fence
        ``q25 - 1.5 * IQR``.
    """
    if data is None:
        # Fall back to the global frame only when the caller passes nothing.
        data = merged_data
    q25, q75 = np.percentile(data[ftr], [25, 75])
    intr_qr = q75 - q25
    mx = q75 + (1.5 * intr_qr)
    mn = q25 - (1.5 * intr_qr)
    return mx, mn

In [21]:
# Schema of the merged frame; the positional column slices that build xData/yData depend on this column order.
merged_data.info()

# Split data to X, y

In [22]:
# Features: every column except the first one and the last three.
# Target: the last column.
feature_columns = merged_data.columns[1:-3]
target_column = merged_data.columns[-1]

xData = merged_data[feature_columns]
yData = merged_data[target_column]

In [23]:
# Class counts of the target column.
yData.value_counts()

# Encoding categorical values

In [24]:
# One-hot encode the categorical columns, dropping the first level of each.
xData = pd.get_dummies(data=xData, drop_first=True)

# Standardization

In [25]:
# Disabled experiment: the scaling code below sits inside a string literal,
# so none of it runs and xScal is never created.
'''
std = StandardScaler()
std.fit(xData)
xScal = std.transform(xData)
xScal  = pd.DataFrame(xScal,columns=xData.columns)
xScal.head(10)
'''

# Find the Random State with required count of 'Bad' values

In [26]:
# Try a range of seeds and record how many 'Bad' rows land in each test split.
# NOTE(review): choosing the seed by inspecting the test labels is fragile;
# train_test_split(..., stratify=yData) would control the class mix directly -
# confirm before switching, since it changes the resulting split.
N_CANDIDATE_SEEDS = 100
TARGET_BAD_IN_TEST = 4      # required number of 'Bad' rows in the test split
FIRST_SEED_CONSIDERED = 8   # seeds below this are skipped, as before

rndm_stat = [
    train_test_split(xData, yData, random_state=seed)
    for seed in range(N_CANDIDATE_SEEDS)
]

# Element 3 of each split is y_test. Count with a boolean mask, because
# value_counts()['Bad'] raised KeyError for a split containing no 'Bad' rows.
badCounts = [int((split[3] == 'Bad').sum()) for split in rndm_stat]

# First eligible seed whose test split holds exactly the target count.
bstRndmStat = next(
    (
        seed
        for seed in range(FIRST_SEED_CONSIDERED, N_CANDIDATE_SEEDS)
        if badCounts[seed] == TARGET_BAD_IN_TEST
    ),
    None,
)
if bstRndmStat is None:
    # Same exception type list.index raised, with an actionable message.
    raise ValueError(
        f"no seed in [{FIRST_SEED_CONSIDERED}, {N_CANDIDATE_SEEDS}) yields "
        f"{TARGET_BAD_IN_TEST} 'Bad' rows in the test split"
    )
bstRndmStat

# Split into Training & Testing

In [27]:
# Final split, using the seed selected in the search cell.
split_parts = train_test_split(xData, yData, random_state=bstRndmStat)
X_train, X_test, y_train, y_test = split_parts

# Under-sampling the 'Good' status

In [28]:
# Tomek-link cleaning applied to the training split only, targeting the majority class.
tl = TomekLinks(sampling_strategy='majority')
resampled_pair = tl.fit_resample(X_train, y_train)
x_tl, y_tl = resampled_pair

# Class counts before and after the under-sampling step.
for caption, labels in (
    ('Original dataset shape', y_train),
    ('Resample dataset shape', y_tl),
):
    print(caption, labels.value_counts())

# Oversampling the 'Bad' status

In [29]:
# SMOTE draws random neighbours to synthesise minority samples; without a
# fixed seed each run produced different training data and a different score.
SMOTE_SEED = 42
smote = SMOTE(random_state=SMOTE_SEED)
x_smote, y_smote = smote.fit_resample(x_tl, y_tl)

# Class counts before and after the over-sampling step.
print('Original dataset shape', y_tl.value_counts())
print('Resample dataset shape', y_smote.value_counts())

# Random Forest Classification

In [30]:
# Disabled experiment: the seed search below sits inside a string literal, so none of it runs.
# NOTE(review): if revived, the final print scores the classifier from the last
# loop iteration rather than the one whose seed was stored in rndSt, and the
# search is selected against X_test / y_test - confirm intent.
'''
rndSt = TN = 10
for i in range(100):
    clf = RandomForestClassifier(max_depth=2, random_state=i)
    clf.fit(x_smote, y_smote)
    y_pred = clf.predict(X_test)
    if confusion_matrix(y_test, y_pred)[0][0] > TN:
        TN = confusion_matrix(y_test, y_pred)[0][0]
        rndSt = i

print(clf.score(X_test, y_test))
'''

In [31]:
# Train on the resampled training data and evaluate on the untouched test split.
clf = RandomForestClassifier(max_depth=5, random_state=10)
clf.fit(x_smote, y_smote)
y_pred = clf.predict(X_test)

# Overall accuracy (unchanged output).
print(clf.score(X_test, y_test))

# Accuracy alone hides how the rare class fares, so also show the per-class
# breakdown; rows are true labels, columns are predicted labels.
class_labels = list(clf.classes_)
print(pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=['true ' + str(label) for label in class_labels],
    columns=['pred ' + str(label) for label in class_labels],
))

In [32]:
# Show a rich preview of the test features instead of a plain-text dump of the whole frame.
X_test.head(10)

# Plot the feature importances for Random Forest

In [33]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance score per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Prefix for the chart title.

    Returns
    -------
    None
        The chart is drawn on a new 10 x 15 matplotlib figure.
    """
    # Pair each name with its score and rank by descending importance.
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    # Tall figure so every feature label gets its own row.
    plt.figure(figsize=(10,15))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    # Title and axis labels.
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [34]:
# Rank the fitted forest's importances against the training column names.
plot_feature_importance(
    importance=clf.feature_importances_,
    names=X_train.columns,
    model_type='RANDOM FOREST',
)