In [None]:
# Read data
import numpy as np                           # Linear Algebra (calculate the mean and standard deviation)
import pandas as pd                          # manipulate data, data processing, load csv file I/O (e.g. pd.read_csv)

# Visualization
import matplotlib.pyplot as plt              # Visualization using matplotlib
%matplotlib inline
import seaborn as sns                        # Visualization using seaborn

# Plot styling: matplotlib style sheet first, then seaborn's axes style
plt.style.use("fivethirtyeight")             # matplotlib style sheet
sns.set_style("darkgrid")                    # seaborn axes style

import warnings
# Silences every warning for the rest of the session (no category or module filter is given)
warnings.filterwarnings("ignore")

In [None]:
# ML model building; Pre Processing & Evaluation
from sklearn.model_selection import train_test_split                     # split  data into training and testing sets
from sklearn.linear_model import LogisticRegression                      # LogisticRegression
from sklearn.tree import DecisionTreeClassifier                          # Decision tree Classifier
from sklearn.ensemble import RandomForestClassifier                      # this will make a Random Forest Classifier
import xgboost
from xgboost import XGBClassifier                                        # XGBoost Classifier
from sklearn.preprocessing import StandardScaler                         # Standard Scalar
from sklearn.metrics import confusion_matrix, classification_report      # this creates a confusion matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV     # this will do cross validation

In [None]:
# Load both CSV files (paths are relative to the notebook's working directory)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_ctrUa4K.csv", "test_lAUu6dG.csv")
)

In [None]:
# Preview the first 5 rows of each dataset (train first, then test)
display(train.head(n=5), test.head(n=5))

In [None]:
# Report the (rows, columns) shape of both datasets
for caption, dataset in (
    ("Training data shape (Rows, Columns):", train),
    ("Test data shape (Rows, Columns):", test),
):
    print(caption, dataset.shape)

In [None]:
# Keep independent copies of the frames as loaded from disk;
# test_original supplies the Loan_ID column for the submission file later on
train_original, test_original = train.copy(), test.copy()

In [None]:
# Structure of each frame: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() prints its report to stdout and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" after each report.
train.info()
test.info()

In [None]:
# Number of columns per dtype, for train and then test
display(
    train.dtypes.value_counts(),
    test.dtypes.value_counts(),
)

In [None]:
# Number of non-null values in each column.
# The DataFrame method is count(); there is no counts() method, so the previous
# spelling raised AttributeError.
display(train.count())
display(test.count())

### Missing Values

In [None]:
# Missing values per column (isna is the pandas alias of isnull)
display(
    train.isna().sum(),
    test.isna().sum(),
)

<h2 style="color:blue" align="left"> 7. Model building and Evaluation </h2>

In [None]:
# Dependent variable: the target column only
y = train_new['Loan_Status']

# Independent variables: every column of train_new except the target
X = train_new.drop(columns='Loan_Status')

In [None]:
# 80:20 train/test partition; the fixed random_state makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test
print(*(piece.shape for piece in (X_train, X_test, y_train, y_test)))

### Random Forest

In [None]:
# Random forest with default hyperparameters.
# random_state is pinned (same seed value as the train/test split) so that the
# bootstrap samples and feature draws — and therefore the reported scores —
# are identical on every Restart & Run All.
rf = RandomForestClassifier(random_state=4)
rf.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out split
pred_rf = rf.predict(X_test)

In [None]:
# Random-forest score on the training split, then on the held-out split
print(
    "Train Score {:.2f} & Test Score {:.2f}".format(
        *(
            rf.score(features, labels)
            for features, labels in ((X_train, y_train), (X_test, y_test))
        )
    )
)

### XGBoost

In [None]:
# XGBoost classifier with library-default hyperparameters, fitted on the training split.
# The variable name is kept as-is because the following cells refer to it.
reg_xgb = XGBClassifier()
reg_xgb.fit(X_train, y_train)

In [None]:
# XGBoost predictions for the held-out split (X_test)
y_pred_xgb = reg_xgb.predict(X_test)

In [None]:
# XGBoost score on the training split, then on the held-out split
print(
    "Train Score {:.2f} & Test Score {:.2f}".format(
        *(
            reg_xgb.score(features, labels)
            for features, labels in ((X_train, y_train), (X_test, y_test))
        )
    )
)

### Submission

In [None]:
# Assemble the submission: Loan_ID from the untouched copy of the test file, paired with the predicted Loan_Status values.
# NOTE(review): y_pred_test is not assigned in any cell visible here (the visible cells only produce
# pred_rf and y_pred_xgb, both for X_test) — confirm it is defined upstream and holds one prediction
# per row of the test file.
submission = pd.DataFrame({'Loan_ID': test_original['Loan_ID'], 'Loan_Status': y_pred_test})
# index=False keeps the DataFrame index out of the written CSV
submission.to_csv('Loan.csv', index=False)