# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import scipy.stats as stat


# Switch matplotlib to seaborn's default styling for every figure drawn after this cell.
sns.set()

# Reading the Datasets

## Features

#### Discrete Training Features

In [2]:
# The frame exactly as parsed from disk stays available under *_backup; the
# working frame is an independent copy indexed by the CSV's 'Unnamed: 0' column
# (presumably the row index saved when the file was written -- confirm).
training_features_dis_df_backup = pd.read_csv('training_features_df.csv')
training_features_dis_df = (
    training_features_dis_df_backup
    .copy()
    .set_index(['Unnamed: 0'])
)

#### Discrete Testing Features

In [3]:
# The frame exactly as parsed from disk stays available under *_backup; the
# working frame is an independent copy indexed by the CSV's 'Unnamed: 0' column
# (presumably the row index saved when the file was written -- confirm).
testing_features_dis_df_backup = pd.read_csv('testing_features_df.csv')
testing_features_dis_df = (
    testing_features_dis_df_backup
    .copy()
    .set_index(['Unnamed: 0'])
)

#### Continuous Training Features

In [4]:
# The frame exactly as parsed from disk stays available under *_backup; the
# working frame is an independent copy indexed by the CSV's 'Unnamed: 0' column
# (presumably the row index saved when the file was written -- confirm).
training_features_con_df_backup = pd.read_csv('training_features_df_continuous.csv')
training_features_con_df = (
    training_features_con_df_backup
    .copy()
    .set_index(['Unnamed: 0'])
)

#### Continuous Testing Features

In [5]:
# The frame exactly as parsed from disk stays available under *_backup; the
# working frame is an independent copy indexed by the CSV's 'Unnamed: 0' column
# (presumably the row index saved when the file was written -- confirm).
testing_features_con_df_backup = pd.read_csv('testing_features_df_continuous.csv')
testing_features_con_df = (
    testing_features_con_df_backup
    .copy()
    .set_index(['Unnamed: 0'])
)

## Targets

In [6]:
# Read the raw status file and keep only the 'loan_status' column in a
# single-column working frame; the parsed file stays available as *_backup.
loan_status_df_backup = pd.read_csv('Columns/loan_status.csv')
loan_status_df = pd.DataFrame(loan_status_df_backup['loan_status'])

# Status values encoded as 1 in 'loan_status:Not_Default'; any other value
# (including a missing one) is encoded as 0.
non_default_statuses = [
    'Current',
    'Fully Paid',
    'Late (16-30 days)',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid',
]
is_not_default = loan_status_df['loan_status'].isin(non_default_statuses)
loan_status_df_dummies = pd.DataFrame(
    np.where(is_not_default, 1, 0),
    columns=['loan_status:Not_Default'],
)

# Split the row labels and the binary target together so each target keeps its
# row label: 20% held out for testing, fixed seed so the split is repeatable.
train_indices, test_indices, train_targets, test_targets = train_test_split(
    loan_status_df_dummies.index,
    loan_status_df_dummies['loan_status:Not_Default'],
    random_state=42,
    test_size=0.2,
)

# Report the size of every piece of the split.
for caption, part in (
    ("Trainig Indices shape - ", train_indices),
    ("Trainig Targets(loan_status:Not_Default) shape - ", train_targets),
    ("Testing Indices shape - ", test_indices),
    ("Testing Targets(loan_status:Not_Default) shape - ", test_targets),
):
    print(caption, part.shape)

Trainig Indices shape -  (1808534,)
Trainig Targets(loan_status:Not_Default) shape -  (1808534,)
Testing Indices shape -  (452134,)
Testing Targets(loan_status:Not_Default) shape -  (452134,)


In [7]:
# Wrap a copy of each target Series in its own single-column DataFrame, so the
# frames do not share data with the Series returned by the split.
train_targets_df, test_targets_df = (
    pd.DataFrame(targets.copy()) for targets in (train_targets, test_targets)
)

# Building Initial Logistic Regression Model

#### Reference Categories

In [8]:
# Dummy-variable columns considered for the model, grouped by the variable they
# belong to. Column names have the form '<variable>:<level>'. The order of the
# groups and of the levels inside each group fixes the order of model_features.
_feature_levels = (
    ('grade', ('A', 'B', 'C', 'D', 'E', 'F', 'G')),
    ('home_ownership', ('OTHER', 'RENT_NONE_OWN', 'MORTGAGE', 'ANY')),
    ('addr_state', (
        'IA',
        'AL_AR_MS_OK_LA_NV_NM_HI',
        'NY',
        'SD',
        'FL',
        'MO_MD_NC_PA_KY_TN',
        'CA',
        'NJ_MI_VA_MN_AK_AZ_NE_OH',
        'TX',
        'DE_MA_UT_GA_RI_WY_IL_MT_KS',
        'CT_ND_WA_CO_SC_WV_OR_NH_DC',
        'ID_VT',
        'ME',
    )),
    ('verification_status', ('Verified', 'Source Verified', 'Not Verified')),
    ('purpose', (
        'debt_consolidation',
        'educational_small_business',
        'renewable_energy_moving',
        'medical_wedding',
        'other',
        'house_vacation_major_purchase',
        'home_improvement',
        'credit_card',
        'car',
    )),
    ('initial_list_status', ('f', 'w')),
    ('term', ('36', '60')),
    ('mths_issue_d', (
        '12_16', '16_19', '19_22', '22_24', '24_26', '26_30',
        '30_35', '35_40', '40_70', '70_93', '93+',
    )),
    ('int_rate', ('5_7', '7_11', '11_15', '15_19', '19_23', '23+')),
    ('funded_amnt', (
        '460_2475', '2475_4450', '4450_10375',
        '10375_16300', '16300_36050', '36050+',
    )),
    ('annual_inc', (
        '18500-', '18500_37500', '37500_65500',
        '65500_102500', '102500_140000', '140000+',
    )),
    ('installment', ('0_100', '100_250', '250_500', '500_1000', '1000_1300', '1300+')),
    ('inq_last_6mths', ('0', '1', '2', '2+')),
    ('dti', (
        '0_4', '4_8', '8_12', '12_16', '16_20', '20_24',
        '24_28', '28_32', '32_36', '36_40', '40+',
    )),
)

model_features = [
    variable + ':' + level
    for variable, levels in _feature_levels
    for level in levels
]

# Reference category of each variable: exactly one level per variable, and each
# of them also appears in model_features.
_reference_levels = (
    ('grade', 'G'),
    ('home_ownership', 'OTHER'),
    ('addr_state', 'IA'),
    ('verification_status', 'Verified'),
    ('purpose', 'educational_small_business'),
    ('initial_list_status', 'f'),
    ('term', '60'),
    ('mths_issue_d', '93+'),
    ('int_rate', '23+'),
    ('funded_amnt', '16300_36050'),
    ('annual_inc', '18500-'),
    ('installment', '1300+'),
    ('inq_last_6mths', '2+'),
    ('dti', '40+'),
)

ref_cat = [variable + ':' + level for variable, level in _reference_levels]

#### Defining Data

In [9]:
# Design matrices: place the discrete and continuous feature frames side by
# side, keep only the modelled dummy columns (in model_features order), then
# remove the reference-category dummies listed in ref_cat.
X_train = pd.concat(
    (training_features_dis_df, training_features_con_df), axis=1
)[model_features].drop(columns=ref_cat)

X_test = pd.concat(
    (testing_features_dis_df, testing_features_con_df), axis=1
)[model_features].drop(columns=ref_cat)

# Targets as Series taken from the single-column target frames.
# NOTE(review): nothing in this cell checks that the rows of X_train / X_test
# line up with y_train / y_test -- confirm the saved feature files follow the
# same split and row order as the targets.
y_train = train_targets_df['loan_status:Not_Default']
y_test = test_targets_df['loan_status:Not_Default']

In [15]:
# Persist the model-ready feature matrices and targets as CSV files in the
# working directory, keeping the row index in every file.
X_train.to_csv('X_train.csv', index=True)
X_test.to_csv('X_test.csv', index=True)

# y_train / y_test are Series. Series.to_csv used to default to header=False
# and switched to header=True in pandas 0.24 (emitting a FutureWarning during
# the transition), so the header is requested explicitly: the files then have
# the same layout on every pandas version and keep the target column name.
y_train.to_csv('y_train.csv', index=True, header=True)
y_test.to_csv('y_test.csv', index=True, header=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
