 <h1 style="color:blue;"> Scenario 7 notebook</h1>



- C1S7.Py01	How to create new variables
- C1S7.Py02	Feature engineering errors and how to correct
- C1S7.Py03	Coding debt to income ratio with error catches
- C1S7.Py04	Multiple regression with residuals

In [None]:
#Code Block 1

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



#style options

%matplotlib inline
#if you want graphs to automatically without plt.show

pd.set_option('display.max_columns',500) #allows for up to 500 columns to be displayed when viewing a dataframe

plt.style.use('seaborn') #a style that can be used for plots - see style reference above



In [None]:
#Code Block 2
# Read the scenario data from a path relative to the notebook's working directory
df = pd.read_csv('data/Scenario7.csv', index_col = 0, header=0)
    # index_col = 0 uses the first column of the file as the row index
    # header=0 uses the top row of the file as the column headers

In [None]:
#Code Block 3
# Preview the first 3 rows of the loaded data
df.head(3)

 <h2 style="color:blue;">C1.S7.Py01 - How to create new variables</h2>

<h3 style="color:blue;">  Creating a ratio of loan amount to income  </h3>  

In [None]:
#Code Block 4
# Show the first rows (head() defaults to 5) before any new column is added
df.head()

In [None]:
#Code Block 5

#Create a random sample of up to 1000 rows
# random_state fixes the draw so the plot is identical on every Restart & Run All;
# min() keeps sample() from raising when the data has fewer than 1000 rows
df_sample = df.sample(n=min(1000, len(df)), random_state=42)

# Keep only the sampled rows with Annual Income below 400000
df_sample = df_sample[df_sample['Annual Income']<400000]

# Create a custom scatterplot
sns.set(style='whitegrid')
plt.figure(figsize=(20,10)) #changes area of scatterplot
sns.scatterplot(y='Annual Income', x='Amount Funded', data=df_sample, alpha=.5, s = 250, edgecolor='white', linewidth=2, hue='Home Ownership')
plt.title('Seaborn Scatter plot', color = 'green', fontsize='18')
plt.xlabel('Amount Funded', color = 'red', fontsize='14')
plt.ylabel('Annual Income', color = 'red', fontsize='14')
plt.show()

<h3 style="color:red;">Why use an underscore in the column name?</h3>

In [None]:
#Code Block 6
# Assigning to a column name that does not exist yet creates the column.
# Loan_Income is Amount Funded divided by Annual Income, computed row by row.
df['Loan_Income'] = df['Amount Funded'] / df['Annual Income']
df.head()

<h3 style="color:blue;">  How to overwrite an existing column </h3>  

In [None]:
#Code Block 7

# Assigning to an existing column name overwrites it; the scalar 1 is written to every row
df['Loan_Income'] = 1
df[['Amount Funded', 'Annual Income', 'Loan_Income']].head()

In [None]:
#Code Block 8

# Overwrite the column again, this time with the loan-to-income ratio
df['Loan_Income'] = df['Amount Funded'] / df['Annual Income']
df[['Amount Funded', 'Annual Income', 'Loan_Income']].head()

<h3 style="color:blue;">  How to drop a column </h3>

In [None]:
#Code Block 9

# drop() returns a new DataFrame without the named column, so assign it back to df
df = df.drop(columns='Loan_Income')
df.head(2)

In [None]:
#Code Block 10

# Re-create Loan_Income after the drop; later cells select this column
df['Loan_Income'] = df['Amount Funded'] / df['Annual Income']

<h2 style="color:blue;">C1.S7.Py02 - Feature engineering errors and how to correct</h2>

In [None]:
#Code Block 11

# Preview the first 10 rows, now including Loan_Income
df.head(10)

In [None]:
#Code Block 12

#This will cause an error
# Dividing by zero in plain Python raises ZeroDivisionError, so the
# assignment to z fails and the cell stops at that line
(x,y) = (5,0)
z = x/y
z

In [None]:
#Code Block 13

# Guard the division: a zero denominator prints a message instead of
# stopping the cell with a traceback
x, y = 5, 10
try:
    z = x / y
except ZeroDivisionError:
    print ("divide by zero")


<h3 style="color:red;">  Divide by zero denominator in pandas </h3>  

In [None]:
#Code Block 14
# Total Debt divided by Open Accounts, computed row by row
df['Debt_Account'] = df['Total Debt'] / df['Open Accounts']

In [None]:
#Code Block 15
# Total Debt divided by Delinquencies Past 24 Months, computed row by row.
# Unlike plain Python, pandas does not raise on a zero denominator: the
# result is inf (NaN for 0/0), and a NaN in either input gives NaN.
df['Debt_Delinquent'] = df['Total Debt'] / df['Delinquencies Past 24 Months']

In [None]:
#Code Block 16
# Summary statistics (count, mean, std, min, quartiles, max) for the two new ratio columns
df[['Debt_Account', 'Debt_Delinquent']].describe()

<h3 style="color:blue;">How to change from scientific notation to standard format</h3>  

In [None]:
#Code Block 17
# Same summary statistics, rounded to 2 decimal places for display
df[['Debt_Account', 'Debt_Delinquent']].describe().round(2)

<h3 style="color:blue;"> Using a function for flexibility  </h3>  

In [None]:
#Code Block 18
def delinquent(c):
    """Flag whether a row's 'Delinquencies Past 24 Months' value is missing.

    Parameters
    ----------
    c : mapping or pandas.Series
        One row, indexable by the key 'Delinquencies Past 24 Months'.

    Returns
    -------
    int
        1 when the value is missing (NaN/None per pd.isna), otherwise 2.
    """
    return 1 if pd.isna(c['Delinquencies Past 24 Months']) else 2

# axis=1 passes each row to delinquent(); the returned 1/2 flags are stored in 'change'
df['change'] = df.apply(delinquent, axis=1)

In [None]:
#Code Block 19

# Count how many rows were flagged 1 and how many were flagged 2 in 'change'
df['change'].value_counts()

In [None]:
#Code Block 20

# Remove the three demonstration columns, which are no longer needed
df = df.drop(columns=['Debt_Account', 'Debt_Delinquent','change'])


<h2 style="color:blue;">C1.S7.Py03 - Coding debt to income ratio with error catches</h2>

In [None]:
#Code Block 21

# Debt-to-income ratio, computed row by row; info() then lists each column's
# dtype and non-null count, which shows where values are missing
df['Debt_Income'] = df['Total Debt'] / df['Annual Income']
df.info()

<h3 style="color:blue;">Are there any ['Total Debt'] values == 0?</h3>  

In [None]:
#Code Block 22
# Rows whose Total Debt is exactly 0
display(df[df['Total Debt']==0])
print('----------------------------------')
print('Total Debt - Null Values (NaNs)')
# Rows whose Total Debt is missing (NaN)
display(df[df['Total Debt'].isnull()])

<h3 style="color:green;">Different options for handling Total Debt when creating Debt_Income</h3>  

- **OPTION 1**: Leave Total Debt as **null** and fill in Debt_Income with a 0.
- **OPTION 2**: Fill in NaN values for Total Debt with 0, assuming a **null** Total Debt means the Member does not have debt  

<h3 style="color:green;">OPTION 1: Leave Total Debt as null and fill in Debt_Income with a 0</h3>  

In [None]:
#Code Block 23

# Select the columns used for Option 1
df_opt1 = df[['Interest Rate', 'Amount Funded', 'Total Debt' ,'Annual Income', 'Loan_Income']].copy()
# .copy() makes df_opt1 an independent DataFrame, so changes to it do not affect df

In [None]:
#Code Block 24

#function for evaluating a property with an if/then
def debtincome(c):
    """Return the debt-to-income ratio for one row, or 0 when Total Debt is missing.

    Parameters
    ----------
    c : mapping or pandas.Series
        One row, indexable by the keys 'Total Debt' and 'Annual Income'.

    Returns
    -------
    number
        0 when 'Total Debt' is missing (per pd.isna); otherwise
        'Total Debt' divided by 'Annual Income'.
    """
    total_debt = c['Total Debt']
    if pd.isna(total_debt):
        return 0
    return total_debt / c['Annual Income']

# Apply debtincome() row by row (axis=1) to df_opt1 itself, the frame that
# receives the result; it holds both columns the function reads
df_opt1['Debt_Income'] = df_opt1.apply(debtincome, axis=1)
df_opt1.head()

In [None]:
# Rows where Total Debt is still missing; Debt_Income was set to 0 for these rows
df_opt1[df_opt1['Total Debt'].isnull()]

<h3 style="color:green;">OPTION 2: Fill in NaN values for Total Debt with 0, assuming a null Total Debt means the Member does not have debt  </h3>  

In [None]:
#Code Block 25
# Independent copy of the same columns for Option 2
df_opt2 = df[['Interest Rate', 'Amount Funded', 'Total Debt' ,'Annual Income', 'Loan_Income']].copy()

In [None]:
#Code Block 26
# Replace missing Total Debt values with 0; fillna() returns a new Series, so assign it back
df_opt2['Total Debt'] = df_opt2['Total Debt'].fillna(0)

In [None]:
#Code Block 27
# Rows whose Total Debt is 0, which now includes the rows that were NaN before the fill
df_opt2[df_opt2['Total Debt']==0]

In [None]:
#Code Block 28
# With NaNs already replaced by 0, the ratio can be computed directly for every row
df_opt2['Debt_Income'] = df_opt2['Total Debt'] / df_opt2['Annual Income']
df_opt2[df_opt2['Total Debt']<20] #to show Members with and without Total Debt == 0

In [None]:
## Code Block 29
# Column dtypes and non-null counts for the Option 2 frame
df_opt2.info()

<h2 style="color:blue;">C1.S7.Py04 - Multiple regression with residuals</h2>

<h3 style="color:blue;">Use OPTION 1 (df_opt1) for the multiple regression </h3>  

In [None]:
#Code Block 30
# display() is needed for the first result because only a cell's last
# expression is shown automatically
display(df_opt1.head())
df_opt1.info()

<h3 style="color:red;">CAUTION: The Code Block below will cause an error</h3>  

In [None]:
#Code Block 31
# NOTE: this cell is expected to fail. Total Debt still contains NaN values
# at this point, and those NaNs are passed to the OLS fit inside X.
import statsmodels
import statsmodels.api as sm


# Predictors (X) and response (y) for the Option 1 model
X = df_opt1[['Amount Funded', 'Annual Income', 'Total Debt', 'Loan_Income', 'Debt_Income']]
y = df_opt1['Interest Rate']
X = sm.add_constant(X) # adding a constant

reg_opt1 = sm.OLS(y, X).fit()

# Not reached when the fit above raises
predictions_opt1 = reg_opt1.predict(X)
resid_opt1 = reg_opt1.resid
reg_opt1.summary()


<h3 style="color:red;">What caused the error?  </h3>  

- Total Debt still has NaN values because we left the NaN values alone and placed a 0 in Debt_Income

In [None]:
#Code Block 32
# The non-null counts show which columns of df_opt1 still have missing values
df_opt1.info()

<h3 style="color:blue;">For this regression to work we need to fill in the NaNs or drop the rows that contain NaNs.  </h3>  

In [None]:
#Code Block 33
# Drop every row that has a NaN in any column, then confirm the non-null counts
df_opt1 = df_opt1.dropna()
df_opt1.info()

In [None]:
#Code Block 34
import statsmodels
import statsmodels.api as sm

In [None]:
#Code Block 35
# Predictors for the Option 1 model; add_constant() adds the constant (intercept) column
predictor_cols = ['Amount Funded', 'Annual Income', 'Total Debt', 'Loan_Income', 'Debt_Income']
y = df_opt1['Interest Rate']
X = sm.add_constant(df_opt1[predictor_cols])

# Ordinary least squares fit of Interest Rate on the predictors
reg_opt1 = sm.OLS(y, X).fit()

# Keep the residuals and the fitted values for the cells that follow
resid_opt1 = reg_opt1.resid
predictions_opt1 = reg_opt1.predict(X)
reg_opt1.summary()


In [None]:
#Code Block 36
# Name the prediction and residual Series before concatenating so the new
# columns get their labels directly, instead of relying on pandas labelling
# unnamed Series 0 and 1 and renaming those labels afterwards
df_reg_results = pd.concat(
    [df_opt1,
     predictions_opt1.rename('Int_Pred_opt1'),
     resid_opt1.rename('Resid_opt1')],
    axis=1)
df_reg_results.head(10)

In [None]:
#Code Block 37
# Residual plot for Option 1: residuals (y) against predicted interest rate (x),
# with seaborn's fitted regression line drawn in black
plt.figure(figsize=(20,10)) #changes area of regplot
sns.regplot(x='Int_Pred_opt1', y='Resid_opt1',
              data = df_reg_results, scatter_kws={"color":"blue","alpha":0.15, "s":100,"linewidth":2,"edgecolor":"white"},
              line_kws={'color': 'black'})


In [None]:
#Code Block 38
# Same Option 1 residual plot, with the axes limited to a fixed window
plt.figure(figsize=(20,10)) #changes area of regplot
sns.regplot(x='Int_Pred_opt1', y='Resid_opt1',
              data = df_reg_results, scatter_kws={"color":"blue","alpha":0.15, "s":100,"linewidth":2,"edgecolor":"white"},
              line_kws={'color': 'black'})
# Restrict the visible x range (predictions) and y range (residuals)
plt.xlim(7, 18)
plt.ylim(-11, 14)

In [None]:
#Code Block 39
# Check the Option 2 frame's non-null counts before fitting the regression on it
df_opt2.info()

In [None]:
#Code Block 40

# Same predictors as the Option 1 model, taken from the Option 2 frame;
# add_constant() adds the constant (intercept) column
predictor_cols = ['Amount Funded', 'Annual Income', 'Total Debt', 'Loan_Income', 'Debt_Income']
y = df_opt2['Interest Rate']
X = sm.add_constant(df_opt2[predictor_cols])

# Ordinary least squares fit of Interest Rate on the predictors
reg_opt2 = sm.OLS(y, X).fit()

# Keep the residuals and the fitted values for the cells that follow
resid_opt2 = reg_opt2.resid
predictions_opt2 = reg_opt2.predict(X)
reg_opt2.summary()

In [None]:
#Code Block 41

# Name the prediction and residual Series before concatenating so the new
# columns get their labels directly, instead of relying on pandas labelling
# unnamed Series 0 and 1 and renaming those labels afterwards
df_reg_results2 = pd.concat(
    [df_opt2,
     predictions_opt2.rename('Int_Pred_opt2'),
     resid_opt2.rename('Resid_opt2')],
    axis=1)
df_reg_results2.head(10)

In [None]:
#Code Block 42

# Residual plot for Option 2 (green), using the same axis window as the
# limited Option 1 plot
plt.figure(figsize=(20,10)) #changes area of regplot
sns.regplot(x='Int_Pred_opt2', y='Resid_opt2',
              data = df_reg_results2, scatter_kws={"color":"green","alpha":0.15, "s":100,"linewidth":2,"edgecolor":"white"},
              line_kws={'color': 'black'})
plt.xlim(7, 18)
plt.ylim(-11, 14)

In [None]:
#Code Block 43

sns.set(style='dark')
plt.figure(figsize=(20,14))

# Title styling shared by all four panels (the colour is passed per panel)
title_kw = dict(fontweight='bold', fontsize='17', horizontalalignment='center')


def regplot_kwargs(suffix, data, color):
    """Build the keyword arguments for one residual regplot.

    suffix selects the column pair ('opt1' or 'opt2'); fresh dicts are
    created on every call so the panels never share a kwargs object.
    """
    return dict(
        x='Int_Pred_' + suffix,
        y='Resid_' + suffix,
        data=data,
        scatter_kws={"color": color, "alpha": 0.15, "s": 100, "linewidth": 2, "edgecolor": "white"},
        line_kws={'color': 'black'},
    )


#top left: Option 1, full range
ax1 = plt.subplot2grid((2, 2), (0, 0))
plt.title('Option 1: Full Residual Plot', color='blue', **title_kw)
ax1 = sns.regplot(**regplot_kwargs('opt1', df_reg_results, 'blue'))

#top right: Option 2, full range
ax2 = plt.subplot2grid((2, 2), (0, 1))
plt.title('Option 2: Full Residual Plot', color='green', **title_kw)
ax2 = sns.regplot(**regplot_kwargs('opt2', df_reg_results2, 'green'))

#bottom left: Option 1, limited axis window
ax3 = plt.subplot2grid((2, 2), (1, 0))
plt.title('Option 1: Limit Residual Plot', color='blue', **title_kw)
ax3 = sns.regplot(**regplot_kwargs('opt1', df_reg_results, 'blue'))
plt.xlim(7, 18)
plt.ylim(-11, 14)

#bottom right: Option 2, limited axis window
ax4 = plt.subplot2grid((2, 2), (1, 1))
plt.title('Option 2: Limit Residual Plot', color='green', **title_kw)
ax4 = sns.regplot(**regplot_kwargs('opt2', df_reg_results2, 'green'))
plt.xlim(7, 18)
plt.ylim(-11, 14)