In [None]:
# We will be trying to predict whether a person will default on their credit card.

In [None]:
## This function is used to plot the decision boundary in two dimensions.
def plot_decision_boundary(est, x_1, x_2, ax=None, threshold=0.0, contourf=False,
                           grid_size=100):
    """Plot the decision surface of ``est`` over the features ``x_1`` and ``x_2``.

    The estimator is evaluated on a regular ``grid_size`` x ``grid_size`` grid
    spanning the min/max of both features, and the contour where the prediction
    equals ``threshold`` is drawn in black.

    Parameters
    ----------
    est : fitted estimator
        Must provide ``predict_proba`` (column 1 is used) or, failing that,
        ``predict``. It is called with an ``(grid_size**2, 2)`` array.
    x_1, x_2 : objects exposing ``min()`` and ``max()``
        Values of the two features; only their ranges are used.
    ax : axes-like, optional
        Target axes. Defaults to ``plt.gca()``.
    threshold : float, default 0.0
        Level at which the boundary contour is drawn. When ``est`` exposes
        ``predict_proba`` the plotted values are probabilities, so a caller
        will usually want to pass something like ``0.5`` here.
    contourf : bool, default False
        If true, also draw a filled contour of the predictions using ten
        levels between 0 and 1.
    grid_size : int, default 100
        Number of grid points along each feature axis.

    Returns
    -------
    None
    """
    # Compute each range once; it is needed for both the grid and the axis limits.
    x1_lo, x1_hi = x_1.min(), x_1.max()
    x2_lo, x2_hi = x_2.min(), x_2.max()

    xx1, xx2 = np.meshgrid(np.linspace(x1_lo, x1_hi, grid_size),
                           np.linspace(x2_lo, x2_hi, grid_size))

    # Flatten the 2-d grid into a sequence of (x_1, x_2) points for the estimator.
    X_pred = np.c_[xx1.ravel(), xx2.ravel()]
    if hasattr(est, 'predict_proba'):
        # Probability of the positive class.
        pred = est.predict_proba(X_pred)[:, 1]
    else:
        pred = est.predict(X_pred)

    # Back from a flat sequence to the grid layout expected by contour().
    Z = np.asarray(pred).reshape(xx1.shape)

    if ax is None:
        ax = plt.gca()

    if contourf:
        ax.contourf(xx1, xx2, Z, levels=np.linspace(0, 1.0, 10), cmap=plt.cm.RdBu, alpha=0.6)
    # The decision boundary itself is the single contour line at ``threshold``.
    ax.contour(xx1, xx2, Z, levels=[threshold], colors='black')

    ax.set_xlim((x1_lo, x1_hi))
    ax.set_ylim((x2_lo, x2_hi))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

# NOTE(review): the original read_csv call had no argument and was never closed,
# so this cell could not run. The file name below is a placeholder for the
# credit-card default data set -- TODO confirm the real location.
DEFAULT_CSV_PATH = 'Default.csv'
df = pd.read_csv(DEFAULT_CSV_PATH)

# The rest of the notebook reads these four columns; fail early if any is absent.
missing_columns = {'default', 'student', 'balance', 'income'} - set(df.columns)
assert not missing_columns, "missing expected columns: {}".format(sorted(missing_columns))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.columns)

# Let's quickly describe the data
df.describe()
    

In [None]:
# A cross-tabulation counts how often each pair of values occurs, which makes the
# relationship between two columns easy to read at a glance.
student_by_default = pd.crosstab(df['student'], df['default'])
# Show the 0/1 default codes as readable column headers.
student_by_default.rename(columns={0: 'No', 1: 'Yes'})

In [None]:
# Note that the populations of non-defaulted vs. defaulted credit cards are very asymmetrical.
# For simplicity in this exercise, we'll downsample the observations that haven't defaulted.
# Positional indices of the rows that have NOT defaulted (default == 0).
indices = np.where(df.default == 0)[0]
# Fixed seed so that the same rows are kept on every run.
rng = np.random.RandomState(13)

# NOTE(review): the original cell stopped after creating ``rng``; nothing was
# actually downsampled and ``d``, which the following cells use, was never
# defined. The steps below complete the downsampling to a 1:1 class ratio --
# confirm that this is the intended ratio.
rng.shuffle(indices)
n_pos = int((df.default == 1).sum())
# Keep the first ``n_pos`` shuffled non-defaulters and drop the rest. ``df`` is
# left untouched so that re-running this cell gives the same result.
d = df.drop(df.index[indices[n_pos:]])

In [None]:
# Convert everything to numeric before splitting.
# 'student' holds 'Yes'/'No' strings; encode it as 1 for 'Yes' and 0 otherwise.
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it with 'Yes' again would silently turn every value into 0.
if d['student'].dtype == object:
    d['student'] = np.where(d['student'] == 'Yes', 1, 0)

# 2 - Split the data into train and test sets
X = d[['balance', 'student', 'income']]
y = d['default']
# random_state is just a seed value for the initial state, so the split is
# reproducible; test_size=0.3 holds out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [None]:
# Bundle each feature split with its labels into a single frame, for convenience.
# X_train / X_test come from splitting a DataFrame, so they are copied explicitly
# rather than re-wrapped: adding the 'default' column must not touch the
# feature-only frames that the models are fitted on.
feature_columns = ['balance', 'student', 'income']

train = X_train[feature_columns].copy()
train['default'] = y_train  # aligned on the shared row index

test = X_test[feature_columns].copy()
test['default'] = y_test

In [None]:
# A full scatter matrix -- pd.scatter_matrix(train, figsize=(20,20)) -- was tried
# here but dropped: it was not possible to see much in it.
# 3 - Create a histogram of all variables in the training frame.
# The trailing semicolon keeps the notebook from echoing the returned value.
train.hist();

In [None]:
# 4 - Scatter plot of income (vertical axis) against balance (horizontal axis).
# A low alpha lets dense regions show up darker than sparse ones.
income_vs_balance = train.plot(kind='scatter', x='balance', y='income', alpha=0.1)
# Fix the visible window on the axes that were just drawn.
income_vs_balance.set_ylim([0,80000])
income_vs_balance.set_xlim([0, 2800])

In [None]:
# 5 - Mark defaults with a different color and symbol
# Split the TRAINING frame by outcome. The original sliced ``d`` (all rows,
# including the held-out test rows) even though the variables are named train_*
# and the previous plot uses ``train``.
train_nd = train[train['default'] == 0]
train_d = train[train['default'] == 1]

plt.figure()

# Labels are attached to each scatter so the legend cannot get out of sync
# with the order in which the series are drawn.
plt.scatter(train_nd.balance, train_nd.income, alpha=.5, marker='+', c='b',
            label='no default')
plt.scatter(train_d.balance, train_d.income, marker='o', edgecolors='r', facecolors='none',
            label='default')
plt.ylim([0, 80000]); plt.xlim([0, 2800])
plt.xlabel('balance'); plt.ylabel('income')
plt.legend(loc='upper right');

In [None]:
# What can you infer from this plot?
# It appears that balance is more strongly related to default than income is.

# PART II: Training the Model

# 1 - Run a logistic regression on the balance variable
# 2 - Is the beta value associated with balance significant?
# NOTE(review): nothing in this cell runs a significance test; it only fits the
# model and prints the fitted coefficients.
balance = LogisticRegression()
balance.fit(train[['balance']], y_train)
B1 = balance.coef_[0][0]    # slope for the single 'balance' feature
B0 = balance.intercept_[0]  # intercept
# exp(B1) is the factor by which the odds are multiplied per unit of balance.
# The original computed this value and then discarded it.
odds_ratio = np.exp(B1)

# str.format keeps the output identical on Python 2 and Python 3.
print("Beta1 {}".format(B1))
print("Beta0 {}".format(B0))
print("exp(Beta1) {}".format(odds_ratio))
# Values noted by the author (presumably from an earlier run):
#beta1 = 0.00429
#Beta0 = -8.6011


In [None]:
# Beta is significant!
# NOTE(review): no significance test is run in the visible cells -- confirm this claim.
# 2 - Predict the probability of default for someone with a balance of $1.2k and $2.5k
pts = np.array([1200, 2500])
# predict_proba returns one column per class; column 1 is the positive class.
# The original called predict(), which does not return probabilities, and the
# call was missing its closing parenthesis.
prob = balance.predict_proba(pd.DataFrame({'balance': pts}))[:, 1]
print(prob)

# What does beta mean? Let's create some plots to find out!
x = np.linspace(test.balance.min(), test.balance.max(), 500)
beta = [B0, B1]

# Three views of the same fitted line. The probability curve gets its own name
# so that it does not overwrite ``y``, the label vector used for the split.
log_odds = beta[0] + beta[1]*x
odds = np.exp(log_odds)
prob_curve = odds / (1 + odds)

# The same three quantities at the two balances of interest.
log_odds_pts = beta[0] + beta[1]*pts
odds_pts = np.exp(log_odds_pts)
ypts = odds_pts / (1 + odds_pts)

# One figure with three stacked panels; explicit axes avoid re-selecting
# subplots through the pyplot state machine.
fig, (ax_prob, ax_odds, ax_log_odds) = plt.subplots(3, 1, figsize=(7, 8))

# Plot the probability
ax_prob.plot(x, prob_curve, 'r', linewidth=2)
ax_prob.set_ylabel('Probability')
ax_prob.text(500, 0.7, r'$\frac{e^{\beta_o + \beta_1x}}{1+e^{\beta_o + \beta_1x}}$', fontsize=25)

# Plot the odds
ax_odds.plot(x, odds, 'k', linewidth=2)
ax_odds.set_ylabel('Odds')
ax_odds.text(500, 10, r'$e^{\beta_o + \beta_1x}$', fontsize=20)

# Plot the log odds
ax_log_odds.plot(x, log_odds, 'c', linewidth=2)
ax_log_odds.set_ylabel('Log(Odds)')
ax_log_odds.set_xlabel('x')
ax_log_odds.text(500, 1, r'$\beta_o + \beta_1x$', fontsize=15)

# Let's try plotting some points
ax_prob.plot(pts, ypts, 'ko')
ax_odds.plot(pts, odds_pts, 'ro')
ax_log_odds.plot(pts, log_odds_pts, 'ko');