# Project 3

In this project, you will perform a logistic regression on the admissions data we've been working with in projects 1 and 2.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np


In [2]:
df_raw = pd.read_csv("../assets/admissions.csv")
df = df_raw.dropna() 
print df.head()

df.columns

   admit    gre   gpa  prestige
0      0  380.0  3.61       3.0
1      1  660.0  3.67       3.0
2      1  800.0  4.00       1.0
3      1  640.0  3.19       4.0
4      0  520.0  2.93       4.0


Index([u'admit', u'gre', u'gpa', u'prestige'], dtype='object')

## Part 1. Frequency Tables

#### 1. Let's create a frequency table of our variables

In [3]:
# frequency table for prestige and whether or not someone was admitted
admitprestige = pd.crosstab(index=df["admit"],columns=df["prestige"],margins=True)

admitprestige.columns = ["prestige1","prestige2","prestige3","prestige 4", "rowtotal"]
admitprestige.index = ["denied", "admit", "coltotal"]

admitprestige

Unnamed: 0,prestige1,prestige2,prestige3,prestige 4,rowtotal
denied,28,95,93,55,271
admit,33,53,28,12,126
coltotal,61,148,121,67,397


## Part 2. Return of dummy variables

#### 2.1 Create class or dummy variables for prestige 

In [4]:
df.columns

Index([u'admit', u'gre', u'gpa', u'prestige'], dtype='object')

In [5]:
prestige = pd.get_dummies(df.prestige)
prestige.columns = ['prestige_' + str(i) for i in prestige.columns]

df = pd.concat([df,prestige], axis=1)

df.columns

Index([u'admit', u'gre', u'gpa', u'prestige', u'prestige_1.0', u'prestige_2.0',
       u'prestige_3.0', u'prestige_4.0'],
      dtype='object')

#### 2.2 When modeling our class variables, how many do we need? 



Answer: 3. There are 4 possible values for "prestige" so given 3 of them, we can figure out the fourth.

## Part 3. Hand calculating odds ratios

Develop your intuition about expected outcomes by hand calculating odds ratios.

In [6]:
cols_to_keep = ['admit', 'gre', 'gpa','prestige_1.0','prestige_2.0','prestige_3.0']
handCalc = df[cols_to_keep]
print handCalc.head()

   admit    gre   gpa  prestige_1.0  prestige_2.0  prestige_3.0
0      0  380.0  3.61           0.0           0.0           1.0
1      1  660.0  3.67           0.0           0.0           1.0
2      1  800.0  4.00           1.0           0.0           0.0
3      1  640.0  3.19           0.0           0.0           0.0
4      0  520.0  2.93           0.0           0.0           0.0


In [7]:
#crosstab prestige 1 admission 
# frequency table cutting prestige and whether or not someone was admitted
prestige1_admit = pd.crosstab(index=df["admit"],columns=df["prestige_1.0"])

prestige1_admit.columns = ["no","yes"]
prestige1_admit.index = ["denied", "admit"]

prestige1_admit

Unnamed: 0,no,yes
denied,243,28
admit,93,33


#### 3.1 Use the cross tab above to calculate the odds of being admitted to grad school if you attended a #1 ranked college

In [8]:
33./(33+93)

0.2619047619047619

#### 3.2 Now calculate the odds of admission if you did not attend a #1 ranked college

In [9]:
28./(243+28)

0.1033210332103321

#### 3.3 Calculate the odds ratio

In [10]:
.261904/.1033210

2.5348573862041603

In [11]:
(33./(33+93))/(28./(243+28))

2.5348639455782314

#### 3.4 Write this finding in a sentence: 

Answer: The odds of being admitted to grad school given you attended a #1 ranked college are ~2.53 times better than had you not attended a #1 ranked college.

#### 3.5 Print the cross tab for prestige_4

In [12]:
prestige4_admit = pd.crosstab(index=df["admit"],columns=df["prestige_4.0"])

prestige4_admit.columns = ["no","yes"]
prestige4_admit.index = ["denied", "admit"]

prestige4_admit

Unnamed: 0,no,yes
denied,216,55
admit,114,12


#### 3.6 Calculate the OR 

In [13]:
(12./(12+114))/(55./(55+216))

0.4692640692640692

#### 3.7 Write this finding in a sentence

Answer: The odds of being admitted to grad school given you attended a #4 ranked college are ~0.469 as good as if you had attended a higher ranked college.

## Part 4. Analysis

In [14]:
# create a clean data frame for the regression
cols_to_drop = ['prestige','prestige_4.0']
data = df.drop(cols_to_drop, axis=1)
print data.head()

   admit    gre   gpa  prestige_1.0  prestige_2.0  prestige_3.0
0      0  380.0  3.61           0.0           0.0           1.0
1      1  660.0  3.67           0.0           0.0           1.0
2      1  800.0  4.00           1.0           0.0           0.0
3      1  640.0  3.19           0.0           0.0           0.0
4      0  520.0  2.93           0.0           0.0           0.0


We're going to add a constant term for our Logistic Regression. The statsmodels function we're going to be using requires that intercepts/constants are specified explicitly.

In [15]:
# manually add the intercept
data['intercept'] = 1.0

#### 4.1 Set the covariates to a variable called train_cols

In [16]:
train_cols = ['gre','gpa','prestige_1.0','prestige_2.0','prestige_3.0']

#### 4.2 Fit the model

In [17]:
from sklearn import feature_selection
from sklearn.linear_model import LogisticRegression


In [18]:
from sklearn.cross_validation import train_test_split


X = data[train_cols]
y = data['admit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [19]:
train = pd.DataFrame(data=X_train, columns=[train_cols])
train['admit'] = y_train

test = pd.DataFrame(data=X_test, columns=[train_cols])
test['admit'] = y_test

In [20]:
mymodel = LogisticRegression()

mymodel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### 4.3 Print the summary results

In [21]:
B5 = mymodel.coef_[0][4]
B4 = mymodel.coef_[0][3]
B3 = mymodel.coef_[0][2]
B2 = mymodel.coef_[0][1]
B1 = mymodel.coef_[0][0]
B0 = mymodel.intercept_[0]

print "Beta0: ", B0
print "Beta1: ", B1
print "Beta2: ", B2
print "Beta3: ", B3
print "Beta4: ", B4
print "Beta5: ", B5

Beta0:  -2.07310954923
Beta1:  0.00240863007708
Beta2:  -0.13599629078
Beta3:  1.26125949548
Beta4:  0.780165780241
Beta5:  -0.0569114185254


#### 4.4 Calculate the odds ratios of the coeffiencents and their 95% CI intervals

hint 1: np.exp(X)

hint 2: conf['OR'] = params
        
           conf.columns = ['2.5%', '97.5%', 'OR']

In [22]:
print "Odds Ratio for B1 = ", np.exp(B1)
print "Odds Ratio for B2 = ", np.exp(B2)
print "Odds Ratio for B3 = ", np.exp(B3)
print "Odds Ratio for B4 = ", np.exp(B4)
print "Odds Ratio for B5 = ", np.exp(B5)

Odds Ratio for B1 =  1.00241153316
Odds Ratio for B2 =  0.87284587006
Odds Ratio for B3 =  3.52986453721
Odds Ratio for B4 =  2.18183394047
Odds Ratio for B5 =  0.944677746608


In [34]:
# not sure how to do the confidence intervals
params = mymodel.params
conf = mymodel.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print np.exp(conf)

NameError: name 'params' is not defined

#### 4.5 Interpret the OR of Prestige_2

In [23]:
prestige2 = LogisticRegression()

prestige2.fit(train[['prestige_2.0']], y_train)

B1 = prestige2.coef_[0][0] 
B0 = prestige2.intercept_[0]

print "B1: ", B1
print "B0: ", B0

B1:  0.544864786926
B0:  -0.85528741373


In [24]:
print "Odds Ratio of Prestige_2 = ",np.exp(B1)

Odds Ratio of Prestige_2 =  1.72437520854


#### 4.6 Interpret the OR of GPA

In [25]:
gpa = LogisticRegression()

gpa.fit(train[['gpa']], y_train)

B1 = gpa.coef_[0][0] 
B0 = gpa.intercept_[0]

print "B1: ", B1
print "B0: ", B0

B1:  0.293333135356
B0:  -1.63845029498


In [26]:
print "Odds Ratio of GPA = ",np.exp(B1)

Odds Ratio of GPA =  1.34088941371


## Part 5: Predicted probablities


As a way of evaluating our classifier, we're going to recreate the dataset with every logical combination of input values. This will allow us to see how the predicted probability of admission increases/decreases across different variables. First we're going to generate the combinations using a helper function called cartesian (above).

We're going to use np.linspace to create a range of values for "gre" and "gpa". This creates a range of linearly spaced values from a specified min and maximum value--in our case just the min/max observed values.

In [27]:
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.
    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray
        Array to place the cartesian product in.
    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays.
    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])
    """

    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    n = np.prod([x.size for x in arrays])
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    m = n / arrays[0].size
    out[:,0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        cartesian(arrays[1:], out=out[0:m,1:])
        for j in xrange(1, arrays[0].size):
            out[j*m:(j+1)*m,1:] = out[0:m,1:]
    return out

In [28]:
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max 
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
print gres
# array([ 220.        ,  284.44444444,  348.88888889,  413.33333333,
#         477.77777778,  542.22222222,  606.66666667,  671.11111111,
#         735.55555556,  800.        ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print gpas
# array([ 2.26      ,  2.45333333,  2.64666667,  2.84      ,  3.03333333,
#         3.22666667,  3.42      ,  3.61333333,  3.80666667,  4.        ])


# enumerate all possibilities
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))

[ 220.          284.44444444  348.88888889  413.33333333  477.77777778
  542.22222222  606.66666667  671.11111111  735.55555556  800.        ]
[ 2.26        2.45333333  2.64666667  2.84        3.03333333  3.22666667
  3.42        3.61333333  3.80666667  4.        ]


#### 5.1 Recreate the dummy variables

In [29]:
# recreate the dummy variables

# keep only what we need for making predictions

pr1 = np.linspace(data['prestige_1.0'].min(), data['prestige_1.0'].max(), 10)
print pr1

pr2 = np.linspace(data['prestige_2.0'].min(), data['prestige_2.0'].max(), 10)
print pr1

pr3 = np.linspace(data['prestige_3.0'].min(), data['prestige_3.0'].max(), 10)
print pr1


# enumerate all possibilities
combos = pd.DataFrame(cartesian([pr1, pr2, pr3, [1, 2, 3, 4], [1.]]))

[ 0.          0.11111111  0.22222222  0.33333333  0.44444444  0.55555556
  0.66666667  0.77777778  0.88888889  1.        ]
[ 0.          0.11111111  0.22222222  0.33333333  0.44444444  0.55555556
  0.66666667  0.77777778  0.88888889  1.        ]
[ 0.          0.11111111  0.22222222  0.33333333  0.44444444  0.55555556
  0.66666667  0.77777778  0.88888889  1.        ]


#### 5.2 Make predictions on the enumerated dataset

In [30]:
combos = LogisticRegression()

combos.fit(train[['prestige_2.0']], y_train)

B1 = combos.coef_[0][0] 
B0 = combos.intercept_[0]

print "B1: ", B1
print "B0: ", B0

B1:  0.544864786926
B0:  -0.85528741373


#### 5.3 Interpret findings for the last 4 observations

Answer: It looks like the above example is basically treating each variable as a categorical variable and is creating dummies for each of them.

## Bonus

Plot the probability of being admitted into graduate school, stratified by GPA and GRE score.