# Lec08 - Logistic regression (revisited)

1.   Benchmark logistic regression
2.   Logistic regression in ML



In [1]:
# import modules
import pandas as pd
import numpy as np
import statsmodels.api as sm

# new module sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics



In [2]:
# load the Florida census-tract table; the first CSV column becomes the row index
data_path = 'SampleDataset/Florida_ct.csv'
df = pd.read_csv(data_path, index_col=0)

In [3]:
# preview the first five rows of the table
df.head()

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,travel_walk_ratio,travel_work_home_ratio,edu_bachelor_ratio,edu_master_ratio,edu_phd_ratio,edu_higher_edu_ratio,employment_unemployed_ratio,vehicle_per_capita,vehicle_per_household,vacancy_ratio
0,2812.0,2812.0,1383.0,1429.0,39.4,931.0,2812.0,2086.0,517.0,0.0,...,0.014815,0.024242,0.183838,0.029798,0.00303,0.216667,0.286635,0.528094,1.595059,0.155938
1,4709.0,4709.0,2272.0,2437.0,34.2,1668.0,4709.0,2382.0,1953.0,0.0,...,0.02215,0.004615,0.135222,0.040245,0.00322,0.178686,0.318327,0.460183,1.299161,0.152869
2,5005.0,5005.0,2444.0,2561.0,34.1,1379.0,5005.0,2334.0,2206.0,224.0,...,0.026141,0.027913,0.213247,0.06462,0.007431,0.285299,0.366755,0.450949,1.636693,0.162211
3,6754.0,6754.0,2934.0,3820.0,31.3,2238.0,6754.0,4052.0,1671.0,326.0,...,0.052697,0.004054,0.093379,0.08251,0.012599,0.188488,0.314452,0.47483,1.432976,0.178716
4,3021.0,3021.0,1695.0,1326.0,44.1,1364.0,3021.0,2861.0,121.0,0.0,...,0.003014,0.013059,0.219868,0.138631,0.007064,0.365563,0.218447,0.659053,1.459677,0.33593


In [4]:
# preprocessing
# expensive vs. non-expensive properties as the binary variable:
#   1 = median property value of at least PROPERTY_VALUE_THRESHOLD, 0 = below it
# (the cutoff is an absolute value of the median property value, not a quantile or a ratio)
# NOTE(review): a row whose property_value_median is missing fails the `<` comparison and
# therefore keeps the default label 1 -- confirm the column contains no NaNs.
PROPERTY_VALUE_THRESHOLD = 200000

df['property_value_discrete'] = 1
df.loc[df['property_value_median'] < PROPERTY_VALUE_THRESHOLD, 'property_value_discrete'] = 0


## Section 1. Creating a baseline logistic regression (stat paradigm)

In [5]:
# Baseline logit (statistics paradigm): estimate on the full sample and print the coefficient table.
var_list = [
    'inc_median_household',
    'households',
    'travel_driving_ratio',
    'travel_pt_ratio',
    'travel_taxi_ratio',
    'travel_work_home_ratio',
    'edu_higher_edu_ratio',
    'household_size_avg',
    'vacancy_ratio',
    'rent_median',
    'race_white_ratio',
    'race_asian_ratio',
]

# dependent variable, and the explanatory variables with a constant column added for the intercept
y = df['property_value_discrete']
X = sm.add_constant(df[var_list])

# fit the model and show the regression summary
model = sm.Logit(y, X)
results = model.fit()
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.342854
         Iterations 12
                              Logit Regression Results                             
Dep. Variable:     property_value_discrete   No. Observations:                 4167
Model:                               Logit   Df Residuals:                     4154
Method:                                MLE   Df Model:                           12
Date:                     Tue, 28 Nov 2023   Pseudo R-squ.:                  0.5052
Time:                             16:54:28   Log-Likelihood:                -1428.7
converged:                            True   LL-Null:                       -2887.1
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -7.0535      1.19

## Section 2. Creating a logistic regression (ML paradigm)

In [6]:
# inputs and output for the sklearn model (same explanatory variables as the baseline in Section 1)
var_list = [
    'inc_median_household',
    'households',
    'travel_driving_ratio',
    'travel_pt_ratio',
    'travel_taxi_ratio',
    'travel_work_home_ratio',
    'edu_higher_edu_ratio',
    'household_size_avg',
    'vacancy_ratio',
    'rent_median',
    'race_white_ratio',
    'race_asian_ratio',
]

# convert to plain numpy arrays: X holds the constant column followed by the var_list columns,
# y holds the binary label
y = df['property_value_discrete'].values
X = sm.add_constant(df[var_list]).values


In [7]:
# inspect the input array; its first column is the constant added by sm.add_constant
X

array([[1.00000000e+00, 5.35330000e+04, 9.31000000e+02, ...,
        1.59200000e+03, 7.41820768e-01, 4.33854908e-02],
       [1.00000000e+00, 3.39580000e+04, 1.66800000e+03, ...,
        1.10900000e+03, 5.05839881e-01, 1.86876195e-02],
       [1.00000000e+00, 4.02500000e+04, 1.37900000e+03, ...,
        1.29100000e+03, 4.66333666e-01, 3.17682318e-02],
       ...,
       [1.00000000e+00, 6.57860000e+04, 3.74600000e+03, ...,
        1.10500000e+03, 7.69290273e-01, 2.41732740e-02],
       [1.00000000e+00, 5.92360000e+04, 3.32400000e+03, ...,
        1.06100000e+03, 7.01562500e-01, 4.62053571e-02],
       [1.00000000e+00, 4.68750000e+04, 1.75500000e+03, ...,
        8.99000000e+02, 7.38343498e-01, 1.96734212e-03]])

In [8]:
# hold out 20% of the observations for testing; the fixed random_state makes the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=16)
X_train, X_test, y_train, y_test = split

In [9]:
# report the dimensions of the four pieces, in the order: X_train, X_test, y_train, y_test
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(3333, 13)
(834, 13)
(3333,)
(834,)


In [10]:
# initialize logistic regression
# instantiate the model (using the default parameters)
# NOTE(review): with its defaults sklearn fits its own intercept (fit_intercept=True) and applies
# an L2 penalty; X already carries a constant column from sm.add_constant, so the intercept is
# effectively represented twice -- confirm this is intended.
# NOTE(review): the inputs are unscaled (values in the tens of thousands next to ratios below 1),
# so the default solver may reach its iteration limit -- check for a ConvergenceWarning here.
logreg = LogisticRegression(random_state=16)

# fit the model with training data only
logreg.fit(X_train, y_train)


In [11]:
# check the parameters.
# coef_ holds one row with one weight per column of X (the constant column comes first);
# the intercept fitted by sklearn itself is stored separately in logreg.intercept_
logreg.coef_

array([[-5.02143963e-02,  6.56997126e-05, -2.43955211e-04,
        -4.79921674e-02, -9.28007051e-04, -2.92417212e-05,
         8.72517230e-04,  8.27797943e-03, -1.57951934e+00,
        -8.54350861e-03,  9.72967022e-04, -2.55249333e-02,
         8.73864354e-05]])

In [12]:
# training error: share of training observations the fitted model misclassifies
y_pred = logreg.predict(X_train)
cnf_matrix = metrics.confusion_matrix(y_train, y_pred)

# the two off-diagonal cells of the 2x2 confusion matrix are the misclassified observations
false_positives, false_negatives = cnf_matrix[0, 1], cnf_matrix[1, 0]
prediction_error_training = (false_positives + false_negatives) / cnf_matrix.sum()
print("Training error is: ", prediction_error_training)

Training error is:  0.21182118211821183


In [13]:
# confusion matrix in the training set
# (sklearn convention: rows are the true classes, columns are the predicted classes)
cnf_matrix

array([[1355,  351],
       [ 355, 1272]])

In [14]:
# testing error: share of held-out observations the fitted model misclassifies
y_pred = logreg.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# the two off-diagonal cells of the 2x2 confusion matrix are the misclassified observations
false_positives, false_negatives = cnf_matrix[0, 1], cnf_matrix[1, 0]
prediction_error_testing = (false_positives + false_negatives) / cnf_matrix.sum()
print("Testing error is: ", prediction_error_testing)

Testing error is:  0.20983213429256595


In [15]:
# confusion matrix in the testing set
# (sklearn convention: rows are the true classes, columns are the predicted classes)
cnf_matrix

array([[346,  82],
       [ 93, 313]])

You can see that the training and testing errors are close to each other, because the model complexity is quite limited (i.e., only 13 input columns — 12 explanatory variables plus a constant — and a correspondingly small number of parameters). Remember this result: an overly complex model will have a very low training error but a high testing error. We will discuss an example next time.

## **Exercise.** Create a dummy variable to represent the auto vs. non-auto census tracts by using a threshold value. Then use sklearn to run logistic regression and compute the training and testing errors.

In [16]:
# list the available columns (useful for choosing the variables for the exercise)
df.columns

Index(['pop_total', 'sex_total', 'sex_male', 'sex_female', 'age_median',
       'households', 'race_total', 'race_white', 'race_black', 'race_native',
       'race_asian', 'inc_total_pop', 'inc_no_pop', 'inc_with_pop',
       'inc_pop_10k', 'inc_pop_1k_15k', 'inc_pop_15k_25k', 'inc_pop_25k_35k',
       'inc_pop_35k_50k', 'inc_pop_50k_65k', 'inc_pop_65k_75k', 'inc_pop_75k',
       'inc_median_ind', 'travel_total_to_work', 'travel_driving_to_work',
       'travel_pt_to_work', 'travel_taxi_to_work', 'travel_cycle_to_work',
       'travel_walk_to_work', 'travel_work_from_home', 'edu_total_pop',
       'bachelor_male_25_34', 'master_phd_male_25_34', 'bachelor_male_35_44',
       'master_phd_male_35_44', 'bachelor_male_45_64', 'master_phd_male_45_64',
       'bachelor_male_65_over', 'master_phd_male_65_over',
       'bachelor_female_25_34', 'master_phd_female_25_34',
       'bachelor_female_35_44', 'master_phd_female_35_44',
       'bachelor_female_45_64', 'master_phd_female_45_64',
       '