In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.metrics import roc_auc_score
from scipy.stats import fisher_exact


We will be using data from a bike sharing company (Capital Bike Share). Each hour, the number of riders (cnt) is given, along with various other attributes as shown in the table below:

# Question 1
Bring in the bike data from the GitHub repository.  In Python, create a train, test split with the test set of 0.3 and a random seed of 58426.  We will be modeling a binary response variable that indicates when casual users are more than or equal to registered users (i.e. create a "1" if casual >= registered, otherwise "0").  After you create this variable, how many 1's are there?

In [2]:
bike = pd.read_csv('hour.csv')
bike['more_cas'] = np.where(bike['casual'] >= bike['registered'], 1, 0)
train, test = train_test_split(bike, test_size=0.3, random_state=58426)

In [11]:
train['more_cas'].value_counts()

more_cas
0    11889
1      276
Name: count, dtype: int64

# Question 2
Using your newly created response variable, create a Logistic regression with season and temp as the predictor variables.  What is the slope coefficient for temp? Round to two decimal places.

In [3]:
log_model = smf.logit("more_cas ~ temp + C(season)", data = train).fit()

log_model.summary()

Optimization terminated successfully.
         Current function value: 0.106252
         Iterations 9


0,1,2,3
Dep. Variable:,more_cas,No. Observations:,12165.0
Model:,Logit,Df Residuals:,12160.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 22 Jul 2025",Pseudo R-squ.:,0.01913
Time:,09:13:46,Log-Likelihood:,-1292.6
converged:,True,LL-Null:,-1317.8
Covariance Type:,nonrobust,LLR p-value:,2.971e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.9420,0.199,-19.816,0.000,-4.332,-3.552
C(season)[T.2],0.6117,0.203,3.007,0.003,0.213,1.010
C(season)[T.3],0.0249,0.272,0.092,0.927,-0.507,0.557
C(season)[T.4],-0.6441,0.229,-2.811,0.005,-1.093,-0.195
temp,0.1653,0.500,0.331,0.741,-0.814,1.145


# Question 3
Calculate the odds ratio for temp. Round to two decimal places.

In [4]:
fisher_exact(pd.crosstab(index = train['more_cas'], columns = train['temp']))

SignificanceResult(statistic=np.float64(1.0875815889870318e-43), pvalue=np.float64(0.0877))

In [7]:
odds_ratio = np.exp(log_model.params)

print(odds_ratio)

Intercept         0.019410
C(season)[T.2]    1.843627
C(season)[T.3]    1.025184
C(season)[T.4]    0.525115
temp              1.179690
dtype: float64


__________

Data were collected in an effort to relate the safety of certain vehicles to different aspects of those vehicles. 

This dataset has the following variables:

· Unsafe: binary safety designation (1 = below average (unsafe), 0 = average or above average (safe))

· Type: type of car (Large, Medium, Small, Sport/Utility, Sports)

· Region: manufacturing region (Asia, N America)

· Weight: integer value for car weight ranging from 1 to 6

· Size: size of car corresponding to Type (1 = Small/Sports, 2 = Medium, 3 = Large or Sport/Utility)

Do not create a training and test set and just use the whole dataset for the following analysis.

# Question 4
Build a logistic regression, predicting Unsafe using the variables Region, Weight, and Size. Treat Weight as a continuous variable. Treat Region and Size as categorical. Which variable(s) are significant in the model?

In [17]:
safety = pd.read_csv('https://raw.githubusercontent.com/IAA-Faculty/statistical_foundations/refs/heads/master/safety.csv')

In [16]:
log_model = smf.logit("Unsafe ~ C(Region) + C(Size) + Weight", data = safety).fit()

log_model.summary()

Optimization terminated successfully.
         Current function value: 0.437523
         Iterations 7


0,1,2,3
Dep. Variable:,Unsafe,No. Observations:,96.0
Model:,Logit,Df Residuals:,91.0
Method:,MLE,Df Model:,4.0
Date:,"Mon, 21 Jul 2025",Pseudo R-squ.:,0.2956
Time:,23:17:05,Log-Likelihood:,-42.002
converged:,True,LL-Null:,-59.624
Covariance Type:,nonrobust,LLR p-value:,4.139e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.7285,1.395,1.956,0.050,-0.005,5.462
C(Region)[T.N America],-0.3775,0.562,-0.671,0.502,-1.480,0.725
C(Size)[T.2],-2.0200,0.625,-3.234,0.001,-3.244,-0.796
C(Size)[T.3],-2.6785,0.881,-3.040,0.002,-4.405,-0.952
Weight,-0.6678,0.459,-1.455,0.146,-1.567,0.232


# Question 5
We can estimate concordance by using the Area Under the Curve (AUC). To do this with our model (assuming you called the model lp_model), you would do the following code:

from sklearn.metrics import roc_auc_score

Get predicted probabilities:
pred_probs = lp_model.predict(safety)

Calculate AUC:
auc = roc_auc_score(safety['Unsafe'], pred_probs)
print(f"AUC: {auc:.3f}")

What did you get for the AUC for this model. Keep answer accurate to three decimal places.

In [20]:
pred_probs = log_model.predict(safety)
auc = roc_auc_score(safety['Unsafe'], pred_probs)
print(f"AUC: {auc:.3f}")

AUC: 0.848


# Question 6
Remove variables one at a time that have a p-value above 0.05.  Once we are done, the variables left in the model are Size and Weight.

See R Code