In [9]:
import statsmodels.api as sm
import pandas as pd
from sklearn.decomposition import FactorAnalysis
from statsmodels.api import OLS, add_constant
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [14]:
# Read the merged Florida table from disk into the working DataFrame `df`,
# which every later cell slices by column name.
df = pd.read_csv(filepath_or_buffer='Merged_Florida_ct.csv')

In [17]:
# Baseline specification: regress the driving share on the control variables only.
X_base = sm.add_constant(  # add_constant supplies the intercept column
    df[[
        'age_median',
        'sex_male_ratio',
        'employment_unemployed_ratio',
        'race_white_ratio',
        'race_black_ratio',
        'race_native_ratio',
        'race_asian_ratio',
        'vehicle_per_household',
        'rent_median',
        'inc_median_household',
    ]]
)
y = df['travel_driving_ratio']

# Missing predictor values are replaced by their column mean; missing target
# values by the target mean.
# NOTE(review): mean-filling the dependent variable keeps every row but may
# bias the fit -- confirm this is intended rather than dropping those rows.
X_base_filled = X_base.fillna(value=X_base.mean())
y_filled = y.fillna(value=y.mean())

model_base = sm.OLS(y_filled, X_base_filled).fit()
base_summary = model_base.summary()

# Save the plain-text regression table next to the notebook.
with open('basemodel_summary.txt', 'w') as f:
    f.write(base_summary.as_text())



In [18]:
# Partial built-environment specification: the ten control columns plus three
# of the 3Ds columns (population density, 8-tier employment entropy,
# auto intersection density).
X_built_env = sm.add_constant(
    df[[
        # controls
        'age_median',
        'sex_male_ratio',
        'employment_unemployed_ratio',
        'race_white_ratio',
        'race_black_ratio',
        'race_native_ratio',
        'race_asian_ratio',
        'vehicle_per_household',
        'rent_median',
        'inc_median_household',
        # selected built-environment columns
        'population_density',
        '8-tier_employment_entropy',
        'auto_intersection_density',
    ]]
)

# Fill gaps in the predictors with each column's mean.
X_built_env_filled = X_built_env.fillna(value=X_built_env.mean())

# The target is the mean-filled series `y_filled` produced by the base-model cell.
model_built_env = sm.OLS(y_filled, X_built_env_filled).fit()
env_summary = model_built_env.summary()

# Save the plain-text regression table.
with open('env_summary.txt', 'w') as f:
    f.write(env_summary.as_text())



In [21]:
# Full built-environment specification: the ten control columns plus every
# built-environment column listed below.
X_built_env_all = sm.add_constant(
    df[[
        # controls
        'age_median',
        'sex_male_ratio',
        'employment_unemployed_ratio',
        'race_white_ratio',
        'race_black_ratio',
        'race_native_ratio',
        'race_asian_ratio',
        'vehicle_per_household',
        'rent_median',
        'inc_median_household',
        # built-environment columns
        'population_density',
        'employment_density',
        'accessibility_index',
        'regional_diversity',
        '5-tier_employment_entropy',
        '8-tier_employment_entropy',
        'employment_household_entropy',
        'trip_equilibrium_index',
        'retail_employment_density',
        'industrial_employment_density',
        'service_employment_density',
        'entertainment_employment_density',
        'road_network_density',
        'pedestrian_intersection_density',
        'street_intersection_density',
        'auto_intersection_density',
        'multi-modal_intersection_density',
        'multi-modal_intersection_density(>4legs)',
        'pedestrian_intersection_density(>4legs)',
    ]]
)

# Fill gaps in the predictors with each column's mean.
X_built_env_all_filled = X_built_env_all.fillna(value=X_built_env_all.mean())

# The target is the mean-filled series `y_filled` produced by the base-model cell.
model_built_env_all = sm.OLS(y_filled, X_built_env_all_filled).fit()
env_all_summary = model_built_env_all.summary()

# Save the plain-text regression table.
with open('env_all_summary.txt', 'w') as f:
    f.write(env_all_summary.as_text())



In [22]:
# Standardize every predictor column (controls + built environment) in one pass.
# NOTE(review): X_std is not referenced again in the cells visible here --
# confirm it is still needed.
scaler = StandardScaler()
X_std = scaler.fit_transform(
    df[[
        # controls
        'age_median',
        'sex_male_ratio',
        'employment_unemployed_ratio',
        'race_white_ratio',
        'race_black_ratio',
        'race_native_ratio',
        'race_asian_ratio',
        'vehicle_per_household',
        'rent_median',
        'inc_median_household',
        # built-environment columns
        'population_density',
        'employment_density',
        'accessibility_index',
        'regional_diversity',
        '5-tier_employment_entropy',
        '8-tier_employment_entropy',
        'employment_household_entropy',
        'trip_equilibrium_index',
        'retail_employment_density',
        'industrial_employment_density',
        'service_employment_density',
        'entertainment_employment_density',
        'road_network_density',
        'pedestrian_intersection_density',
        'street_intersection_density',
        'auto_intersection_density',
        'multi-modal_intersection_density',
        'multi-modal_intersection_density(>4legs)',
        'pedestrian_intersection_density(>4legs)',
    ]]
)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [25]:
# Column groups; each group is later scaled, imputed and reduced to one factor.

# Density-type columns.
intensity_vars = [
    'population_density',
    'employment_density',
    'retail_employment_density',
    'industrial_employment_density',
    'service_employment_density',
    'entertainment_employment_density',
]

# Road-network and intersection columns.
built_env_vars = [
    'road_network_density',
    'pedestrian_intersection_density',
    'street_intersection_density',
    'auto_intersection_density',
    'multi-modal_intersection_density',
    'multi-modal_intersection_density(>4legs)',
    'pedestrian_intersection_density(>4legs)',
]

# Control columns (the same ten used by the regression cells).
control_vars = [
    'age_median',
    'sex_male_ratio',
    'employment_unemployed_ratio',
    'race_white_ratio',
    'race_black_ratio',
    'race_native_ratio',
    'race_asian_ratio',
    'vehicle_per_household',
    'rent_median',
    'inc_median_household',
]

In [26]:
# Standardize each column group separately ahead of the factor analysis.
# One scaler object is re-fitted per group, in order, so after this cell
# `scaler` holds the fit for the control group (the last one processed).
scaler = StandardScaler()
intensity_scaled, built_env_scaled, control_scaled = (
    scaler.fit_transform(df[group_columns])
    for group_columns in (intensity_vars, built_env_vars, control_vars)
)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

In [27]:
# Mean-impute each scaled group. The same imputer object is re-fitted per
# group, in order, so after this cell `imputer` holds the control-group fit.
imputer = SimpleImputer(strategy='mean')
intensity_imputed, built_env_imputed, control_imputed = (
    imputer.fit_transform(scaled_group)
    for scaled_group in (intensity_scaled, built_env_scaled, control_scaled)
)

In [28]:
# Reduce each imputed group to a single factor score per row, using an
# independent one-component FactorAnalysis fit for every group.
fa_intensity, fa_built_env, fa_control = (
    FactorAnalysis(n_components=1).fit_transform(group_matrix)
    for group_matrix in (intensity_imputed, built_env_imputed, control_imputed)
)

In [29]:
# Collect the three single-factor score vectors into one regression table.
# Each score array comes from a one-component fit, so it is flattened to 1-D
# before becoming a column.
# The frame reuses df's index so its rows stay aligned with
# df['travel_driving_ratio'] when both are handed to OLS, instead of relying
# on df happening to carry a default 0..n-1 index.
factors_df = pd.DataFrame(
    {
        'Intensity_Factor': fa_intensity.flatten(),
        'Built_Env_Factor': fa_built_env.flatten(),
        'Control_Factor': fa_control.flatten(),
    },
    index=df.index,
)

In [30]:
# Design matrix: the three factor scores plus an intercept column.
X_factors = add_constant(factors_df)

# Target: driving share, with missing entries replaced by the series mean.
y = df['travel_driving_ratio']
y = y.fillna(y.mean())

# Ordinary least squares of the target on the extracted factors.
model_factors = OLS(y, X_factors).fit()

# Save the plain-text regression table.
with open('factor_analysis_summary.txt', 'w') as f:
    f.write(model_factors.summary().as_text())



In [31]:
# Show the fitted coefficients under a heading (heading and values on
# separate lines), then echo the full regression table to the console.
print("Coefficients of the Factors Model:", model_factors.params, sep="\n")
print(model_factors.summary())

Coefficients of the Factors Model:
const               0.870876
Intensity_Factor   -0.006247
Built_Env_Factor    0.008337
Control_Factor      0.000882
dtype: float64
                             OLS Regression Results                             
Dep. Variable:     travel_driving_ratio   R-squared:                       0.025
Model:                              OLS   Adj. R-squared:                  0.024
Method:                   Least Squares   F-statistic:                     35.71
Date:                  Wed, 21 Feb 2024   Prob (F-statistic):           8.87e-23
Time:                          21:22:54   Log-Likelihood:                 4171.1
No. Observations:                  4167   AIC:                            -8334.
Df Residuals:                      4163   BIC:                            -8309.
Df Model:                             3                                         
Covariance Type:              nonrobust                                         
                       c

In [32]:
# Variance inflation factor for every column of the partial built-environment
# design matrix (the constant column is included in the listing).
vif_data = pd.DataFrame({
    "feature": X_built_env_filled.columns,
    "VIF": [
        variance_inflation_factor(X_built_env_filled.values, column_position)
        for column_position, _ in enumerate(X_built_env_filled.columns)
    ],
})

print(vif_data)

                        feature          VIF
0                         const  1060.193879
1                    age_median     3.243626
2                sex_male_ratio     1.081120
3   employment_unemployed_ratio     2.679439
4              race_white_ratio    26.891234
5              race_black_ratio    24.120728
6             race_native_ratio     1.062442
7              race_asian_ratio     1.766722
8         vehicle_per_household     1.031262
9                   rent_median     2.382089
10         inc_median_household     2.509961
11           population_density     1.223376
12    8-tier_employment_entropy     1.134062
13    auto_intersection_density     1.355141


In [33]:
# Variance inflation factor for every column of the full built-environment
# design matrix (the constant column is included in the listing).
# NOTE(review): the captured output ends with a warning raised at
# `vif = 1. / (1. - r_squared_i)`, which suggests at least one column is
# (near-)perfectly explained by the others -- confirm and consider pruning.
vif_data = pd.DataFrame({
    "feature": X_built_env_all_filled.columns,
    "VIF": [
        variance_inflation_factor(X_built_env_all_filled.values, column_position)
        for column_position, _ in enumerate(X_built_env_all_filled.columns)
    ],
})

print(vif_data)

                                     feature          VIF
0                                      const  3349.600957
1                                 age_median     3.564672
2                             sex_male_ratio     1.098203
3                employment_unemployed_ratio     2.784767
4                           race_white_ratio    27.720307
5                           race_black_ratio    24.878302
6                          race_native_ratio     1.078261
7                           race_asian_ratio     1.844987
8                      vehicle_per_household     1.038537
9                                rent_median     2.656872
10                      inc_median_household     2.583484
11                        population_density    85.554946
12                        employment_density  5656.413788
13                       accessibility_index    49.770760
14                        regional_diversity     3.498393
15                 5-tier_employment_entropy    32.298529
16            

  vif = 1. / (1. - r_squared_i)
