# Multivariate Regressions.

*   Section 0. Loading data with preprocessing.
*   Section 1. Univariate regression.
*   Section 2. Multivariate regression.

## Section 0. Loading the data

In [None]:
# import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [None]:
# Mount Google Drive inside the Colab runtime so files stored on Drive can be read.
# The google.colab package is only importable when running inside Google Colab.
from google.colab import drive

drive.mount(mountpoint='/content/drive/')

In [None]:
# Switch the working directory to the data folder on the mounted Drive.
# A shell escape such as
#   !cd '/content/drive/My Drive/Colab Notebooks/data/'
# runs in a temporary subshell, so the directory change is lost as soon as
# the command finishes. The %cd magic below changes the working directory of
# the notebook process itself, so the relative CSV path used in the loading
# cell resolves against this folder.
%cd /content/drive/My Drive/Colab Notebooks/data/

In [None]:
# Read Florida_ct.csv from the current working directory;
# the first column of the file becomes the DataFrame index.
df = pd.read_csv(
    'Florida_ct.csv',
    index_col=0,
)

In [None]:
# Preview only the first rows instead of rendering the whole table.
df.head()

In [None]:
# Preprocessing.
# 1. Build a two-valued education group column in a single step:
#    0 (low education group) where 'edu_higher_edu_ratio' is below 0.3,
#    1 (high education group) everywhere else. A missing ratio fails the
#    "< 0.3" comparison and therefore ends up in group 1.
df['edu_dummy'] = np.where(df['edu_higher_edu_ratio'] < 0.3, 0, 1).astype('int64')

## Section 1. Univariate regression (recap)

In [None]:
# Dependent variable: the 'property_value_median' column.
y = df['property_value_median']
# Independent variable: 'inc_median_household', plus a constant (intercept) column.
X = sm.add_constant(df['inc_median_household'])

# Set up the ordinary least squares model, then estimate it.
model = sm.OLS(y, X)
results = model.fit()

# Goodness of fit of the estimated model.
print("R2: ", results.rsquared)

# Full regression table.
print(results.summary())

In [None]:
# Compare the predicted and observed property values.
# Calling predict() without arguments returns the in-sample predictions of the
# univariate model stored in `results`.
property_value_predicted = results.predict()

fig, ax = plt.subplots(figsize=(8, 8))
# Observed data as semi-transparent grey points.
ax.scatter(
    df['inc_median_household'],
    df['property_value_median'],
    color='grey',
    alpha=0.6,
)
# Fitted regression line drawn on top in red.
ax.plot(
    df['inc_median_household'],
    property_value_predicted,
    color='red',
    linewidth=4,
)
ax.set_xlabel('Income', fontsize=15)
ax.set_ylabel('Property Values', fontsize=15)
ax.set_title('Linear regression for property values and income', fontsize=15)
plt.show()

## Section 2. Multivariate regression

**Baseline Regression 1**

In [None]:
# First baseline regression (univariate):
# 'property_value_median' explained by 'inc_median_household' only.
y = df['property_value_median']
# Single regressor with a constant (intercept) column added.
X = sm.add_constant(df['inc_median_household'])

# Set up the OLS model and estimate it.
model1 = sm.OLS(y, X)
results1 = model1.fit()

# Full regression table.
print(results1.summary())

**Regression with two independent variables**

In [None]:
# Regression with two independent variables.
var_list = [
    'inc_median_household',
    'households',
]
# Dependent variable.
y = df['property_value_median']
# Selected regressors with a constant (intercept) column added.
X = sm.add_constant(df[var_list])

# Set up the OLS model and estimate it.
model2 = sm.OLS(y, X)
results2 = model2.fit()

# Full regression table.
print(results2.summary())

**Regression with multiple independent variables**

In [None]:
# Third regression (multivariate regression).
# Regressors, one per line.
# Candidates currently left out of the list: 'pop_total', 'race_black_ratio'.
var_list = [
    'inc_median_household',
    'households',
    'travel_driving_ratio',
    'travel_pt_ratio',
    'travel_taxi_ratio',
    'travel_work_home_ratio',
    'edu_higher_edu_ratio',
    'edu_dummy',
    'household_size_avg',
    'vacancy_ratio',
    'rent_median',
    'race_white_ratio',
    'race_asian_ratio',
]
# Dependent variable.
y = df['property_value_median']
# Selected regressors with a constant (intercept) column added.
X = sm.add_constant(df[var_list])

# Set up the OLS model and estimate it.
model3 = sm.OLS(y, X)
results3 = model3.fit()

# Full regression table.
print(results3.summary())

**Regression with enriched income variable**

In [None]:
# Fourth regression (multivariate regression): enrich the main variable of
# interest, household income, with derived columns.

# 1. Quadratic term: the square of 'inc_median_household'.
df['inc_median_household_squared'] = df['inc_median_household'] ** 2

# 2. Discrete version: 0 (low income) where 'inc_median_household' is below
#    50000, 1 (high income) everywhere else. The column is created here but is
#    not part of the regressor list below.
df['inc_median_household_discrete'] = np.where(
    df['inc_median_household'] < 50000, 0, 1
).astype('int64')

# Regressors: the third regression's list plus the quadratic income term.
# Candidates currently left out of the list: 'inc_median_household_discrete',
# 'pop_total', 'race_black_ratio'.
var_list = [
    'inc_median_household',
    'inc_median_household_squared',
    'households',
    'travel_driving_ratio',
    'travel_pt_ratio',
    'travel_taxi_ratio',
    'travel_work_home_ratio',
    'edu_higher_edu_ratio',
    'edu_dummy',
    'household_size_avg',
    'vacancy_ratio',
    'rent_median',
    'race_white_ratio',
    'race_asian_ratio',
]
# Dependent variable.
y = df['property_value_median']
# Selected regressors with a constant (intercept) column added.
X = sm.add_constant(df[var_list])

# Set up the OLS model and estimate it.
model4 = sm.OLS(y, X)
results4 = model4.fit()

# Full regression table.
print(results4.summary())


### **Exercise.** Use automobile usage (e.g. `travel_driving_ratio`) as the dependent variable. Then fit several multivariate regressions and choose a final model to interpret.

In [None]:
# List all column names in df, to help choose variables for the exercise above.
df.columns