# Fixed Effect Regression Analysis

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels import PanelOLS
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned daily citation table and preview the first rows
df = pd.read_csv(
    '../05_clean_data/processed_data_clean_final.csv'
)
df.head()

Unnamed: 0,date,total_activity,citation_issued,citation_rate,day_of_week,month,days_end_month,end_of_month,year,days_end_year,end_of_year,quarter,days_end_quarter,end_of_quarter,city,state
0,2010-01-01,27,27,1.0,5,1,30,False,2010,180.0,False,3.0,89.0,False,bakersfield,ca
1,2010-01-02,6,6,1.0,6,1,29,False,2010,179.0,False,3.0,88.0,False,bakersfield,ca
2,2010-01-03,13,13,1.0,7,1,28,False,2010,178.0,False,3.0,87.0,False,bakersfield,ca
3,2010-01-04,86,86,1.0,1,1,27,False,2010,177.0,False,3.0,86.0,False,bakersfield,ca
4,2010-01-05,60,60,1.0,2,1,26,False,2010,176.0,False,3.0,85.0,False,bakersfield,ca


In [3]:
# Pair every row's city with its state
city_state = list(zip(df['city'], df['state']))
# De-duplicate the pairs. dict.fromkeys keeps first-appearance order, so the
# iteration order (and therefore the order of everything printed by the
# regression loop) is the same on every kernel restart; list(set(...)) is not,
# because string hashing is randomized per process.
city_state_unique = list(dict.fromkeys(city_state))

In [29]:
# Period-end indicator columns; one regression is fitted per (city, column)
end_of_columns = ["end_of_month", "end_of_quarter", "end_of_year"]

# {city: {end_of_col: {"param_value", "p_value", "r_squared"}}} for every
# regression whose p-value is below the threshold.
# NOTE(review): keyed by city name only, so two cities sharing a name in
# different states would overwrite each other -- confirm names are unique.
significant_cities = dict()
pval_thres = 0.05

# "<city>_<end_of_col>" -> regression summary, split by the sign of the estimate
positive_corr = dict()
negative_corr = dict()

for city, state in city_state_unique:
    # .copy() so the frame is independent of `df` (no writes into a slice)
    df_filtered = df[(df["city"] == city) & (df["state"] == state)].copy()

    # Significant results for this city, keyed by indicator column
    city_results = dict()

    for end_of_col in end_of_columns:
        # Build the panel with the indicator encoded as 1/0. PanelOLS reads
        # index level 0 as the entity and level 1 as the time period, so
        # 'year' is what TimeEffects absorbs.
        panel_data = (
            df_filtered[["date", "citation_issued", end_of_col, "year"]]
            .assign(**{end_of_col: df_filtered[end_of_col].astype(int)})
            .set_index(["date", "year"])
        )

        # Run the regression with TimeEffects
        model = PanelOLS.from_formula(
            f"citation_issued ~ {end_of_col} + TimeEffects", data=panel_data
        )
        result = model.fit()
        summ = result.summary

        # The indicator is the only regressor, so it is the first (and only)
        # entry; .iloc makes the positional lookup explicit.
        coef = result.params.iloc[0]
        p_value = result.pvalues.iloc[0]

        if p_value >= pval_thres:
            continue

        city_results[end_of_col] = {
            "param_value": coef,
            "p_value": p_value,
            "r_squared": result.rsquared,
        }
        significant_cities[city] = city_results

        if coef > 0:
            positive_corr[f"{city}_{end_of_col}"] = summ
            print(
                f"{city} city at state {state} has a positive value for "
                f"{end_of_col} period and citation_issued. The value "
                f"is: {coef:.5f}, with p-value: "
                f"{p_value:.5f}"
            )
        elif coef < 0:
            # Store the summary (not the raw result) so both dictionaries
            # hold the same kind of value
            negative_corr[f"{city}_{end_of_col}"] = summ

durham city at state nc has a positive value for end_of_year period and citation_issued. The value is: 3.85916, with p-values: 0.00198
madison city at state wi has a positive value for end_of_year period and citation_issued. The value is: 5.53688, with p-values: 0.02738
cincinnati city at state oh has a positive value for end_of_month period and citation_issued. The value is: 3.62864, with p-values: 0.00008
aurora city at state co has a positive value for end_of_year period and citation_issued. The value is: 17.22527, with p-values: 0.00123


In [52]:
# List every "<city>_<period column>" key that had a significant, positive estimate
for label in positive_corr:
    print(label)

durham_end_of_year
madison_end_of_year
cincinnati_end_of_month
aurora_end_of_year


> Each regression summary is accessible as a value in the dictionary; for example, call `positive_corr['cincinnati_end_of_month']` to get the regression summary for the Cincinnati end-of-month data.

In [51]:
# List every "<city>_<period column>" key that had a significant, negative estimate
for label in negative_corr:
    print(label)

winston-salem_end_of_month
winston-salem_end_of_quarter
houston_end_of_quarter
bakersfield_end_of_quarter
bakersfield_end_of_year
sanfrancisco_end_of_month
sanfrancisco_end_of_quarter
sanfrancisco_end_of_year
durham_end_of_month
durham_end_of_quarter
baltimore_end_of_month
baltimore_end_of_quarter
baltimore_end_of_year
sanantonio_end_of_month
sanantonio_end_of_quarter
greensboro_end_of_quarter
denver_end_of_quarter
oklahomacity_end_of_quarter
madison_end_of_quarter
seattle_end_of_quarter
seattle_end_of_year
cincinnati_end_of_quarter
aurora_end_of_quarter


> Each regression output is accessible as a value in the dictionary; for example, call `negative_corr['durham_end_of_month']` to get the regression output for the Durham end-of-month data.

In [53]:
# Every city with at least one significant estimate, regardless of sign
for significant_city in significant_cities:
    print(significant_city)

winston-salem
houston
bakersfield
sanfrancisco
durham
baltimore
sanantonio
greensboro
denver
oklahomacity
madison
seattle
cincinnati
aurora


# I didn't change anything below this line
---

# Example of interpretation
## Cincinnati

In [4]:
# Parse the date strings into timestamps (safe to re-run)
df['date'] = pd.to_datetime(df['date'])

# Filter the data by city. .copy() makes df_filtered an independent frame, so
# modifying it later cannot raise SettingWithCopyWarning or write back into df.
city_name = 'cincinnati'
df_filtered = df[df['city'] == city_name].copy()

## End of Month

In [5]:
# Encode the indicator as 1/0. assign() rebinds df_filtered to a new frame
# instead of writing a column into a filtered slice.
df_filtered = df_filtered.assign(end_of_month=df_filtered['end_of_month'].astype(int))

# Create the PanelData structure. PanelOLS reads index level 0 as the entity
# and level 1 as the time period, so 'year' is what TimeEffects absorbs.
panel_data = df_filtered[['date','citation_issued', 'end_of_month', 'year']]
panel_data = panel_data.set_index(['date', 'year'])
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_month
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [6]:
# Regress daily citations on the end-of-month indicator with time fixed effects
model = PanelOLS.from_formula(
    "citation_issued ~ end_of_month + TimeEffects",
    data=panel_data,
)
result = model.fit()

print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                        0.0050
Estimator:                   PanelOLS   R-squared (Between):              0.0216
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Wed, Apr 26 2023   R-squared (Overall):              0.0216
Time:                        08:51:01   Log-likelihood                -1.526e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      17.131
Entities:                        3428   P-value                           0.0000
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             17.131
                            

In [11]:
# Coefficient on the only regressor; .iloc is explicit positional access
# (plain [0] on a label-indexed Series is deprecated in recent pandas)
result.params.iloc[0]

3.6955627335593326

The parameter estimate of 3.6956 indicates that, on average, there is an increase of 3.6956 `citation_issued` when it is the end of the month, holding all other variables constant. The p-value of 0.0000 suggests that this effect is statistically significant at 0.05 significance level meaning that there is strong evidence to conclude that there is a true positive relationship between `end_of_month` and `citation_issued`.

In summary, the results suggest that there is a statistically significant increase in the number of citations issued at the end of the month compared to other times, after accounting for time fixed effects.

## End of Quarter

In [87]:
# Encode the indicator as 1/0. assign() rebinds df_filtered to a new frame
# instead of writing a column into a filtered slice.
df_filtered = df_filtered.assign(end_of_quarter=df_filtered['end_of_quarter'].astype(int))

# Create the PanelData structure. PanelOLS reads index level 0 as the entity
# and level 1 as the time period, so 'year' is what TimeEffects absorbs.
panel_data = df_filtered[['date','citation_issued', 'end_of_quarter', 'year']]
panel_data = panel_data.set_index(['date', 'year'])
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_quarter
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [88]:
# Regress daily citations on the end-of-quarter indicator with time fixed effects
model = PanelOLS.from_formula(
    "citation_issued ~ end_of_quarter + TimeEffects",
    data=panel_data,
)
result = model.fit()

print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                        0.0094
Estimator:                   PanelOLS   R-squared (Between):             -0.0206
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):             -0.0206
Time:                        23:04:03   Log-likelihood                -1.525e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      32.388
Entities:                        3428   P-value                           0.0000
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             32.388
                            

The parameter estimate of -6.2315 indicates that, on average, there is a decrease of 6.2315 `citation_issued` when it is the end of the quarter, holding all other variables constant. The p-value of 0.0000 suggests that this effect is statistically significant at 0.05 significance level meaning that there is strong evidence to conclude that there is a true relationship between `end_of_quarter` and `citation_issued`.

In summary, the results suggest that there is a statistically significant decrease in the number of citations issued at the end of the quarter compared to other times, after accounting for time fixed effects.

## End of Year

In [89]:
# Encode the indicator as 1/0. assign() rebinds df_filtered to a new frame
# instead of writing a column into a filtered slice.
df_filtered = df_filtered.assign(end_of_year=df_filtered['end_of_year'].astype(int))

# Create the PanelData structure. PanelOLS reads index level 0 as the entity
# and level 1 as the time period, so 'year' is what TimeEffects absorbs.
panel_data = df_filtered[['date','citation_issued', 'end_of_year', 'year']]
panel_data = panel_data.set_index(['date', 'year'])
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_year
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [90]:
# Regress daily citations on the end-of-year indicator with time fixed effects
model = PanelOLS.from_formula(
    "citation_issued ~ end_of_year + TimeEffects",
    data=panel_data,
)
result = model.fit()

print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                     3.637e-05
Estimator:                   PanelOLS   R-squared (Between):             -0.0008
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):             -0.0008
Time:                        23:04:03   Log-likelihood                -1.527e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.1243
Entities:                        3428   P-value                           0.7245
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             0.1243
                            

The parameter estimate of -0.6257 indicates that, on average, there is a decrease of 0.6257 in `citation_issued` when it is the end of the year, holding all other variables constant. The p-value of 0.7245 suggests that this effect is not statistically significant at the 0.05 significance level, meaning that there is insufficient evidence to conclude that there is a true relationship between `end_of_year` and `citation_issued`.
