# Fixed Effect Regression Analysis

In [81]:
# import packages
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from linearmodels import PanelOLS
from statsmodels.formula.api import ols
warnings.filterwarnings('ignore')

In [82]:
# Load the processed citation data.
# The file location can be overridden with the CITATION_DATA_PATH environment variable so
# the notebook is not tied to a single machine; the default is the path originally used.
DATA_PATH = os.environ.get(
    'CITATION_DATA_PATH',
    '/Users/jennyshen/Desktop/processed_data_revised.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,date,total_activity,citation_issued,citation_rate,day_of_week,month,days_end_month,end_of_month,year,days_end_year,end_of_year,quarter,days_end_quarter,end_of_quarter,city,state
0,2010-12-31,122,13,0.106557,5,12,0,True,2010,181.0,False,2.0,0.0,True,denver,co
1,2011-01-01,176,23,0.130682,6,1,30,False,2011,180.0,False,3.0,89.0,False,denver,co
2,2011-01-02,214,24,0.11215,7,1,29,False,2011,179.0,False,3.0,88.0,False,denver,co
3,2011-01-03,250,32,0.128,1,1,28,False,2011,178.0,False,3.0,87.0,False,denver,co
4,2011-01-04,296,44,0.148649,2,1,27,False,2011,177.0,False,3.0,86.0,False,denver,co


In [83]:
# Parse the date strings so the panel index built below holds real timestamps.
df['date'] = pd.to_datetime(df['date'])

# (city, state) pairs to analyse
city_names = [('durham', 'nc'), ('madison', 'wi'), ('seattle', 'wa'), ('cincinnati', 'oh')]

# True/False period-end indicators; each one is tested in its own regression
end_of_columns = ['end_of_month', 'end_of_quarter', 'end_of_year']

for city, state in city_names:
    print("******************************************************************************")
    print(f"\nCity: {city}, State: {state}")

    # Filter the data by city and state. The explicit copy makes df_filtered an
    # independent frame rather than a slice of df.
    df_filtered = df[(df['city'] == city) & (df['state'] == state)].copy()

    # Nothing to estimate if the city/state pair is absent from the data.
    if df_filtered.empty:
        print("No rows found for this city/state; skipping.")
        continue

    for end_of_col in end_of_columns:
        print(f"\nFixed Effects Regression for {end_of_col}:")

        # Build the panel for this indicator without mutating df_filtered:
        #   * the True/False indicator is converted to 1/0 so it enters as a numeric dummy;
        #   * linearmodels reads the first index level ('date') as the entity and the
        #     second ('year') as the time period.
        panel_data = (
            df_filtered[['date', 'citation_issued', end_of_col, 'year']]
            .assign(**{end_of_col: df_filtered[end_of_col].astype(int)})
            .set_index(['date', 'year'])
        )

        # TimeEffects absorbs one effect per value of the time index, i.e. per year.
        model = PanelOLS.from_formula(f"citation_issued ~ {end_of_col} + TimeEffects", data=panel_data)
        result = model.fit()

        print(result)


******************************************************************************

City: durham, State: nc

Fixed Effects Regression for end_of_month:
                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                        0.0032
Estimator:                   PanelOLS   R-squared (Between):             -0.0228
No. Observations:                5112   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):             -0.0228
Time:                        23:04:02   Log-likelihood                -2.208e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      16.349
Entities:                        5112   P-value                           0.0001
Avg Obs:                       1.0000   Distribution:                  F(1,5096)
Min Obs:                       1.0000     

# Example of interpretation
## Cincinnati

In [84]:
# Parse the date strings (harmless if the column is already datetime).
df['date'] = pd.to_datetime(df['date'])

# Filter the data to a single city. The state is matched as well so the selection is
# unambiguous, and .copy() makes df_filtered an independent frame, so the column
# assignments in the following cells do not write into a slice of df.
city_name = 'cincinnati'
state_name = 'oh'
df_filtered = df[(df['city'] == city_name) & (df['state'] == state_name)].copy()

## End of Month

In [85]:
# Convert the True/False indicator to 1/0 so it enters the regression as a numeric dummy.
# Rebinding through assign() avoids writing a column into a filtered slice of df and keeps
# the cell idempotent on re-run.
df_filtered = df_filtered.assign(end_of_month=df_filtered['end_of_month'].astype(int))

# Create the panel structure: linearmodels reads the first index level ('date') as the
# entity and the second ('year') as the time period.
panel_data = (
    df_filtered[['date', 'citation_issued', 'end_of_month', 'year']]
    .set_index(['date', 'year'])
)
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_month
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [86]:
# Regress citations on the end-of-month dummy, absorbing time effects
# (the time level of panel_data's index).
formula = "citation_issued ~ end_of_month + TimeEffects"
model = PanelOLS.from_formula(formula, data=panel_data)
result = model.fit()

# Show the full estimation summary
print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                        0.0050
Estimator:                   PanelOLS   R-squared (Between):              0.0216
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):              0.0216
Time:                        23:04:03   Log-likelihood                -1.526e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      17.131
Entities:                        3428   P-value                           0.0000
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             17.131
                            

The parameter estimate of 3.6956 indicates that, on average, about 3.70 more citations (`citation_issued`) are issued when it is the end of the month than at other times, after controlling for the year fixed effects. The reported p-value of 0.0000 (i.e., smaller than 0.00005) means that this effect is statistically significant at the 0.05 significance level: there is strong evidence of a positive relationship between `end_of_month` and `citation_issued`.

In summary, the results suggest that there is a statistically significant increase in the number of citations issued at the end of the month compared to other times, after accounting for time fixed effects.

## End of Quarter

In [87]:
# Convert the True/False indicator to 1/0 so it enters the regression as a numeric dummy.
# Rebinding through assign() avoids writing a column into a filtered slice of df and keeps
# the cell idempotent on re-run.
df_filtered = df_filtered.assign(end_of_quarter=df_filtered['end_of_quarter'].astype(int))

# Create the panel structure: linearmodels reads the first index level ('date') as the
# entity and the second ('year') as the time period.
panel_data = (
    df_filtered[['date', 'citation_issued', 'end_of_quarter', 'year']]
    .set_index(['date', 'year'])
)
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_quarter
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [88]:
# Regress citations on the end-of-quarter dummy, absorbing time effects
# (the time level of panel_data's index).
formula = "citation_issued ~ end_of_quarter + TimeEffects"
model = PanelOLS.from_formula(formula, data=panel_data)
result = model.fit()

# Show the full estimation summary
print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                        0.0094
Estimator:                   PanelOLS   R-squared (Between):             -0.0206
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):             -0.0206
Time:                        23:04:03   Log-likelihood                -1.525e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      32.388
Entities:                        3428   P-value                           0.0000
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             32.388
                            

The parameter estimate of -6.2315 indicates that, on average, about 6.23 fewer citations (`citation_issued`) are issued when it is the end of the quarter than at other times, after controlling for the year fixed effects. The reported p-value of 0.0000 (i.e., smaller than 0.00005) means that this effect is statistically significant at the 0.05 significance level: there is strong evidence of a negative relationship between `end_of_quarter` and `citation_issued`.

In summary, the results suggest that there is a statistically significant decrease in the number of citations issued at the end of the quarter compared to other times, after accounting for time fixed effects.

## End of Year

In [89]:
# Convert the True/False indicator to 1/0 so it enters the regression as a numeric dummy.
# Rebinding through assign() avoids writing a column into a filtered slice of df and keeps
# the cell idempotent on re-run.
df_filtered = df_filtered.assign(end_of_year=df_filtered['end_of_year'].astype(int))

# Create the panel structure: linearmodels reads the first index level ('date') as the
# entity and the second ('year') as the time period.
panel_data = (
    df_filtered[['date', 'citation_issued', 'end_of_year', 'year']]
    .set_index(['date', 'year'])
)
panel_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,citation_issued,end_of_year
date,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,2009,70,0
2009-01-02,2009,116,0
2009-01-03,2009,118,0
2009-01-04,2009,56,0
2009-01-05,2009,115,0


In [90]:
# Regress citations on the end-of-year dummy, absorbing time effects
# (the time level of panel_data's index).
formula = "citation_issued ~ end_of_year + TimeEffects"
model = PanelOLS.from_formula(formula, data=panel_data)
result = model.fit()

# Show the full estimation summary
print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:        citation_issued   R-squared:                     3.637e-05
Estimator:                   PanelOLS   R-squared (Between):             -0.0008
No. Observations:                3428   R-squared (Within):               0.0000
Date:                Tue, Apr 25 2023   R-squared (Overall):             -0.0008
Time:                        23:04:03   Log-likelihood                -1.527e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.1243
Entities:                        3428   P-value                           0.7245
Avg Obs:                       1.0000   Distribution:                  F(1,3417)
Min Obs:                       1.0000                                           
Max Obs:                       1.0000   F-statistic (robust):             0.1243
                            

The parameter estimate of -0.6257 indicates that, on average, about 0.63 fewer citations (`citation_issued`) are issued when it is the end of the year than at other times, after controlling for the year fixed effects. However, the p-value of 0.7245 means that this effect is not statistically significant at the 0.05 significance level: there is insufficient evidence to conclude that there is any relationship, positive or negative, between `end_of_year` and `citation_issued`.
