In [1]:
import pandas as pd
df = pd.read_csv('dataset_aapl_amzn.csv')

In [2]:
!pip install linearmodels

Collecting linearmodels
  Downloading linearmodels-7.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting mypy_extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.2.1 (from linearmodels)
  Downloading formulaic-1.2.1-py3-none-any.whl.metadata (7.0 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.2.1->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading linearmodels-7.0-cp311-cp311-macosx_11_0_arm64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading formulaic-1.2.1-py3-none-any.whl (117 kB)
Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Downloading mypy_extensions-1.1.0-py3-none-any.whl (5.0 kB)
Downloading pyhdfe-0.2.0-py3-none-any.whl (19 kB

In [None]:
# If linearmodels is not installed, run first: !pip install linearmodels
from linearmodels.panel import PanelOLS
import pandas as pd

# ==========================================
# Step 1: Prepare panel data index
# ==========================================
# PanelOLS requires index to be in [entity, time] format
# df_final is the final table generated from the previous data cleaning step
df['date'] = pd.to_datetime(df['date'])
df_reg = df.set_index(['stock_name', 'date'])

# ==========================================
# Step 2: Define regression formula
# ==========================================
# Explanation:
# Log_Volume      : Dependent variable (Y)
# 1               : Intercept
# Sentiment_Lag1  : Core variable 1 (retail sentiment)
# Log_Buzz_Lag1   : Core variable 2 (discussion buzz/news attention)
# Return_Abs_Lag1 : Control variable (previous day's absolute return - attracts attention)
# Volatility_Lag1 : Control variable (previous day's volatility - opinion divergence)
# RiskAppetite    : Macro control (market risk appetite)
# Fed_Rate        : Macro control (interest rate environment)
# unemployment    : Macro control (economic fundamentals)
# FOMC            : Event control (Fed meeting days)
# EntityEffects   : Entity fixed effects (controls for inherent trading volume differences between AAPL and AMZN)

formula = """
Log_Volume ~ 1 + 
             Sentiment_Lag1 + 
             Log_Buzz_Lag1 + 
             Return_Abs_Lag1 + 
             Volatility_Lag1 + 
             RiskAppetite + 
             Fed_Rate + 
             unemployment + 
             FOMC + 
             EntityEffects
"""

# ==========================================
# Step 3: Run the model
# ==========================================
# drop_absorbed=True: Automatically drop macro variables that are fully absorbed by fixed effects (avoids errors)
mod = PanelOLS.from_formula(formula, data=df_reg, drop_absorbed=True)

# Use 'clustered' standard errors (clustered by Entity)
# This prevents serial correlation within the same stock from inflating P-values, making results more robust
res = mod.fit(cov_type='clustered', cluster_entity=True)

# ==========================================
# Step 4: View results
# ==========================================
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:             Log_Volume   R-squared:                        0.4445
Estimator:                   PanelOLS   R-squared (Between):              0.0384
No. Observations:                1087   R-squared (Within):               0.4445
Date:                Sat, Nov 29 2025   R-squared (Overall):              0.3770
Time:                        02:14:42   Log-likelihood                   -247.50
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      107.74
Entities:                           2   P-value                           0.0000
Avg Obs:                       543.50   Distribution:                  F(8,1077)
Min Obs:                       541.00                                           
Max Obs:                       546.00   F-statistic (robust):         -7.763e+15
                            

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
