In [None]:
# ⚙️ Install Required Packages
!pip install linearmodels statsmodels --quiet

# 📚 Import Libraries
import pandas as pd
import numpy as np
from linearmodels.panel import PanelOLS, RandomEffects
from statsmodels.formula.api import ols
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# 📁 Load Data
# Read the regional panel and keep complete cases only: any row with a missing
# value is discarded before the estimators see the data.
df = pd.read_csv("Environment_Regional_Dataset__Unbalanced_.csv")
df = df.dropna()
df['RegionID'] = df['RegionID'].astype('category')
df['Year'] = pd.to_numeric(df['Year'])

# ⚠️ Check Panel Balance
# Count the distinct years available for each region; a spread in these counts
# means the panel is unbalanced.
# `observed=True` is passed explicitly because RegionID is categorical and
# relying on the deprecated default makes pandas emit a FutureWarning. The
# categories are built from the rows that survived dropna(), so every category
# is observed and the counts are unchanged.
obs_counts = df.groupby('RegionID', observed=True)['Year'].nunique()
print("📊 Unbalanced Panel Check:\n", obs_counts.describe())

# 🧱 Set Panel Index
# Entity (RegionID) as the outer level and time (Year) as the inner level of
# the MultiIndex, which is the layout the panel models below are given.
df = df.set_index(['RegionID', 'Year'])

# 🎯 Define Regression Formula
dependent = 'PollutionIndex'
independent_vars = ['ForestCover', 'AvgTemperature', 'Rainfall']
regressors = ' + '.join(independent_vars)
formula = dependent + ' ~ ' + regressors

# 🔁 Fixed Effects Model
# The entity effects absorb the overall level of the dependent variable, so no
# explicit constant is added to this specification.
fe_model = PanelOLS.from_formula(formula + ' + EntityEffects', data=df).fit()
print("\n📌 Fixed Effects Summary:\n", fe_model)

# 🔁 Random Effects Model
# linearmodels formulas do not add an implicit constant; without the explicit
# `1 +` the random-effects fit would be forced through the origin and its slope
# estimates would not be comparable with the fixed-effects ones.
re_formula = dependent + ' ~ 1 + ' + regressors
re_model = RandomEffects.from_formula(re_formula, data=df).fit()
print("\n📌 Random Effects Summary:\n", re_model)

# 🧪 Hausman Test
def hausman(fe, re, variables=None):
    """Run a Hausman specification test on two fitted panel models.

    The statistic is ``d' (V_fe - V_re)^-1 d`` where ``d`` is the difference
    between the fixed-effects and random-effects coefficient vectors, compared
    against a chi-square distribution with one degree of freedom per
    coefficient.

    Parameters
    ----------
    fe, re
        Fitted results exposing ``params`` (a pandas Series of coefficients)
        and ``cov`` (a pandas DataFrame with the coefficient covariance),
        both labelled by coefficient name.
    variables : sequence of str, optional
        Coefficients to compare. When omitted, every coefficient present in
        both models is used, in the order of ``fe.params``; a term that only
        one model estimates (e.g. an intercept) is therefore ignored.

    Returns
    -------
    tuple of (float, float)
        The test statistic and its p-value.

    Raises
    ------
    ValueError
        If there is no coefficient to compare.
    numpy.linalg.LinAlgError
        If the covariance difference is singular.
    """
    if variables is None:
        variables = [name for name in fe.params.index if name in re.params.index]
    variables = list(variables)
    if not variables:
        raise ValueError("hausman: the two models share no coefficients to compare")

    coef_diff = (fe.params[variables] - re.params[variables]).to_numpy(dtype=float)
    cov_diff = (
        fe.cov.loc[variables, variables] - re.cov.loc[variables, variables]
    ).to_numpy(dtype=float)

    # Solve the linear system rather than forming an explicit inverse: same
    # quadratic form, better numerical behaviour on ill-conditioned matrices.
    stat = float(coef_diff @ np.linalg.solve(cov_diff, coef_diff))
    # The survival function keeps precision in the upper tail, where
    # `1 - cdf` rounds to exactly 0.0.
    pval = float(stats.chi2.sf(stat, len(variables)))
    return stat, pval

# Compare the two fits and report which specification the test favours,
# using the conventional 5% significance level.
hausman_result = hausman(fe_model, re_model)
hausman_stat, p_val = hausman_result
print(f"\n🔍 Hausman Test Statistic: {hausman_stat:.3f}, p-value: {p_val:.3f}")
if p_val < 0.05:
    print("➡️ Use Fixed Effects")
else:
    print("➡️ Use Random Effects")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m91.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

  obs_counts = df.groupby('RegionID')['Year'].nunique()
  group_mu = self._frame.groupby(level=level).transform("mean")
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  weighted_sum: DataFrame = frame.groupby(level=level).transform("sum")
  sum_weights: DataFrame = frame.groupby(level=level).transform("sum")
  weighted_sum: DataFrame = frame.groupby(level=level).transform("sum")


📊 Unbalanced Panel Check:
 count    60.000000
mean     10.200000
std       4.649768
min       5.000000
25%       7.000000
50%       8.000000
75%      14.000000
max      25.000000
Name: Year, dtype: float64

📌 Fixed Effects Summary:
                           PanelOLS Estimation Summary                           
Dep. Variable:         PollutionIndex   R-squared:                        0.0027
Estimator:                   PanelOLS   R-squared (Between):             -0.1025
No. Observations:                 612   R-squared (Within):               0.0027
Date:                Thu, May 29 2025   R-squared (Overall):             -0.0944
Time:                        20:35:09   Log-likelihood                   -2502.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.4979
Entities:                          60   P-value                           0.6839
Avg Obs:                       10.200 

  sum_weights: DataFrame = frame.groupby(level=level).transform("sum")
  weighted_sum = frame.groupby(level=level).sum()
  sum_weights = frame.groupby(level=level).sum()
  weighted_sum = frame.groupby(level=level).sum()
  sum_weights = frame.groupby(level=level).sum()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  out = self._frame.groupby(level=level).count()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")
  mu = self._frame.groupby(level=level).mean()
  mu = self._frame.groupby(level=level).mean()
  group_mu = self._frame.groupby(level=level).transform("mean")


In [None]:
# Print the full summary table of each fitted model under its own banner.
for banner, fitted in (
    ("\n--- FIXED EFFECTS MODEL ---", fe_model),
    ("\n--- RANDOM EFFECTS MODEL ---", re_model),
):
    print(banner)
    print(fitted.summary)

# Re-run the Hausman comparison and state the preferred specification at the
# 5% significance level.
hausman_stat, p_val = hausman(fe_model, re_model)
print(f"\nHausman test: stat={hausman_stat:.3f}, p={p_val:.3f}")
verdict = "➡️ Use Fixed Effects" if p_val < 0.05 else "➡️ Use Random Effects"
print(verdict)


--- FIXED EFFECTS MODEL ---
                          PanelOLS Estimation Summary                           
Dep. Variable:         PollutionIndex   R-squared:                        0.0027
Estimator:                   PanelOLS   R-squared (Between):             -0.1025
No. Observations:                 612   R-squared (Within):               0.0027
Date:                Thu, May 29 2025   R-squared (Overall):             -0.0944
Time:                        20:35:09   Log-likelihood                   -2502.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      0.4979
Entities:                          60   P-value                           0.6839
Avg Obs:                       10.200   Distribution:                   F(3,549)
Min Obs:                       5.0000                                           
Max Obs:                       25.000   F-statistic (robust):             0.4979