In [8]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# --- Load -------------------------------------------------------------------
# `thousands=","` lets pandas parse values such as "1,234" as numbers.
df = pd.read_csv("data_soc_imm.csv", thousands=",")

# Line number of each row in the CSV file, used only for error reporting:
# the default index starts at 0 and line 1 is the header, hence the +2.
# NOTE(review): assumes one physical line per record (no blank lines or
# multi-line quoted fields) -- confirm against the data file.
df["_csv_line"] = df.index + 2

# Dependent variable first, followed by the regressors of the full model.
cols = ["soc_cap", "imm", "pop", "gdp_per_capita", "u_rate", "land_area", "pop_den"]

# --- Clean ------------------------------------------------------------------
# Coerce every model column to a numeric dtype; anything unparseable becomes NaN
# so that it is caught by the validation step below.
for col in cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Treat +/-inf as missing so a single NaN check covers every invalid value.
df = df.replace([np.inf, -np.inf], np.nan)

# --- Validate ---------------------------------------------------------------
bad_rows = df[df[cols].isna().any(axis=1)]
if not bad_rows.empty:
    print("Found problematic rows in your CSV (NaN or invalid values):")
    print(bad_rows[["_csv_line"] + cols])
    # Raise instead of calling exit(): inside a Jupyter kernel, exit() shuts the
    # kernel down and throws away all state, whereas an exception stops this cell
    # (and "Run All") while keeping `bad_rows` available for inspection.
    raise ValueError(
        f"{len(bad_rows)} row(s) of data_soc_imm.csv contain NaN or invalid values "
        "in the model columns; see the listing printed above."
    )

# --- Full OLS specification -------------------------------------------------
# Regress soc_cap on every other model column plus an intercept.
y = df[cols[0]]
X = df[cols[1:]]
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                soc_cap   R-squared:                       0.180
Model:                            OLS   Adj. R-squared:                  0.144
Method:                 Least Squares   F-statistic:                     5.001
Date:                Wed, 01 Oct 2025   Prob (F-statistic):           0.000116
Time:                        21:11:47   Log-Likelihood:                -214.72
No. Observations:                 144   AIC:                             443.4
Df Residuals:                     137   BIC:                             464.2
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.9363      0.611      1.

In [9]:
# Simplified regression: soc_cap on imm, pop and u_rate only (plus an intercept).
# Reuses the cleaned `df` built by the preceding cell.
y = df.loc[:, "soc_cap"]
X = sm.add_constant(df.loc[:, ["imm", "pop", "u_rate"]])

# Fit by ordinary least squares and print the standard summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                soc_cap   R-squared:                       0.135
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     7.298
Date:                Wed, 01 Oct 2025   Prob (F-statistic):           0.000139
Time:                        21:11:51   Log-Likelihood:                -218.52
No. Observations:                 144   AIC:                             445.0
Df Residuals:                     140   BIC:                             456.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3268      0.287      1.139      0.2