In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# ---------------------------------------------------------------------------
# Cell configuration: input file, special-purpose columns, and regressors.
# ---------------------------------------------------------------------------
DATA_PATH = "CES_RF_SMT_clean.csv"  # path of the input CSV
OUTCOME_VAR = "lc_w13"              # dependent variable (labelled "Log Consumption" in the plots)
WEIGHT_VAR = "wgt"                  # column used as observation weights

# Raw column names of the regressors. The group labels below are inferred from
# the column names only -- confirm against the data dictionary.
FEATURES = [
    # education
    "Highschool_Educated",
    "Tertiary_Educated",
    # household / respondent characteristics
    "Age",
    "age2",
    "Household_Size",
    "Log_Household_Net_Income",
    "Male",
    # expectations and spending plans
    "Growth_Uncertainty_Probability",
    "pr2010D",
    "pr2110D",
    "First_Moment_Expectation_Prior",
    "Second_Moment_Prior",
    # nationality indicators
    "Belgian",
    "Spanish",
    "French",
    "Italian",
    "Dutch",
]


# Map raw column names to the human-readable labels used in tables and figures.
rename_map = {
    # user-specified renames
    "pr2010D": "Planned to Increase Consumption on Non-Durable Goods",
    "pr2110D": "Planned to Increase Consumption on Durable Goods",
    "age2": "Age Squared",
    "First_Moment_Expectation_Prior": "Mean Expectations Prior",
    "Second_Moment_Prior": "Uncertainty of Expectations Prior",
    "Log_Household_Net_Income": "Log Household Net Income",
    "Growth_Uncertainty_Probability": "Growth Uncertainty Probability",
    # treatment rename (kept even though the treatment is not in this cell's FEATURES)
    "Second_Moment_Treatment": "Information Treatment",
}

# Load the data inside this cell. Previously `df` was used without ever being
# read here, so the cell only worked when a stale `df` was left in the kernel
# and failed with NameError under "Restart Kernel -> Run All".
df = pd.read_csv(DATA_PATH)

# Apply the display labels to the frame ...
df = df.rename(columns=rename_map)

# ... and to the FEATURES list so both refer to the same column names.
FEATURES = [rename_map.get(f, f) for f in FEATURES]

# Keep only the columns the model needs and drop any row with a missing value
# in one of them.
cols_needed = [OUTCOME_VAR, WEIGHT_VAR] + FEATURES
df = df[cols_needed].dropna()

X = df[FEATURES].values
# Column vector: StandardScaler (used in the CV loop) expects 2-D input.
y = df[OUTCOME_VAR].values.reshape(-1, 1)
w = df[WEIGHT_VAR].values

# One shuffled 5-fold splitter with a fixed seed, so every candidate penalty
# is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate penalties: 50 log-spaced values from 1e-4 to 10. The upper bound is
# kept moderate because LASSO zeroes out coefficients quickly for big lambdas.
lambdas = np.logspace(-4, 1, 50)
cv_errors = np.empty(len(lambdas))

for i, lam in enumerate(lambdas):
    fold_errs = []
    for train_idx, val_idx in kf.split(X):
        X_train = X[train_idx]
        X_val = X[val_idx]
        y_train = y[train_idx]
        y_val = y[val_idx]
        w_train = w[train_idx]
        w_val = w[val_idx]

        # Scalers are fitted on the training fold only and then applied to the
        # validation fold, so no validation information leaks into the fit.
        x_scaler = StandardScaler().fit(X_train)
        X_train_s = x_scaler.transform(X_train)
        X_val_s = x_scaler.transform(X_val)

        y_scaler = StandardScaler().fit(y_train)
        y_train_s = y_scaler.transform(y_train).ravel()
        y_val_s = y_scaler.transform(y_val).ravel()

        # No intercept is fitted; the outcome and regressors were centred above.
        # NOTE(review): the scalers centre with unweighted means while the fit
        # and the error are weighted -- confirm this combination is intended.
        model = Lasso(alpha=lam, fit_intercept=False, max_iter=10000)
        model.fit(X_train_s, y_train_s, sample_weight=w_train)

        # Weighted mean squared error on the (standardised) validation fold.
        y_pred_s = model.predict(X_val_s)
        sq_err = (y_val_s - y_pred_s) ** 2
        mse_w = np.average(sq_err, weights=w_val)
        fold_errs.append(mse_w)

    # Score of this penalty = plain average of the per-fold weighted MSEs.
    cv_errors[i] = np.mean(fold_errs)

# Penalty with the smallest cross-validated error.
best_idx = cv_errors.argmin()
best_lambda = lambdas[best_idx]
print("Best lambda (weighted LASSO CV):", best_lambda)

# Cross-validation curve: weighted CV error against the penalty on a log
# x-axis, with a dashed vertical line at the selected penalty.
fig, ax = plt.subplots(figsize=(8, 5))
ax.semilogx(lambdas, cv_errors, marker="o")
ax.axvline(best_lambda, linestyle="--")
ax.set_xlabel(r"$\lambda$ (LASSO penalty)")
ax.set_ylabel("Weighted CV MSE")
ax.set_title("Weighted LASSO cross-validation curve for Log Consumption")
fig.tight_layout()

OUT_PDF = "P2_LASSO_Cross_Validation_Curve.pdf"
# Save BEFORE showing: after plt.show() the current figure may already have
# been released, in which case plt.savefig() would write an empty page.
fig.savefig(OUT_PDF)
# The message previously said "Ridge coefficient path", which does not match
# the LASSO cross-validation curve written here.
print(f"LASSO cross-validation curve PDF saved to: {OUT_PDF}")

# Then display and release the figure.
plt.show()
plt.close(fig)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree

# ---------------------------------------------------------------------------
# Cell configuration: input file, special-purpose columns, and regressors.
# ---------------------------------------------------------------------------
DATA_PATH = "CES_RF_SMT_clean.csv"  # CSV read below with pd.read_csv
OUTCOME_VAR = "lc_w13"              # dependent variable (labelled "Log Consumption" in the plots)
WEIGHT_VAR = "wgt"                  # column used as observation weights

# Raw column names of the regressors. The group labels below are inferred from
# the column names only -- confirm against the data dictionary.
FEATURES = [
    # education
    "Highschool_Educated",
    "Tertiary_Educated",
    # household / respondent characteristics
    "Age",
    "age2",
    "Household_Size",
    "Log_Household_Net_Income",
    "Male",
    # expectations and spending plans
    "Growth_Uncertainty_Probability",
    "pr2010D",
    "pr2110D",
    "First_Moment_Expectation_Prior",
    "Second_Moment_Prior",
    # nationality indicators
    "Belgian",
    "Spanish",
    "French",
    "Italian",
    "Dutch",
    # treatment column (relabelled "Information Treatment" further down)
    "Second_Moment_Treatment",
]

# Raw column name -> human-readable label used in the tree figure.
rename_map = {
    "pr2010D": "Planned to Increase Consumption on Non-Durable Goods",
    "pr2110D": "Planned to Increase Consumption on Durable Goods",
    "age2": "Age Squared",
    "First_Moment_Expectation_Prior": "Mean Expectations Prior",
    "Second_Moment_Prior": "Uncertainty of Expectations Prior",
    "Log_Household_Net_Income": "Log Household Net Income",
    "Growth_Uncertainty_Probability": "Growth Uncertainty Probability",
    # the treatment column is part of FEATURES in this cell
    "Second_Moment_Treatment": "Information Treatment",
}

# Translate the feature list first so it matches the relabelled columns.
FEATURES = [rename_map.get(col, col) for col in FEATURES]
cols_needed = [OUTCOME_VAR, WEIGHT_VAR, *FEATURES]

# Read, relabel, keep only the modelling columns, and drop incomplete rows.
df = (
    pd.read_csv(DATA_PATH)
    .rename(columns=rename_map)[cols_needed]
    .dropna()
)

# Plain NumPy arrays for scikit-learn.
X = df[FEATURES].values
y = df[OUTCOME_VAR].values
w = df[WEIGHT_VAR].values

# Depth-4 regression tree fitted with the survey weights; the seed fixes any
# randomness inside the estimator.
tree = DecisionTreeRegressor(max_depth=4, random_state=42)
tree.fit(X, y, sample_weight=w)

# Draw the fitted tree on an explicit figure/axes pair, labelling the split
# variables with the relabelled feature names.
fig, ax = plt.subplots(figsize=(20, 9))
plot_tree(
    tree,
    feature_names=FEATURES,
    filled=True,
    rounded=True,
    fontsize=8,
    ax=ax,
)
ax.set_title("Regression tree for Log Consumption (max depth = 4)")
fig.tight_layout()

# Write the PDF before displaying; bbox_inches="tight" trims the margins.
OUT_PDF = "P2_Regression_Tree.pdf"
fig.savefig(OUT_PDF, dpi=300, bbox_inches="tight")
print(f"Regression tree PDF saved to: {OUT_PDF}")

# Display, then release the figure.
plt.show()
plt.close(fig)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# ---------------------------------------------------------------------------
# Cell configuration: input file, special-purpose columns, and regressors.
# ---------------------------------------------------------------------------
DATA_PATH = "CES_RF_SMT_clean.csv"  # CSV read below with pd.read_csv
OUTCOME_VAR = "lc_w13"              # dependent variable (labelled "Log Consumption" in the plots)
WEIGHT_VAR = "wgt"                  # column used as observation weights

# Raw column names of the regressors. The group labels below are inferred from
# the column names only -- confirm against the data dictionary.
FEATURES = [
    # education
    "Highschool_Educated",
    "Tertiary_Educated",
    # household / respondent characteristics
    "Age",
    "age2",
    "Household_Size",
    "Log_Household_Net_Income",
    "Male",
    # expectations and spending plans
    "Growth_Uncertainty_Probability",
    "pr2010D",
    "pr2110D",
    "First_Moment_Expectation_Prior",
    "Second_Moment_Prior",
    # nationality indicators
    "Belgian",
    "Spanish",
    "French",
    "Italian",
    "Dutch",
    # treatment column (relabelled "Information Treatment" further down)
    "Second_Moment_Treatment",
]

# Raw column name -> human-readable label used in the importance table/figure.
rename_map = {
    "pr2010D": "Planned to Increase Consumption on Non-Durable Goods",
    "pr2110D": "Planned to Increase Consumption on Durable Goods",
    "age2": "Age Squared",
    "First_Moment_Expectation_Prior": "Mean Expectations Prior",
    "Second_Moment_Prior": "Uncertainty of Expectations Prior",
    "Log_Household_Net_Income": "Log Household Net Income",
    "Growth_Uncertainty_Probability": "Growth Uncertainty Probability",
    "Second_Moment_Treatment": "Information Treatment",
}

# Translate the feature list first so it matches the relabelled columns.
FEATURES = [rename_map.get(col, col) for col in FEATURES]
cols_needed = [OUTCOME_VAR, WEIGHT_VAR, *FEATURES]

# Read, relabel, keep only the modelling columns, and drop incomplete rows.
df = (
    pd.read_csv(DATA_PATH)
    .rename(columns=rename_map)[cols_needed]
    .dropna()
)

# Plain NumPy arrays for scikit-learn.
X = df[FEATURES].values
y = df[OUTCOME_VAR].values
w = df[WEIGHT_VAR].values

# 500 trees with no depth cap, fitted with the survey weights. The fixed seed
# makes the forest reproducible; n_jobs=-1 uses all available cores.
rf = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=42, n_jobs=-1)
rf.fit(X, y, sample_weight=w)

# Pair each (relabelled) feature with the importance reported by the fitted
# forest, ordered from most to least important.
importances = rf.feature_importances_
imp_df = pd.DataFrame({"variable": FEATURES, "importance": importances})
imp_df = imp_df.sort_values("importance", ascending=False)

print(imp_df)

# Sort ascending for barh: the first row is drawn at the bottom of the axis,
# so the most important variable ends up at the top.
imp_df_plot = imp_df.sort_values("importance", ascending=True)

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(imp_df_plot["variable"], imp_df_plot["importance"])
ax.set_xlabel("Random forest importance")
ax.set_title("Random Forest Variable Importance for Log Consumption")
# Deliberately no invert_yaxis(): combined with the ascending sort above it
# flipped the chart and put the most important variable at the bottom, the
# opposite of what the sort is meant to achieve.
fig.tight_layout()

# Write the PDF before displaying; bbox_inches="tight" trims the margins.
OUT_PDF = "P2_Random_Forest_Importance.pdf"
fig.savefig(OUT_PDF, dpi=300, bbox_inches="tight")
print(f"Random forest importance PDF saved to: {OUT_PDF}")

# Display, then release the figure.
plt.show()
plt.close(fig)
