
# Quant — Option C (Remix)
Minimal notebook that loads **World Data 2.0** and runs one method: **summary+correlation**, **regression**, **clustering**, or **pca**.


In [None]:

# ------------------------------------------------------------------
# SETTINGS — the only cell a reader is expected to edit
# ------------------------------------------------------------------

# Where the CSV lives (a regular GitHub page link).
DATA_URL = "https://github.com/zachtilton/intermediate_analytics_ai/blob/main/World%20Data%202.0%20-%20Data.csv"

# Column holding the country names, and the numeric measures to analyse.
COUNTRY_COL: str = "Country"
MEASURES = [
    "GDP per Capita (2018)",
    "Life expectancy at birth (years)(2018)",
    "Life satisfaction in Cantril Ladder (2018)",
]

# Analysis to run — exactly one of:
#   "summary+correlation" | "regression" | "clustering" | "pca"
METHOD: str = "summary+correlation"

# --- Method-specific knobs (each is read only by its own method) ---
REGRESSION_TARGET: str = "Life satisfaction in Cantril Ladder (2018)"  # regression
K: int = 4                # clustering: number of clusters (3..6 is a sensible range)
N_COMPONENTS: int = 2     # pca: two components keep the result easy to plot

# Restrict the analysis to these countries (values must match COUNTRY_COL);
# None keeps every country. Example: ["Sweden", "Japan"]
COUNTRY_FILTER = None

print("Config set. Edit METHOD/columns above as needed.")


In [None]:

import pandas as pd
import numpy as np

def to_raw_github_url(url: str) -> str:
    """Convert a GitHub file-page URL into its raw-content URL.

    A link such as ``https://github.com/<owner>/<repo>/blob/<ref>/<path>`` is
    rewritten to ``https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>``.
    URLs that do not mention ``github.com``, or that already point at
    ``raw.githubusercontent.com``, are returned unchanged.

    Args:
        url: The URL to convert.

    Returns:
        The raw-content URL, or ``url`` itself when no conversion applies.
    """
    if "github.com" in url and "raw.githubusercontent.com" not in url:
        # count=1 on both rewrites: only the host and the first "/blob/" marker
        # belong to GitHub's page-URL layout. A folder inside the repository that
        # happens to be named "blob" is part of the file path and must survive.
        url = url.replace("github.com/", "raw.githubusercontent.com/", 1)
        url = url.replace("/blob/", "/", 1)
    return url

# Console display tuning for printed frames: 120-character lines, up to 50 columns.
for _option_name, _option_value in (("display.width", 120), ("display.max_columns", 50)):
    pd.set_option(_option_name, _option_value)


In [None]:

# Resolve the configured link to its raw-content form and read the CSV into `df`.
RAW_URL = to_raw_github_url(DATA_URL)
df = pd.read_csv(RAW_URL)

# Quick orientation: dimensions, header names, then the first three rows.
column_names = list(df.columns)
print("Loaded shape:", df.shape)
print("Columns:", column_names, sep="\n")

preview_text = df.head(3).to_string(index=False)
print("\nPreview:", preview_text, sep="\n")


In [None]:

# Validate the configuration against the loaded data and build `work`, the
# frame every analysis method reads (COUNTRY_COL + MEASURES, complete rows only).

# Every configured column must exist in the CSV header before anything else runs.
missing = [c for c in [COUNTRY_COL, *MEASURES] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}. Please edit COUNTRY_COL/MEASURES to match the CSV headers above.")

# Optional filter.
# A bare string is treated as a single country name: isin() only accepts
# list-like values and would raise a TypeError on a plain string.
country_filter = [COUNTRY_FILTER] if isinstance(COUNTRY_FILTER, str) else COUNTRY_FILTER

# Filter into a separate name rather than overwriting `df`, so this cell is
# idempotent: re-running it after editing COUNTRY_FILTER starts again from the
# full loaded data instead of from an already-filtered frame.
if country_filter:
    df_selected = df[df[COUNTRY_COL].astype(str).isin(country_filter)]
else:
    df_selected = df

# Keep only needed cols for analysis (plus COUNTRY_COL); rows with any missing
# value in those columns are dropped.
use_cols = [COUNTRY_COL] + MEASURES
work = df_selected[use_cols].dropna().copy()

print("Working shape after dropna on selected measures:", work.shape)
if work.empty:
    # Surface the problem here; the analysis cell would otherwise fail later
    # with a less direct message (or print an all-NaN summary).
    print("Warning: no rows remain after filtering/dropna. Check COUNTRY_FILTER and MEASURES.")


In [None]:

# Run the selected method.
# Reads `work`, MEASURES, COUNTRY_COL, METHOD and the method-specific settings
# (REGRESSION_TARGET, K, N_COMPONENTS) defined in earlier cells. Produces
# `result_scatter` for every method and `pca_loadings` for PCA.


def _scatter_frame(frame, x_col, y_col, group=None):
    """Build the scatter-ready frame with columns Country, x, y [, group].

    The columns are assembled by name instead of via ``rename``: when only one
    measure is configured ``x_col == y_col``, and a rename mapping would then
    collapse to a single entry and yield two columns called "y" and no "x".
    The index of ``frame`` is preserved.
    """
    scatter = pd.DataFrame({
        "Country": frame[COUNTRY_COL],
        "x": frame[x_col],
        "y": frame[y_col],
    })
    if group is not None:
        scatter["group"] = group
    return scatter


def _standardize(values):
    """Rescale every column of a 2-D array to mean 0 and standard deviation 1.

    Constant columns (standard deviation 0) are only centered, which avoids a
    division by zero.
    """
    values = np.asarray(values, dtype=float)
    spread = values.std(axis=0)
    spread[spread == 0] = 1.0
    return (values - values.mean(axis=0)) / spread


result_scatter = None    # Will hold a DataFrame with columns: Country, x, y [, group]
pca_loadings = None      # Only if METHOD == "pca"

method = METHOD.strip().lower()

if method == "summary+correlation":
    # 1) Summary stats
    summary = work[MEASURES].describe().T
    print("\n[SUMMARY]")
    print(summary.to_string())

    # 2) Correlation matrix
    corr = work[MEASURES].corr()
    print("\n[CORRELATION]")
    print(corr.round(3).to_string())

    # 3) result_scatter (first two measures as x,y; a single measure is used for both)
    x_col, y_col = MEASURES[0], (MEASURES[1] if len(MEASURES) > 1 else MEASURES[0])
    result_scatter = _scatter_frame(work, x_col, y_col)
    print("\n[result_scatter preview]")
    print(result_scatter.head(10).to_string(index=False))

elif method == "regression":
    # `work` only carries COUNTRY_COL and MEASURES, so the target has to be one
    # of the measures; say so explicitly instead of only reporting "not found".
    if REGRESSION_TARGET not in work.columns:
        raise ValueError(
            f"REGRESSION_TARGET '{REGRESSION_TARGET}' not found in data. "
            "It must also be listed in MEASURES, because only COUNTRY_COL and MEASURES are kept for analysis."
        )

    # Features = MEASURES excluding target
    features = [m for m in MEASURES if m != REGRESSION_TARGET]
    if len(features) == 0:
        raise ValueError("Need at least one feature (MEASURES excluding REGRESSION_TARGET).")

    try:
        from sklearn.linear_model import LinearRegression
    except ImportError as e:
        raise RuntimeError("scikit-learn is required for regression. Try installing it first (e.g., pip install scikit-learn).") from e

    X = work[features].values
    y = work[REGRESSION_TARGET].values
    model = LinearRegression().fit(X, y)
    coefs = dict(zip(features, model.coef_.tolist()))
    intercept = float(model.intercept_)
    r2 = float(model.score(X, y))  # R^2 on the same rows the model was fitted on

    print("\n[REGRESSION RESULTS]")
    print("Intercept:", intercept)
    print("Coefficients:")
    for name, value in coefs.items():
        print(f"  {name}: {value:.4f}")
    print(f"R^2: {r2:.3f}")
    # The features are not rescaled, so a coefficient's size reflects the
    # feature's unit as much as its influence; the text must not suggest that
    # larger raw coefficients mean stronger effects.
    print(
        "\n[INTERPRETATION] Each coefficient is the expected change in the target per one-unit increase "
        "in that feature, holding the other features fixed. Features are measured in different units, "
        "so raw coefficient sizes are not directly comparable."
    )

    # result_scatter uses first two features as x,y (if only one feature, y becomes target for simple scatter)
    if len(features) >= 2:
        x_col, y_col = features[0], features[1]
    else:
        x_col, y_col = features[0], REGRESSION_TARGET
    result_scatter = _scatter_frame(work, x_col, y_col)
    print("\n[result_scatter preview]")
    print(result_scatter.head(10).to_string(index=False))

elif method == "clustering":
    # Requires at least two measures
    if len(MEASURES) < 2:
        raise ValueError("Clustering needs at least two measures in MEASURES.")
    if K > len(work):
        raise ValueError(f"K={K} clusters requested but only {len(work)} complete rows are available.")
    try:
        from sklearn.cluster import KMeans
    except ImportError as e:
        raise RuntimeError("scikit-learn is required for clustering. Try installing it first (e.g., pip install scikit-learn).") from e

    X = work[MEASURES].values
    # K-means uses Euclidean distance, so on raw values the measure with the
    # largest numeric range would decide the clusters almost alone. Put all
    # measures on a common scale first.
    X_scaled = _standardize(X)
    km = KMeans(n_clusters=K, n_init=10, random_state=42).fit(X_scaled)
    labels = km.labels_
    print("\n[CLUSTERING]")
    print("Measures were standardized (z-scores) before clustering.")
    unique, counts = np.unique(labels, return_counts=True)
    for u, c in zip(unique, counts):
        print(f"cluster {u}: {c} rows")
    print("Interpretation: clusters group countries with similar measure profiles.")

    # Scatter on first two measures (original units), add cluster label
    x_col, y_col = MEASURES[0], MEASURES[1]
    result_scatter = _scatter_frame(work, x_col, y_col, group=labels)
    print("\n[result_scatter preview]")
    print(result_scatter.head(10).to_string(index=False))

elif method == "pca":
    if len(MEASURES) < 2:
        raise ValueError("PCA needs at least two measures in MEASURES.")
    try:
        from sklearn.decomposition import PCA
    except ImportError as e:
        raise RuntimeError("scikit-learn is required for PCA. Try installing it first (e.g., pip install scikit-learn).") from e

    X = work[MEASURES].values
    max_components = min(X.shape)  # PCA cannot extract more than min(rows, measures)
    if isinstance(N_COMPONENTS, int) and N_COMPONENTS > max_components:
        raise ValueError(
            f"N_COMPONENTS={N_COMPONENTS} exceeds the maximum of {max_components} "
            f"for {X.shape[0]} rows and {X.shape[1]} measures."
        )

    # PCA follows variance, so on raw values the measure with the largest
    # numeric range would dominate the first component. Standardize first so
    # every measure contributes on an equal footing.
    X_scaled = _standardize(X)
    pca = PCA(n_components=N_COMPONENTS, random_state=42).fit(X_scaled)
    explained = pca.explained_variance_ratio_
    comps = pca.components_
    # float(...) keeps the printed list free of numpy scalar reprs.
    print("\n[PCA] Explained variance ratio:", [round(float(v), 3) for v in explained])
    print("Measures were standardized (z-scores) before PCA; loadings refer to the standardized measures.")

    # Loadings per component
    pca_loadings = []
    for i, comp in enumerate(comps):
        row = dict(zip(MEASURES, [float(v) for v in comp]))
        pca_loadings.append({"component": i + 1, "loadings": row})
    print("\n[Loadings]")
    for row in pca_loadings:
        print(f"Component {row['component']}:")
        for name, value in row["loadings"].items():
            print(f"  {name}: {value:.3f}")

    # 2D result_scatter using first two PCs (pad with zeros if only 1 component requested)
    X_p = pca.transform(X_scaled)
    x = X_p[:, 0]
    y = X_p[:, 1] if X_p.shape[1] > 1 else np.zeros_like(x)
    result_scatter = pd.DataFrame({
        "Country": work[COUNTRY_COL].values,
        "x": x,
        "y": y
    })
    print("\n[result_scatter preview]")
    print(result_scatter.head(10).to_string(index=False))

else:
    raise ValueError(f"Unknown METHOD: {METHOD}")


In [None]:

# Simple key finding heuristic for scatter-ready results


def describe_relationship(r: float) -> str:
    """Turn a Pearson correlation coefficient into the one-line key finding.

    Args:
        r: Correlation between the scatter's x and y values.

    Returns:
        A sentence labelling the relationship weak (|r| < 0.3), moderate
        (|r| < 0.6) or strong, and positive (r >= 0) or negative. A
        non-finite ``r`` is reported as undefined: every comparison with NaN
        is False, so without this check NaN would fall through to
        "strong negative".
    """
    if not np.isfinite(r):
        return "[KEY FINDING] Correlation between x and y is undefined (constant values or too few rows)."
    desc = "positive" if r >= 0 else "negative"
    strength = "weak" if abs(r) < 0.3 else "moderate" if abs(r) < 0.6 else "strong"
    return f"[KEY FINDING] Scatter x vs y shows a {strength} {desc} relationship (r ≈ {r:.2f})."


try:
    # A constant column makes corrcoef divide by zero and return NaN; silence
    # the floating-point warning because that case is reported explicitly.
    with np.errstate(invalid="ignore", divide="ignore"):
        r = np.corrcoef(result_scatter["x"], result_scatter["y"])[0, 1]
    print(describe_relationship(r))
except Exception as e:
    # Best effort by design: a missing or malformed result_scatter must not
    # stop the notebook, but the reason is still shown.
    print("Key finding calculation skipped:", e)

print("\nDone. You can now explore `result_scatter` (and `pca_loadings` if PCA).")
