---
title: "DS202W - Group Project"
author: "Civic Tensor (Group 2)"
output: html
self-contained: true
jupyter: python3
engine: jupyter
editor:
  render-on-save: true
  preview: true
---

In [28]:
# Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, TimeSeriesSplit, GridSearchCV, HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, root_mean_squared_error, roc_auc_score, roc_curve, f1_score, fbeta_score, \
                            precision_score, recall_score, precision_recall_curve, average_precision_score, make_scorer, r2_score, mean_squared_error, \
                            mean_absolute_error, mean_squared_log_error, median_absolute_error, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.experimental import enable_iterative_imputer
from sklearn.exceptions import FitFailedWarning
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline, Pipeline
import statsmodels.api as sm
from lets_plot import *
LetsPlot.setup_html()
from lets_plot.plot import gggrid
from pytablewriter import MarkdownTableWriter
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import catboost
import missingno as msno
import seaborn as sns
from sklearn.svm import SVR
import sweetviz as sv
from collections import Counter
import random
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
from plotnine.themes.themeable import themeable
from plotnine import ggplot, aes, geom_bar, theme_minimal, options

In [29]:
# Open the data file
df = pd.read_csv('ds202w-group-projects-civil-war.csv')

In [30]:
df


Unnamed: 0.1,Unnamed: 0,cowcode,year,warstds,ager,agexp,anoc,army85,autch98,auto4,...,seceduc,second,semipol3,sip2,sxpnew,sxpsq,tnatwar,trade,warhist,xconst
0,1,700,1945,0,34.461765,8.510845,0,129472.9042,0,3.925812,...,43.770298,0.253000,0.058441,0.46176,0.158275,0.052989,0.443259,72.881375,0,3.995912
1,2,700,1946,0,34.346348,8.478997,0,129413.0225,0,10.000000,...,43.588363,0.253000,0.000000,0.00000,0.158321,0.052663,1.000000,72.900089,0,1.000000
2,3,700,1947,0,77.000000,8.481015,0,130431.0145,0,10.000000,...,43.538181,0.253000,0.000000,0.00000,0.158425,0.052891,2.000000,72.962880,0,1.000000
3,4,700,1948,0,78.000000,8.451628,0,126781.6866,0,10.000000,...,43.490005,0.253000,0.000000,0.00000,0.159006,0.052902,2.000000,73.102449,0,1.000000
4,5,700,1949,0,79.000000,8.500172,0,130979.2470,0,10.000000,...,43.602238,0.253000,0.000000,0.00000,0.158074,0.052706,2.000000,72.850389,0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7135,7136,552,1996,0,31.000000,6.760000,0,46000.0000,0,6.000000,...,47.299999,0.200000,0.058530,0.35749,0.175017,0.030631,0.000000,78.975510,0,3.000000
7136,7137,552,1997,0,32.000000,7.620000,0,46000.0000,0,6.000000,...,47.299999,0.200000,0.058706,0.35749,0.194624,0.037878,0.000000,72.737373,0,3.000000
7137,7138,552,1998,0,33.000000,10.590000,0,46000.0000,0,6.000000,...,47.299999,0.200000,0.058580,0.35749,0.158606,0.052775,0.000000,79.235222,0,3.000000
7138,7139,552,1999,0,34.000000,8.514664,0,46000.0000,-1,6.000000,...,47.299999,0.200000,0.058705,0.35749,0.157868,0.052714,0.000000,72.858179,0,3.000000


In [31]:
# Filter dataset to only include years between 1945 and 2000
df_filtered = df[(df["year"] >= 1945) & (df["year"] <= 2000)]

# Create full index of expected (cowcode, year) combinations in this range
expected_years = range(1945, 2001)
full_index_filtered = pd.MultiIndex.from_product([df_filtered["cowcode"].unique(), expected_years], names=["cowcode", "year"])
full_df_filtered = pd.DataFrame(index=full_index_filtered).reset_index()

# Merge with the actual data
merged_filtered = pd.merge(full_df_filtered, df_filtered, on=["cowcode", "year"], how="left")

# Identify missing combinations (i.e., missing years for specific cowcodes)
missing_filtered = merged_filtered[merged_filtered.isnull().any(axis=1)]

# Count number of missing years between 1945 and 2000 for each cowcode
missing_years_summary = missing_filtered.groupby("cowcode").size().reset_index(name="missing_years_1945_2000")

missing_years_summary


Unnamed: 0,cowcode,missing_years_1945_2000
0,31,28
1,51,17
2,52,17
3,53,21
4,55,29
...,...,...
114,910,30
115,935,36
116,940,33
117,950,25


In [32]:
cow_map = pd.read_csv("COW-country-codes.csv")

# Rename columns for clarity
cow_map = cow_map.rename(columns={"CCode": "cowcode", "StateNme": "country_name"})

# Merge to add country_name column
df = df.merge(cow_map[["cowcode", "country_name"]], on="cowcode", how="left")

# Generate all expected (cowcode, year) combinations
full_index = pd.MultiIndex.from_product([df_filtered["cowcode"].unique(), range(1945, 2001)], names=["cowcode", "year"])
full_df = pd.DataFrame(index=full_index).reset_index()

# Merge to find missing entries
merged_df = pd.merge(full_df, df_filtered, on=["cowcode", "year"], how="left")

# Add country names to merged_df
merged_df = merged_df.merge(cow_map[["cowcode", "country_name"]], on="cowcode", how="left")

# Filter for rows with missing data (i.e., missing years)
missing_years_df = merged_df[merged_df.isnull().any(axis=1)]

# Count missing years per country
missing_summary = missing_years_df.groupby(["cowcode", "country_name"]).size().reset_index(name="missing_years_1945_2000")

In [33]:
missing_summary

Unnamed: 0,cowcode,country_name,missing_years_1945_2000
0,31,Bahamas,28
1,51,Jamaica,17
2,52,Trinidad and Tobago,17
3,53,Barbados,21
4,55,Grenada,29
...,...,...,...
111,910,Papua New Guinea,30
112,935,Vanuatu,36
113,940,Solomon Islands,33
114,950,Fiji,25


In [34]:
missing_summary.to_csv('missing_years.csv')

In [35]:
merged_df

Unnamed: 0.1,cowcode,year,Unnamed: 0,warstds,ager,agexp,anoc,army85,autch98,auto4,...,second,semipol3,sip2,sxpnew,sxpsq,tnatwar,trade,warhist,xconst,country_name
0,700,1945,1.0,0.0,34.461765,8.510845,0.0,129472.9042,0.0,3.925812,...,0.253000,0.058441,0.46176,0.158275,0.052989,0.443259,72.881375,0.0,3.995912,Afghanistan
1,700,1946,2.0,0.0,34.346348,8.478997,0.0,129413.0225,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158321,0.052663,1.000000,72.900089,0.0,1.000000,Afghanistan
2,700,1947,3.0,0.0,77.000000,8.481015,0.0,130431.0145,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158425,0.052891,2.000000,72.962880,0.0,1.000000,Afghanistan
3,700,1948,4.0,0.0,78.000000,8.451628,0.0,126781.6866,0.0,10.000000,...,0.253000,0.000000,0.00000,0.159006,0.052902,2.000000,73.102449,0.0,1.000000,Afghanistan
4,700,1949,5.0,0.0,79.000000,8.500172,0.0,130979.2470,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158074,0.052706,2.000000,72.850389,0.0,1.000000,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11419,552,1996,7136.0,0.0,31.000000,6.760000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058530,0.35749,0.175017,0.030631,0.000000,78.975510,0.0,3.000000,Zimbabwe
11420,552,1997,7137.0,0.0,32.000000,7.620000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058706,0.35749,0.194624,0.037878,0.000000,72.737373,0.0,3.000000,Zimbabwe
11421,552,1998,7138.0,0.0,33.000000,10.590000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058580,0.35749,0.158606,0.052775,0.000000,79.235222,0.0,3.000000,Zimbabwe
11422,552,1999,7139.0,0.0,34.000000,8.514664,0.0,46000.0000,-1.0,6.000000,...,0.200000,0.058705,0.35749,0.157868,0.052714,0.000000,72.858179,0.0,3.000000,Zimbabwe


In [36]:
# Step 1: Filter merged_df for missing entries in 1945–2000
missing_entries = merged_df[
    (merged_df["year"].between(1945, 2000)) & 
    (merged_df.isnull().any(axis=1))
][["cowcode", "country_name", "year"]]

# Ensure years are integers and drop duplicates
missing_entries["year"] = missing_entries["year"].astype(int)
missing_entries = missing_entries.drop_duplicates()

# Step 2: Clean range builder function
def get_missing_ranges(years):
    years = sorted(set(years))
    ranges = []
    start = prev = years[0]

    for year in years[1:]:
        if year == prev + 1:
            prev = year
        else:
            ranges.append(f"{start}–{prev}" if start != prev else f"{start}")
            start = prev = year
    ranges.append(f"{start}–{prev}" if start != prev else f"{start}")
    return ", ".join(ranges)

# Step 3: Apply per group
missing_ranges = (
    missing_entries.groupby(["cowcode", "country_name"])["year"]
    .apply(get_missing_ranges)
    .reset_index(name="missing_year_ranges")
)

# Step 4: Merge with missing_summary
final_missing_summary = missing_summary.merge(
    missing_ranges,
    on=["cowcode", "country_name"],
    how="left"
)


In [37]:
merged_df

Unnamed: 0.1,cowcode,year,Unnamed: 0,warstds,ager,agexp,anoc,army85,autch98,auto4,...,second,semipol3,sip2,sxpnew,sxpsq,tnatwar,trade,warhist,xconst,country_name
0,700,1945,1.0,0.0,34.461765,8.510845,0.0,129472.9042,0.0,3.925812,...,0.253000,0.058441,0.46176,0.158275,0.052989,0.443259,72.881375,0.0,3.995912,Afghanistan
1,700,1946,2.0,0.0,34.346348,8.478997,0.0,129413.0225,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158321,0.052663,1.000000,72.900089,0.0,1.000000,Afghanistan
2,700,1947,3.0,0.0,77.000000,8.481015,0.0,130431.0145,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158425,0.052891,2.000000,72.962880,0.0,1.000000,Afghanistan
3,700,1948,4.0,0.0,78.000000,8.451628,0.0,126781.6866,0.0,10.000000,...,0.253000,0.000000,0.00000,0.159006,0.052902,2.000000,73.102449,0.0,1.000000,Afghanistan
4,700,1949,5.0,0.0,79.000000,8.500172,0.0,130979.2470,0.0,10.000000,...,0.253000,0.000000,0.00000,0.158074,0.052706,2.000000,72.850389,0.0,1.000000,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11419,552,1996,7136.0,0.0,31.000000,6.760000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058530,0.35749,0.175017,0.030631,0.000000,78.975510,0.0,3.000000,Zimbabwe
11420,552,1997,7137.0,0.0,32.000000,7.620000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058706,0.35749,0.194624,0.037878,0.000000,72.737373,0.0,3.000000,Zimbabwe
11421,552,1998,7138.0,0.0,33.000000,10.590000,0.0,46000.0000,0.0,6.000000,...,0.200000,0.058580,0.35749,0.158606,0.052775,0.000000,79.235222,0.0,3.000000,Zimbabwe
11422,552,1999,7139.0,0.0,34.000000,8.514664,0.0,46000.0000,-1.0,6.000000,...,0.200000,0.058705,0.35749,0.157868,0.052714,0.000000,72.858179,0.0,3.000000,Zimbabwe


In [38]:
final_missing_summary

Unnamed: 0,cowcode,country_name,missing_years_1945_2000,missing_year_ranges
0,31,Bahamas,28,1945–1972
1,51,Jamaica,17,1945–1961
2,52,Trinidad and Tobago,17,1945–1961
3,53,Barbados,21,1945–1965
4,55,Grenada,29,1945–1973
...,...,...,...,...
111,910,Papua New Guinea,30,1945–1974
112,935,Vanuatu,36,1945–1980
113,940,Solomon Islands,33,1945–1977
114,950,Fiji,25,1945–1969


In [39]:
final_missing_summary.to_csv('final_missing_years.csv')

## 🧭 Interpreting Missing Year Ranges in Civil War Dataset

### 🔍 Why Are There Missing Years for Some Countries?

Upon analyzing the `missing_year_ranges` column, we find that many countries have consistent blocks of missing data at either the **beginning (1945–1960s)** or **end (1990s–2000)** of the time period. This pattern aligns closely with **key historical transitions** such as:

* **Colonial independence** (gaining sovereignty in mid–late 20th century)
* **State dissolution or unification** (e.g. breakup of Yugoslavia, German reunification)
* **Geopolitical restructuring** (e.g. federal restructuring in Ethiopia, Yemen unification)


### 📜 Examples: Early Missing Years (Pre-Independence)

| Country             | Missing Years | Historical Context                        |
| ------------------- | ------------- | ----------------------------------------- |
| Bahamas             | 1945–1972     | Independence from the UK in 1973          |
| Jamaica             | 1945–1961     | Independence in 1962                      |
| Trinidad and Tobago | 1945–1961     | Independence in 1962                      |
| Belize              | 1945–1980     | Independence in 1981                      |
| Guyana              | 1945–1965     | Independence in 1966                      |
| Suriname            | 1945–1974     | Independence from the Netherlands in 1975 |


### 🧱 Examples: Late Missing Years (Post-State Dissolution)

| Country                 | Missing Years        | Historical Context                            |
| ----------------------- | -------------------- | --------------------------------------------- |
| Czechoslovakia          | 1993–2000            | Split into Czech Republic and Slovakia (1993) |
| German Federal Republic | 1991–2000            | German reunification (1990)                   |
| Yugoslavia              | 1991–2000            | Breakup into multiple states in early 1990s   |
| Ethiopia                | 1995–2000            | New federal constitution (1995)               |
| Yemen (North & South)   | 1945–1966, 1990–2000 | Unification in 1990                           |


### ✅ Conclusion

The observed missing years are **not random** — they reflect **genuine historical processes** such as:

* The country **did not exist in its modern form** yet (pre-independence)
* The country **ceased to exist or transformed politically**, making data inconsistent
* In some cases, **post-conflict reconstruction** may also delay data collection




In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# 1. Clean and filter data
df_clean = merged_df.dropna(subset=["warstds"])                      # Ensure warstds is present
df_clean = df_clean[df_clean["year"].between(1945, 2000)]           # Time range
df_clean = df_clean.dropna()                                        # Drop rows with any NaNs

# 2. Time-aware split
train_df = df_clean[df_clean["year"] <= 1995]
test_df = df_clean[df_clean["year"] > 1995]

# 3. Define feature columns (exclude non-numeric or identifiers)
non_features = ["warstds", "country_name", "cowcode", "year"]
feature_cols = [col for col in train_df.columns if col not in non_features]

# 4. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[feature_cols])
X_test = scaler.transform(test_df[feature_cols])

y_train = train_df["warstds"]
y_test = test_df["warstds"]

# 5. Fit Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. Predict and Evaluate
y_pred = model.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧱 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


🔍 Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       945
         1.0       0.00      0.00      0.00         6

    accuracy                           0.99       951
   macro avg       0.50      0.50      0.50       951
weighted avg       0.99      0.99      0.99       951

🧱 Confusion Matrix:
[[945   0]
 [  6   0]]




In [44]:
# 0. Sort the dataset properly
merged_df = merged_df.sort_values(["country_name", "year"])

# 1. COUNTRY AGE — this will help us identify early years post-independence
merged_df["country_age"] = merged_df.groupby("country_name").cumcount()

# 2. LAG FEATURES (safe only after year 1)
lag_features = ["gdppc", "pop", "resource", "milper"]  # exclude warstds to avoid leakage
for col in lag_features:
    if col in merged_df.columns:
        merged_df[f"{col}_lag1"] = merged_df.groupby("country_name")[col].shift(1)
        merged_df[f"{col}_lag2"] = merged_df.groupby("country_name")[col].shift(2)

# 3. ROLLING AVERAGES (3-year backward-looking, only apply from year 2 onwards)
rolling_features = ["gdppc", "pop"]
for col in rolling_features:
    if col in merged_df.columns:
        merged_df[f"{col}_roll3"] = (
            merged_df.groupby("country_name")[col]
            .transform(lambda x: x.rolling(window=3, min_periods=1).mean())
        )

# 4. INTERACTION TERMS — OK to compute even for early years
if "gdppc" in merged_df.columns and "resource" in merged_df.columns:
    merged_df["gdppc_x_resource"] = merged_df["gdppc"] * merged_df["resource"]

# 5. POST-INDEPENDENCE FLAG
merged_df["post_independence"] = merged_df["country_age"] < 5

# 6. OPTIONAL: warstds_lag1 for derived features ONLY (never use warstds itself)
merged_df["warstds_lag1"] = merged_df.groupby("country_name")["warstds"].shift(1)

# years_since_war only from lag1 to avoid leakage
def years_since_last_war(lags):
    count = []
    last_war = None
    for val in lags:
        if val == 1:
            last_war = 0
        elif last_war is not None:
            last_war += 1
        count.append(last_war)
    return count

merged_df["years_since_war"] = (
    merged_df.groupby("country_name")["warstds_lag1"]
    .transform(years_since_last_war)
)

# 7. FILL NaNs ONLY in engineered features — NOT target
engineered_cols = [col for col in merged_df.columns if any(
    suffix in col for suffix in ["_lag1", "_lag2", "_roll3", "_x_", "country_age", "post_independence", "years_since_war"]
)]
merged_df[engineered_cols] = merged_df[engineered_cols].fillna(-1)  # Use -1 to preserve structure without dropping rows





In [42]:
merged_df

Unnamed: 0.1,cowcode,year,Unnamed: 0,warstds,ager,agexp,anoc,army85,autch98,auto4,...,trade,warhist,xconst,country_name,milper_lag1,milper_lag2,country_age,post_independence,warstds_lag1,years_since_war
0,700,1945,1.0,0.0,34.461765,8.510845,0.0,1.294729e+05,0.0,3.925812,...,72.881375,0.0,3.995912,Afghanistan,,,0.0,True,,
1,700,1946,2.0,0.0,34.346348,8.478997,0.0,1.294130e+05,0.0,10.000000,...,72.900089,0.0,1.000000,Afghanistan,121.087366,,1.0,True,0.0,
2,700,1947,3.0,0.0,77.000000,8.481015,0.0,1.304310e+05,0.0,10.000000,...,72.962880,0.0,1.000000,Afghanistan,121.885359,121.087366,2.0,True,0.0,
3,700,1948,4.0,0.0,78.000000,8.451628,0.0,1.267817e+05,0.0,10.000000,...,73.102449,0.0,1.000000,Afghanistan,122.780608,121.885359,3.0,True,0.0,
4,700,1949,5.0,0.0,79.000000,8.500172,0.0,1.309792e+05,0.0,10.000000,...,72.850389,0.0,1.000000,Afghanistan,118.256427,122.780608,4.0,True,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10302,364,1999,,,,,,,,,...,,,,,,,,False,,
10862,818,1999,6867.0,0.0,23.000000,8.431444,0.0,1.027000e+06,0.0,7.000000,...,71.610084,1.0,3.000000,,,,,False,,
3471,529,2000,2021.0,0.0,130.000000,8.548434,0.0,2.400000e+05,0.0,2.000000,...,72.801208,0.0,3.000000,,,,,False,,
10303,364,2000,,,,,,,,,...,,,,,,,,False,,


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# 1. Clean and filter data
df_clean = merged_df.dropna(subset=["warstds"])                      # Ensure warstds is present
df_clean = df_clean[df_clean["year"].between(1945, 2000)]           # Time range
df_clean = df_clean.dropna()                                        # Drop rows with any NaNs

# 2. Time-aware split
train_df = df_clean[df_clean["year"] <= 1995]
test_df = df_clean[df_clean["year"] > 1995]

# 3. Define feature columns (exclude non-numeric or identifiers)
non_features = ["warstds", "country_name", "cowcode", "year"]
feature_cols = [col for col in train_df.columns if col not in non_features]

# 4. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[feature_cols])
X_test = scaler.transform(test_df[feature_cols])

y_train = train_df["warstds"]
y_test = test_df["warstds"]

# 5. Fit Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. Predict and Evaluate
y_pred = model.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧱 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

🔍 Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       945
         1.0       0.00      0.00      0.00         6

    accuracy                           0.99       951
   macro avg       0.50      0.50      0.50       951
weighted avg       0.99      0.99      0.99       951

🧱 Confusion Matrix:
[[945   0]
 [  6   0]]




In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1. Split by time — as before
train_df = merged_df[merged_df["year"] <= 1995]
test_df = merged_df[merged_df["year"] > 1995]

# 2. Drop rows with missing target
train_df = train_df.dropna(subset=["warstds"])
test_df = test_df.dropna(subset=["warstds"])

# 3. Downsample majority class in training data
df_majority = train_df[train_df.warstds == 0]
df_minority = train_df[train_df.warstds == 1]

# Match number of 0s to 1s (can also try 2x, 3x if you want slight imbalance)
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
train_balanced = pd.concat([df_majority_downsampled, df_minority])

# 4. Select features (exclude IDs, target, year)
non_features = ["warstds", "country_name", "cowcode", "year"]
feature_cols = [col for col in train_balanced.columns if col not in non_features]

# 5. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_balanced[feature_cols])
y_train = train_balanced["warstds"]

X_test = scaler.transform(test_df[feature_cols])
y_test = test_df["warstds"]

# 6. Logistic Regression with L1 (Lasso) regularization
model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)

# 7. Predict and evaluate
y_pred = model.predict(X_test)

print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧱 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 8. Optional: See selected features
selected_features = pd.Series(model.coef_[0], index=feature_cols)
print("\n📌 Selected Features (non-zero coefficients):")
print(selected_features[selected_features != 0].sort_values(ascending=False))



📊 Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.85      0.92       955
         1.0       0.04      1.00      0.08         6

    accuracy                           0.85       961
   macro avg       0.52      0.92      0.50       961
weighted avg       0.99      0.85      0.91       961

🧱 Confusion Matrix:
[[808 147]
 [  0   6]]

📌 Selected Features (non-zero coefficients):
sxpnew               1.185717
ln_gdpen             0.664192
relfrac              0.663264
ncontig              0.627774
inst3                0.581452
dlang                0.521127
illiteracy           0.472204
partfree             0.442699
nat_war              0.386834
Unnamed: 0           0.344382
incumb               0.323090
durable              0.263809
milper               0.254720
centpol3             0.248189
etdo4590             0.225063
nmdp4_alt            0.222532
demch98              0.202159
proxregc             0.142141
inst               