In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, SelectKBest


In [3]:
# Load dataset
file_path = "anes/anes_timeseries_2024_csv_20250219.csv"  # Change the path if necessary
df = pd.read_csv(file_path, low_memory=False)

# Select the target variable: "Is R registered to vote?" (V241012)
target = "V241012"

# Remove rows with missing/invalid responses in the target (-1, -8, -9 indicate missing/refused)
df_filtered = df[~df[target].isin([-1, -8, -9])]

# Convert target to binary (1 = Registered, 0 = Not Registered)
df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})

# Drop non-informative columns (IDs, version, sample type, weights)
drop_cols = ["version", "V240001", "V200001", "V160001_orig", "V240002", "V240003"] + \
            [col for col in df_filtered.columns if "_a" in col or "_c" in col or "_d" in col]  # Drop weight variables

df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)

# Identify categorical and numerical columns
categorical_cols = df_filtered.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df_filtered.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
    label_encoders[col] = le

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df_filtered[numerical_cols] = imputer.fit_transform(df_filtered[numerical_cols])

# Define features (X) and target (y)
X = df_filtered.drop(columns=[target])
y = df_filtered[target]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Ensure features are non-negative for Chi-Square test (Chi2 requires positive values)
X_train_positive = X_train.copy()

# Shift all values to be positive (add absolute min value + 1 if needed)
for col in X_train_positive.columns:
    min_val = X_train_positive[col].min()
    if min_val < 0:
        X_train_positive[col] += abs(min_val) + 1

# Apply Chi-Square test to select the top 20 features
chi2_selector = SelectKBest(score_func=chi2, k=20)
chi2_selector.fit(X_train_positive, y_train)

# Get selected features
selected_chi2_features = X_train_positive.columns[chi2_selector.get_support()]

# Create DataFrame with chi-square scores
chi2_scores = pd.DataFrame({
    "Feature": X_train_positive.columns,
    "Chi2 Score": chi2_selector.scores_
}).sort_values(by="Chi2 Score", ascending=False).head(20)

# Print results
print("\n===== Chi-Square Feature Selection Results =====")
print(f"Total Features Before Selection: {len(X_train.columns)}")
print(f"Total Features After Selection: {len(selected_chi2_features)}\n")

print("Top 20 Selected Features (Based on Chi-Square Score):")
print(chi2_scores.to_string(index=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
A value is trying to be set on a copy o


===== Chi-Square Feature Selection Results =====
Total Features Before Selection: 701
Total Features After Selection: 20

Top 20 Selected Features (Based on Chi-Square Score):
 Feature    Chi2 Score
 V241569 510518.422668
 V241490  50360.716551
V240105c   9656.591199
 V241177   3818.558211
 V241239   2330.666660
 V241242   2120.827967
 V241258   1835.995802
V241076x   1602.090424
 V241397   1493.941061
 V241248   1479.204453
V240101a   1321.300042
 V241252   1317.856959
V240102d   1183.832918
V241077x   1116.133300
V240104a   1091.127613
V240104d   1090.729855
 V241245   1039.893753
 V241255    953.711403
V240104c    800.809383
V240102a    739.652104
