In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [3]:
# Load dataset
file_path = "anes/anes_timeseries_2024_csv_20250219.csv"  # Change if necessary
df = pd.read_csv(file_path, low_memory=False)

# Select the target variable: "Is R registered to vote?" (V241012)
target = "V241012"

# Remove rows with missing/invalid responses in the target (-1, -8, -9 indicate missing/refused)
df_filtered = df[~df[target].isin([-1, -8, -9])]

# Convert target to binary (1 = Registered, 0 = Not Registered)
df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})

# Drop non-informative columns (IDs, version, sample type, weights)
drop_cols = ["version", "V240001", "V200001", "V160001_orig", "V240002", "V240003"] + \
            [col for col in df_filtered.columns if "_a" in col or "_c" in col or "_d" in col]  # Drop weight variables

df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)

# Identify categorical and numerical columns
categorical_cols = df_filtered.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df_filtered.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
    label_encoders[col] = le

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df_filtered[numerical_cols] = imputer.fit_transform(df_filtered[numerical_cols])

# Define features (X) and target (y)
X = df_filtered.drop(columns=[target])
y = df_filtered[target]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 1: Reduce features using Random Forest to select the top 100 important ones
rf_model = RandomForestClassifier(n_estimators=50, random_state=42, class_weight="balanced", n_jobs=-1)
rf_model.fit(X_train, y_train)

# Get feature importance and select the top 100
feature_importances = pd.DataFrame({"Feature": X_train.columns, "Importance": rf_model.feature_importances_})
top_100_features = feature_importances.sort_values(by="Importance", ascending=False).head(100)["Feature"].tolist()

# Reduce X_train and X_test to these selected features
X_train_reduced = X_train[top_100_features]
X_test_reduced = X_test[top_100_features]

# Step 2: Apply RFE using a logistic regression model on the reduced feature set
log_reg = LogisticRegression(max_iter=500, class_weight="balanced", solver="liblinear", random_state=42)
rfe = RFE(estimator=log_reg, n_features_to_select=20)  # Select the top 20 features
rfe.fit(X_train_reduced, y_train)

# Get selected features
selected_features = X_train_reduced.columns[rfe.support_]

# Calculate number of removed variables
num_removed = len(X_train.columns) - len(selected_features)
num_remaining = len(selected_features)

# Print results
print("\n===== RFE Feature Selection Results =====")
print(f"Total Original Features: {len(X_train.columns)}")
print(f"Total Features After Random Forest Selection: {len(X_train_reduced.columns)}")
print(f"Total Features After RFE Selection: {num_remaining}")
print(f"Number of Removed Features: {num_removed}")
print("\nSelected Features (Top 20):")
print(selected_features.tolist())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
A value is trying to be set on a copy o


===== RFE Feature Selection Results =====
Total Original Features: 701
Total Features After Random Forest Selection: 100
Total Features After RFE Selection: 20
Number of Removed Features: 681

Selected Features (Top 20):
['V241030', 'V241103', 'V241106x', 'V241102x', 'V241108', 'V241025', 'V241104', 'V241210', 'V241107', 'V241005', 'V241216', 'V241215', 'V241400x', 'V241044', 'V241326', 'V241048', 'V241046', 'V241033', 'V241297x', 'V241222']
