In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Load dataset
file_path = "anes/anes_timeseries_2024_csv_20250219.csv"  # Change if necessary
df = pd.read_csv(file_path, low_memory=False)

# Select the target variable: "Is R registered to vote?" (V241012)
target = "V241012"

# Remove rows with missing/invalid responses in the target (-1, -8, -9 indicate missing/refused)
df_filtered = df[~df[target].isin([-1, -8, -9])]

# Convert target to binary (1 = Registered, 0 = Not Registered)
df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})

# Drop non-informative columns (IDs, version, sample type, weights)
drop_cols = ["version", "V240001", "V200001", "V160001_orig", "V240002", "V240003"] + \
            [col for col in df_filtered.columns if "_a" in col or "_c" in col or "_d" in col]  # Drop weight variables

df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)

# Identify categorical and numerical columns
categorical_cols = df_filtered.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df_filtered.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
    label_encoders[col] = le

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df_filtered[numerical_cols] = imputer.fit_transform(df_filtered[numerical_cols])

# Define features (X) and target (y)
X = df_filtered.drop(columns=[target])
y = df_filtered[target]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features before applying PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA while retaining 95% of variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Extract the principal component loadings (i.e., how original features contribute to each PC)
loadings = pd.DataFrame(pca.components_.T, columns=[f"PC{i+1}" for i in range(pca.n_components_)], index=X_train.columns)

# Get top contributing features for PC1 (largest absolute loadings)
top_features_pc1 = loadings["PC1"].abs().sort_values(ascending=False).head(10)

# Print results
print("\n===== PCA Feature Reduction Results =====")
print(f"Total Original Features: {X_train.shape[1]}")
print(f"Total Features After PCA: {pca.n_components_}")
print(f"Explained Variance Retained: {sum(pca.explained_variance_ratio_) * 100:.2f}%\n")

print("Top Features Contributing to PC1:")
print(top_features_pc1.to_string())

#further contributing features can also be printed



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
A value is trying to be set on a copy o


===== PCA Feature Reduction Results =====
Total Original Features: 701
Total Features After PCA: 321
Explained Variance Retained: 95.03%

Top Features Contributing to PC1:
V241236     0.113842
V241237     0.112335
V241140x    0.111181
V241227x    0.110486
V241203     0.109607
V241137x    0.108550
V241143x    0.106567
V241202     0.105708
V241201     0.104793
V241200     0.103722
