In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Layer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Multiply
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
file_path = "anes/anes_timeseries_2024_csv_20250219.csv"  # Change if necessary
df = pd.read_csv(file_path, low_memory=False)

# Select the target variable: "Is R registered to vote?" (V241012)
target = "V241012"

# Remove rows with missing/invalid responses in the target (-1, -8, -9 indicate missing/refused)
df_filtered = df[~df[target].isin([-1, -8, -9])]

# Convert target to binary (1 = Registered, 0 = Not Registered)
df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})

# Drop non-informative columns (IDs, version, sample type, weights)
drop_cols = ["version", "V240001", "V200001", "V160001_orig", "V240002", "V240003"] + \
            [col for col in df_filtered.columns if "_a" in col or "_c" in col or "_d" in col]  # Drop weight variables

df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)

# Identify categorical and numerical columns
categorical_cols = df_filtered.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df_filtered.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
    label_encoders[col] = le

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df_filtered[numerical_cols] = imputer.fit_transform(df_filtered[numerical_cols])

# Define features (X) and target (y)
X = df_filtered.drop(columns=[target])
y = df_filtered[target]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define an Attention-Based Model for Feature Selection
num_features = X_train.shape[1]
inputs = Input(shape=(num_features,))

# Learn feature importance weights directly
attention_weights = Dense(num_features, activation="softmax", name="attention_weights")(inputs)
attention_output = Multiply()([inputs, attention_weights])  # Multiply features by their learned importance

# Feed attention-weighted features into a prediction model
x = Dense(64, activation="relu")(attention_output)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)
outputs = Dense(1, activation="sigmoid")(x)  # Binary classification

# Compile the Model
model = Model(inputs, outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])

# Train the Model
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Extract attention weights
attention_model = Model(inputs=model.input, outputs=model.get_layer("attention_weights").output)
attention_scores = attention_model.predict(X_train_scaled)

# Compute average attention scores for each feature
avg_attention_scores = np.mean(attention_scores, axis=0)

# Ensure the scores match the number of features
if len(avg_attention_scores) != len(X_train.columns):
    raise ValueError(f"Mismatch: Attention scores ({len(avg_attention_scores)}) vs Features ({len(X_train.columns)})")

# Create a DataFrame with feature importance based on attention scores
attention_feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Attention Score": avg_attention_scores
}).sort_values(by="Attention Score", ascending=False).head(20)

# Print the selected features based on attention scores
print("\n===== Attention-Based Feature Selection Results =====")
print(f"Total Original Features: {X_train.shape[1]}")
print(f"Top 20 Features with Highest Attention Scores:\n")
print(attention_feature_importance.to_string(index=False))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[target] = df_filtered[target].map({1: 1, 2: 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=[col for col in drop_cols if col in df_filtered.columns], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
A value is trying to be set on a copy o

Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.8672 - loss: 0.6099 - val_accuracy: 0.9189 - val_loss: 0.2564
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9048 - loss: 0.2394 - val_accuracy: 0.9655 - val_loss: 0.1666
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9668 - loss: 0.0986 - val_accuracy: 0.9696 - val_loss: 0.1712
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9936 - loss: 0.0388 - val_accuracy: 0.9675 - val_loss: 0.2008
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9983 - loss: 0.0143 - val_accuracy: 0.9675 - val_loss: 0.2303
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9973 - loss: 0.0132 - val_accuracy: 0.9675 - val_loss: 0.2473
Epoch 7/10
[1m62/62[0m [32m━━━━