In [11]:
from pathlib import Path

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Read the CSV file.
# The location is built from the current user's home directory instead of a
# hard-coded '/Users/<name>/...' prefix, so the cell also runs on other machines
# that keep the file in ~/Downloads. file_path stays a plain str.
file_path = str(Path.home() / 'Downloads' / 'project-clean-data.csv')
data = pd.read_csv(file_path)

# Source column -> short analysis name. 'Attitude' (from 'Social Media') is the
# target column; the other seven entries are the features.
column_renames = {
    'F_RACECMB': 'Race',
    'F_EDUCCAT2': 'Education',
    'F_AGECAT': 'Age',
    'F_INC_SDT1': 'Income',
    'F_CREGION': 'Region',
    'F_RELIG': 'Religion',
    'F_PARTY_FINAL': 'Party',
    'Social Media': 'Attitude',
}
source_columns = list(column_renames)

# Integer codes for the three target responses that are kept
attitude_mapping = {'Bad idea for society': 0, 'Good idea for society': 1, 'Neither good nor bad': 2}

# Drop rows with a NaN in any relevant column, keep only those columns,
# and give them their short names
data_cleaned = (
    data
    .dropna(subset=source_columns)
    .loc[:, source_columns]
    .rename(columns=column_renames)
)

# Encode the target. Any response that is not a key of attitude_mapping becomes
# NaN here, so the trailing dropna() removes those rows too.
data_cleaned = data_cleaned.assign(
    Attitude=data_cleaned['Attitude'].map(attitude_mapping)
).dropna()

# Replace every categorical feature column with integer codes. One LabelEncoder
# is fitted per column and kept in label_encoders under that column's name.
# NOTE(review): LabelEncoder numbers labels in sorted order, which need not match
# the natural order of ordered categories (e.g. age or income bands) - confirm
# this is acceptable for the analysis.
categorical_columns = ['Race', 'Education', 'Age', 'Income', 'Region', 'Religion', 'Party']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data_cleaned[name] = encoder.fit_transform(data_cleaned[name])

# Split the encoded frame into the feature matrix X and the target vector y.
# Both are taken from the same data_cleaned frame, so they already share one
# index and are row-aligned. Re-indexing X by y.index would be a no-op for a
# unique index and would duplicate rows if labels were ever repeated, so the
# alignment is asserted instead.
X = data_cleaned.drop(columns='Attitude')
y = data_cleaned['Attitude']
assert X.index.equals(y.index), "X and y must be row-aligned"

# Train a Decision Tree Classifier using the entropy split criterion.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split; without a seed, ties between equally good splits can
# be resolved differently on every run, which changes the fitted tree and the
# feature importances read from it.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, y)

# Rank the features by the importance scores of the fitted decision tree,
# largest first. The frame keeps the original positional index of each feature.
importances = clf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Show the complete ranking
print("Feature rankings in descending order of influence:")
print(feature_importance_df)

# Names of the three highest-ranked features, as a plain list
top_3_features = feature_importance_df['Feature'].iloc[:3].tolist()

print("Top 3 features:", top_3_features)


Feature rankings in descending order of influence:
     Feature  Importance
3     Income    0.210101
6      Party    0.165702
5   Religion    0.149855
4     Region    0.145295
1  Education    0.136898
2        Age    0.126526
0       Race    0.065624
Top 3 features: ['Income', 'Party', 'Religion']
