In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

data = pd.read_csv('numeric_data.csv')

# Define categorical columns (all columns except the target column)
target_column = 'How_is_curret_days_parenting'  # Replace with your target column name
categorical_columns = [col for col in data.columns if col != target_column]

# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_columns])  # Specify your categorical columns
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)

data_encoded = pd.concat([data.drop(categorical_columns + [target_column], axis=1), encoded_df], axis=1)

# Label encoding for the target variable
label_encoder = LabelEncoder()
data_encoded['Target_Encoded'] = label_encoder.fit_transform(data[target_column])

# Balancing the dataset
class_distribution = data_encoded['Target_Encoded'].value_counts()
max_size = class_distribution.max()

balanced_data = pd.DataFrame()
for cls in data_encoded['Target_Encoded'].unique():
    class_subset = resample(data_encoded[data_encoded['Target_Encoded'] == cls],
                            replace=True, 
                            n_samples=max_size, 
                            random_state=42)
    balanced_data = pd.concat([balanced_data, class_subset], axis=0)

# Splitting the dataset
X = balanced_data.drop('Target_Encoded', axis=1)
y = balanced_data['Target_Encoded']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training and evaluation
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)

# validation
y_pred_rf = rf_classifier.predict(X_val)
y_pred_gb = gb_classifier.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_pred_rf)
accuracy_gb = accuracy_score(y_val, y_pred_gb)

# performance
print("Random Forest Accuracy:", accuracy_rf)
print("Gradient Boosting Accuracy:", accuracy_gb)
print("\nRandom Forest Classification Report:\n", classification_report(y_val, y_pred_rf))
print("\nGradient Boosting Classification Report:\n", classification_report(y_val, y_pred_gb))



Random Forest Accuracy: 0.9635443037974684
Gradient Boosting Accuracy: 0.8126582278481013

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       497
           1       0.94      0.92      0.93       498
           2       1.00      1.00      1.00       483
           3       1.00      1.00      1.00       497

    accuracy                           0.96      1975
   macro avg       0.96      0.96      0.96      1975
weighted avg       0.96      0.96      0.96      1975


Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.63      0.67       497
           1       0.69      0.71      0.70       498
           2       0.83      0.92      0.87       483
           3       1.00      1.00      1.00       497

    accuracy                           0.81      1975
   macro avg       0.81      0.81      0.81      1975
weighted avg 

In [2]:
# Dictionary to store accuracy without each feature
accuracy_without_feature_gb = {}
accuracy_without_feature_rf = {}

In [5]:
# Iterate over each feature
it_numbers = 0
for feature in X_train.columns:
    X_train_dropped = X_train.drop(feature, axis=1)
    X_test_dropped = X_val.drop(feature, axis=1)

    # Retrain the model
    rf_classifier = RandomForestClassifier(random_state=42)
    gb_classifier = GradientBoostingClassifier(random_state=42)
    rf_classifier.fit(X_train_dropped, y_train)
    gb_classifier.fit(X_train_dropped, y_train)
    y_pred_dropped_gb = gb_classifier.predict(X_test_dropped)
    y_pred_dropped_rf = rf_classifier.predict(X_test_dropped)

    # Store the accuracy
    accuracy_without_feature_gb[feature] = accuracy_score(y_val, y_pred_dropped_gb)
    accuracy_without_feature_gb[feature] = accuracy_score(y_val, y_pred_dropped_rf)
    
    if it_numbers % 20 == 0:
        print("iterate {} times.".format(it_numbers))
    
    it_numbers += 1


iterate 0 times.


In [None]:
print("\nAccuracy without each feature:")
for feature, accuracy in accuracy_without_feature_gb.items():
    print(f"{feature}: {accuracy}")
for feature, accuracy in accuracy_without_feature_rf.items():
    print(f"{feature}: {accuracy}")