In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [3]:
data = pd.read_csv('numeric_data.csv')

# Define categorical columns (all columns except the target column)
target_column = 'Harm_vs_benifits'  
categorical_columns = [col for col in data.columns if col != target_column]

In [4]:
# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_columns])  # Specify your categorical columns
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)



In [5]:
# Combine with the rest of the dataset
data_encoded = pd.concat([data.drop(categorical_columns + [target_column], axis=1), encoded_df], axis=1)



In [6]:
# Label encoding for the target variable
label_encoder = LabelEncoder()
data_encoded['Target_Encoded'] = label_encoder.fit_transform(data[target_column])

# Balancing the dataset
class_distribution = data_encoded['Target_Encoded'].value_counts()
max_size = class_distribution.max()

In [7]:
data_encoded

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3,Target_Encoded
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3636,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3637,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3638,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [8]:
balanced_data = pd.DataFrame()
for cls in data_encoded['Target_Encoded'].unique():
    class_subset = resample(data_encoded[data_encoded['Target_Encoded'] == cls],
                            replace=True, 
                            n_samples=max_size, 
                            random_state=42)
    balanced_data = pd.concat([balanced_data, class_subset], axis=0)

# Splitting the dataset
X = balanced_data.drop('Target_Encoded', axis=1)
y = balanced_data['Target_Encoded']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
X

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_175000,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3
1142,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1768,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1532,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1493,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2238,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3472,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3329,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3224,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1547,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
balanced_data

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3,Target_Encoded
1142,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1768,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
1532,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1493,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
2238,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3472,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3329,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3224,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
1547,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2


In [11]:
X_train

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_175000,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3
1721,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2807,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3119,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
432,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3227,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1547,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1261,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3563,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Model training and evaluation
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)



In [13]:
# validation
y_pred_rf = rf_classifier.predict(X_val)
y_pred_gb = gb_classifier.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_pred_rf)
accuracy_gb = accuracy_score(y_val, y_pred_gb)


In [14]:
# performance
print("Random Forest Accuracy:", accuracy_rf)
print("Gradient Boosting Accuracy:", accuracy_gb)
print("\nRandom Forest Classification Report:\n", classification_report(y_val, y_pred_rf))
print("\nGradient Boosting Classification Report:\n", classification_report(y_val, y_pred_gb))

Random Forest Accuracy: 0.9699248120300752
Gradient Boosting Accuracy: 0.8571428571428571

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.94      0.96       565
           1       0.94      0.97      0.95       502
           2       1.00      1.00      1.00       529

    accuracy                           0.97      1596
   macro avg       0.97      0.97      0.97      1596
weighted avg       0.97      0.97      0.97      1596


Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.76      0.80       565
           1       0.77      0.81      0.79       502
           2       0.96      1.00      0.98       529

    accuracy                           0.86      1596
   macro avg       0.86      0.86      0.86      1596
weighted avg       0.86      0.86      0.86      1596

