In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [36]:
data = pd.read_csv('numeric_data.csv')

# Define categorical columns (all columns except the target column)
target_column = 'How_is_curret_days_parenting'  # Replace with your target column name
categorical_columns = [col for col in data.columns if col != target_column]

In [37]:
# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_columns])  # Specify your categorical columns
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)



In [41]:
# Combine with the rest of the dataset
data_encoded = pd.concat([data.drop(categorical_columns + [target_column], axis=1), encoded_df], axis=1)



In [43]:
# Label encoding for the target variable
label_encoder = LabelEncoder()
data_encoded['Target_Encoded'] = label_encoder.fit_transform(data[target_column])

# Balancing the dataset
class_distribution = data_encoded['Target_Encoded'].value_counts()
max_size = class_distribution.max()

In [44]:
data_encoded

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3,Target_Encoded
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3636,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3637,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3638,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [45]:
balanced_data = pd.DataFrame()
for cls in data_encoded['Target_Encoded'].unique():
    class_subset = resample(data_encoded[data_encoded['Target_Encoded'] == cls],
                            replace=True, 
                            n_samples=max_size, 
                            random_state=42)
    balanced_data = pd.concat([balanced_data, class_subset], axis=0)

# Splitting the dataset
X = balanced_data.drop('Target_Encoded', axis=1)
y = balanced_data['Target_Encoded']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
X

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_175000,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3
1329,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1943,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1711,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1659,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2440,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3582,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3615,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3362,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [47]:
balanced_data

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3,Target_Encoded
1329,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1943,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1711,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1659,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2440,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3
3582,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3
3615,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3
3362,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3


In [48]:
X_train

Unnamed: 0,Device_type_0,Device_type_1,Device_type_2,Language_0,Language_1,Smartphone_user_0,Smartphone_user_1,Smartphone_user_2,Social_media_user_0,Social_media_user_1,...,P_income_175000,P_income_250000,P_employment_type_0,P_employment_type_1,P_employment_type_2,P_employment_type_3,P_neighborhood_0,P_neighborhood_1,P_neighborhood_2,P_neighborhood_3
2113,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
146,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1452,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3334,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3371,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
956,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2700,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
588,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [49]:
# Model training and evaluation
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)



In [50]:
# validation
y_pred_rf = rf_classifier.predict(X_val)
y_pred_gb = gb_classifier.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_pred_rf)
accuracy_gb = accuracy_score(y_val, y_pred_gb)


In [51]:
# performance
print("Random Forest Accuracy:", accuracy_rf)
print("Gradient Boosting Accuracy:", accuracy_gb)
print("\nRandom Forest Classification Report:\n", classification_report(y_val, y_pred_rf))
print("\nGradient Boosting Classification Report:\n", classification_report(y_val, y_pred_gb))

Random Forest Accuracy: 0.9635443037974684
Gradient Boosting Accuracy: 0.8126582278481013

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       497
           1       0.94      0.92      0.93       498
           2       1.00      1.00      1.00       483
           3       1.00      1.00      1.00       497

    accuracy                           0.96      1975
   macro avg       0.96      0.96      0.96      1975
weighted avg       0.96      0.96      0.96      1975


Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.63      0.67       497
           1       0.69      0.71      0.70       498
           2       0.83      0.92      0.87       483
           3       1.00      1.00      1.00       497

    accuracy                           0.81      1975
   macro avg       0.81      0.81      0.81      1975
weighted avg 