In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from joblib import dump

In [26]:
data = pd.read_csv('numeric_data.csv')

# Define categorical columns (all columns except the target column)
target_column = 'How_is_curret_days_parenting'  

categorical_columns = ['Time_spent_together' , 'ACCEPT_own_smartphone', 'ML_learn_social_skills', 'ML_do_well_in_school', 'Feel_pressurised_to_share', 'ADV_socialmedia','ADV_online_blogs','ADV_other_parents','How_often_youtube','CO_online_bully','HCHU_smartphone','MNT_websites','MNT_take_away_mobile','MNT_screentime','Child_first_age_smartphone','RS_to_contact']

In [27]:
# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_columns])  # Specify your categorical columns
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)



In [28]:
# Combine with the rest of the dataset
data_encoded = encoded_df



In [29]:
# Label encoding for the target variable
label_encoder = LabelEncoder()
data_encoded['Target_Encoded'] = label_encoder.fit_transform(data[target_column])

# Balancing the dataset
class_distribution = data_encoded['Target_Encoded'].value_counts()
max_size = class_distribution.max()

In [30]:
data_encoded

Unnamed: 0,Time_spent_together_0,Time_spent_together_1,Time_spent_together_2,Time_spent_together_3,Time_spent_together_4,ACCEPT_own_smartphone_0,ACCEPT_own_smartphone_1,ACCEPT_own_smartphone_2,ACCEPT_own_smartphone_3,ACCEPT_own_smartphone_4,...,Child_first_age_smartphone_1,Child_first_age_smartphone_2,Child_first_age_smartphone_3,Child_first_age_smartphone_4,Child_first_age_smartphone_5,RS_to_contact_0,RS_to_contact_1,RS_to_contact_2,RS_to_contact_3,Target_Encoded
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
3636,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3637,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3638,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0


In [31]:
balanced_data = pd.DataFrame()
for cls in data_encoded['Target_Encoded'].unique():
    class_subset = resample(data_encoded[data_encoded['Target_Encoded'] == cls],
                            replace=True, 
                            n_samples=max_size, 
                            random_state=42)
    balanced_data = pd.concat([balanced_data, class_subset], axis=0)

# Splitting the dataset
X = balanced_data.drop('Target_Encoded', axis=1)
y = balanced_data['Target_Encoded']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
X

Unnamed: 0,Time_spent_together_0,Time_spent_together_1,Time_spent_together_2,Time_spent_together_3,Time_spent_together_4,ACCEPT_own_smartphone_0,ACCEPT_own_smartphone_1,ACCEPT_own_smartphone_2,ACCEPT_own_smartphone_3,ACCEPT_own_smartphone_4,...,Child_first_age_smartphone_0,Child_first_age_smartphone_1,Child_first_age_smartphone_2,Child_first_age_smartphone_3,Child_first_age_smartphone_4,Child_first_age_smartphone_5,RS_to_contact_0,RS_to_contact_1,RS_to_contact_2,RS_to_contact_3
1329,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1943,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1711,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1659,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2440,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3582,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3615,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3362,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
balanced_data

Unnamed: 0,Time_spent_together_0,Time_spent_together_1,Time_spent_together_2,Time_spent_together_3,Time_spent_together_4,ACCEPT_own_smartphone_0,ACCEPT_own_smartphone_1,ACCEPT_own_smartphone_2,ACCEPT_own_smartphone_3,ACCEPT_own_smartphone_4,...,Child_first_age_smartphone_1,Child_first_age_smartphone_2,Child_first_age_smartphone_3,Child_first_age_smartphone_4,Child_first_age_smartphone_5,RS_to_contact_0,RS_to_contact_1,RS_to_contact_2,RS_to_contact_3,Target_Encoded
1329,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1943,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1711,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1659,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2440,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
3582,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
3615,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
3362,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3


In [34]:
X_train

Unnamed: 0,Time_spent_together_0,Time_spent_together_1,Time_spent_together_2,Time_spent_together_3,Time_spent_together_4,ACCEPT_own_smartphone_0,ACCEPT_own_smartphone_1,ACCEPT_own_smartphone_2,ACCEPT_own_smartphone_3,ACCEPT_own_smartphone_4,...,Child_first_age_smartphone_0,Child_first_age_smartphone_1,Child_first_age_smartphone_2,Child_first_age_smartphone_3,Child_first_age_smartphone_4,Child_first_age_smartphone_5,RS_to_contact_0,RS_to_contact_1,RS_to_contact_2,RS_to_contact_3
2113,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
146,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1452,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3334,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3371,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
956,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2700,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
588,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [35]:
# Model training and evaluation
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)


dump(rf_classifier, './model/random_forest_model_15.joblib')
dump(gb_classifier, './model/gradient_boosting_model_15.joblib')

['./model/gradient_boosting_model_15.joblib']

In [36]:
# validation
y_pred_rf = rf_classifier.predict(X_val)
y_pred_gb = gb_classifier.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_pred_rf)
accuracy_gb = accuracy_score(y_val, y_pred_gb)


In [37]:
# performance
print("Random Forest Accuracy:", accuracy_rf)
print("Gradient Boosting Accuracy:", accuracy_gb)
print("\nRandom Forest Classification Report:\n", classification_report(y_val, y_pred_rf))
print("\nGradient Boosting Classification Report:\n", classification_report(y_val, y_pred_gb))

Random Forest Accuracy: 0.9260759493670886
Gradient Boosting Accuracy: 0.6941772151898734

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.81      0.85       497
           1       0.84      0.90      0.87       498
           2       0.98      1.00      0.99       483
           3       1.00      1.00      1.00       497

    accuracy                           0.93      1975
   macro avg       0.93      0.93      0.93      1975
weighted avg       0.93      0.93      0.93      1975


Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.53      0.56       497
           1       0.58      0.54      0.56       498
           2       0.64      0.71      0.67       483
           3       0.93      1.00      0.97       497

    accuracy                           0.69      1975
   macro avg       0.69      0.69      0.69      1975
weighted avg 