In [None]:
#import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

The PsychoCovid dataset contains 1,175 records with 13 features analyzing behavioral and psychological changes during COVID-19. It includes demographic (gender) and behavioral attributes like time spent before/during the pandemic (`time_bp`, `time_dp`), travel time, ease of online transition, home environment, productivity, sleep balance, skill learning, family connection, relaxation, and personal time. The target variable (`prefer`) likely represents work-life preferences.

This dataset is suitable for ML classification with potential applications in predicting work-life preferences, mental well-being analysis, and behavioral shift analysis. It can be used to classify individuals based on their adaptation to remote work, identify stress levels and productivity changes, and assess overall lifestyle adjustments due to the pandemic.

In [None]:
#read the data file
PscyhoCovid = pd.read_csv("Pscyho_Covid.csv")

In [None]:
PscyhoCovid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         1175 non-null   object 
 1   time_bp        1175 non-null   int64  
 2   time_dp        1175 non-null   int64  
 3   travel_time    1175 non-null   float64
 4   easeof_online  1175 non-null   int64  
 5   home_env       1175 non-null   int64  
 6   prod_inc       1175 non-null   float64
 7   sleep_bal      1175 non-null   float64
 8   new_skill      1175 non-null   float64
 9   fam_connect    1175 non-null   float64
 10  relaxed        1175 non-null   float64
 11  self_time      1175 non-null   float64
 12  prefer         1175 non-null   object 
dtypes: float64(7), int64(4), object(2)
memory usage: 119.5+ KB


In [None]:
PscyhoCovid.head(20)

Unnamed: 0,gender,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer
0,Male,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,Complete Physical Attendance
1,Male,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,Complete Physical Attendance
2,Male,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,Complete Physical Attendance
3,Male,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,Complete Physical Attendance
4,Female,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,Complete Physical Attendance
5,Male,5,7,0.5,4,4,-1.0,-1.0,0.0,-0.5,0.0,0.0,Complete Physical Attendance
6,Male,5,4,0.5,1,3,0.5,0.5,1.0,0.5,1.0,1.0,Work/study from home
7,Male,7,9,0.5,3,2,0.0,-0.5,-0.5,-1.0,-0.5,0.0,Complete Physical Attendance
8,Male,5,9,0.5,1,2,1.0,1.0,1.0,1.0,1.0,1.0,Work/study from home
9,Male,7,7,1.5,2,2,0.0,0.0,0.0,-1.0,0.0,0.0,Work/study from home


In [None]:
# Preprocessing
# Encode the target variable
le = LabelEncoder()
PscyhoCovid['prefer'] = le.fit_transform(PscyhoCovid['prefer'])

In [None]:
PscyhoCovid.head(10)

Unnamed: 0,gender,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer
0,Male,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,0
1,Male,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,0
2,Male,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,0
3,Male,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,0
4,Female,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,0
5,Male,5,7,0.5,4,4,-1.0,-1.0,0.0,-0.5,0.0,0.0,0
6,Male,5,4,0.5,1,3,0.5,0.5,1.0,0.5,1.0,1.0,1
7,Male,7,9,0.5,3,2,0.0,-0.5,-0.5,-1.0,-0.5,0.0,0
8,Male,5,9,0.5,1,2,1.0,1.0,1.0,1.0,1.0,1.0,1
9,Male,7,7,1.5,2,2,0.0,0.0,0.0,-1.0,0.0,0.0,1


In [None]:
PscyhoCovid['prefer'].value_counts()

prefer
0    836
1    339
Name: count, dtype: int64

In [None]:
PscyhoCovid['gender'].value_counts()

gender
Male                 649
Female               518
Prefer not to say      8
Name: count, dtype: int64

In [None]:
#create dummies
PscyhoCovid = pd.get_dummies(PscyhoCovid, drop_first = True)
print(PscyhoCovid.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   time_bp                   1175 non-null   int64  
 1   time_dp                   1175 non-null   int64  
 2   travel_time               1175 non-null   float64
 3   easeof_online             1175 non-null   int64  
 4   home_env                  1175 non-null   int64  
 5   prod_inc                  1175 non-null   float64
 6   sleep_bal                 1175 non-null   float64
 7   new_skill                 1175 non-null   float64
 8   fam_connect               1175 non-null   float64
 9   relaxed                   1175 non-null   float64
 10  self_time                 1175 non-null   float64
 11  prefer                    1175 non-null   int64  
 12  gender_Male               1175 non-null   bool   
 13  gender_Prefer not to say  1175 non-null   bool   
dtypes: bool(

In [None]:
PscyhoCovid.head(10)

Unnamed: 0,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer,gender_Male,gender_Prefer not to say
0,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,0,True,False
1,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,0,True,False
2,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,0,True,False
3,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,0,True,False
4,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,0,False,False
5,5,7,0.5,4,4,-1.0,-1.0,0.0,-0.5,0.0,0.0,0,True,False
6,5,4,0.5,1,3,0.5,0.5,1.0,0.5,1.0,1.0,1,True,False
7,7,9,0.5,3,2,0.0,-0.5,-0.5,-1.0,-0.5,0.0,0,True,False
8,5,9,0.5,1,2,1.0,1.0,1.0,1.0,1.0,1.0,1,True,False
9,7,7,1.5,2,2,0.0,0.0,0.0,-1.0,0.0,0.0,1,True,False


In [None]:
# Convert boolean columns to integers (0 and 1)
PscyhoCovid['gender_Male'] = PscyhoCovid['gender_Male'].astype(int)
PscyhoCovid['gender_Prefer not to say'] = PscyhoCovid['gender_Prefer not to say'].astype(int)

In [None]:
PscyhoCovid.head(10)

Unnamed: 0,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer,gender_Male,gender_Prefer not to say
0,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,0,1,0
1,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,0,1,0
2,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,0,1,0
3,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,0,1,0
4,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,0,0,0
5,5,7,0.5,4,4,-1.0,-1.0,0.0,-0.5,0.0,0.0,0,1,0
6,5,4,0.5,1,3,0.5,0.5,1.0,0.5,1.0,1.0,1,1,0
7,7,9,0.5,3,2,0.0,-0.5,-0.5,-1.0,-0.5,0.0,0,1,0
8,5,9,0.5,1,2,1.0,1.0,1.0,1.0,1.0,1.0,1,1,0
9,7,7,1.5,2,2,0.0,0.0,0.0,-1.0,0.0,0.0,1,1,0


In [None]:
x = PscyhoCovid.drop(columns=["prefer"])
y = PscyhoCovid["prefer"]

In [None]:
#Split the data
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
train_x.info()
train_y.info()
valid_x.info()
valid_y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 940 entries, 1092 to 684
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   time_bp                   940 non-null    int64  
 1   time_dp                   940 non-null    int64  
 2   travel_time               940 non-null    float64
 3   easeof_online             940 non-null    int64  
 4   home_env                  940 non-null    int64  
 5   prod_inc                  940 non-null    float64
 6   sleep_bal                 940 non-null    float64
 7   new_skill                 940 non-null    float64
 8   fam_connect               940 non-null    float64
 9   relaxed                   940 non-null    float64
 10  self_time                 940 non-null    float64
 11  gender_Male               940 non-null    int64  
 12  gender_Prefer not to say  940 non-null    int64  
dtypes: float64(7), int64(6)
memory usage: 102.8 KB
<class 'pandas.core.

In [None]:
pip install dmba

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dmba
Successfully installed dmba-0.2.4


1. KNeighborsClassifier

In [None]:
scale = StandardScaler()
train_x2 = pd.DataFrame(scale.fit_transform(train_x), columns = train_x.columns)
valid_x2 = pd.DataFrame(scale.fit_transform(valid_x), columns = valid_x.columns)


print(train_x2.info())
print()
print(train_x2.head(24))

print(valid_x2.info())
print()
print(valid_x2.head(24))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   time_bp                   940 non-null    float64
 1   time_dp                   940 non-null    float64
 2   travel_time               940 non-null    float64
 3   easeof_online             940 non-null    float64
 4   home_env                  940 non-null    float64
 5   prod_inc                  940 non-null    float64
 6   sleep_bal                 940 non-null    float64
 7   new_skill                 940 non-null    float64
 8   fam_connect               940 non-null    float64
 9   relaxed                   940 non-null    float64
 10  self_time                 940 non-null    float64
 11  gender_Male               940 non-null    float64
 12  gender_Prefer not to say  940 non-null    float64
dtypes: float64(13)
memory usage: 95.6 KB
None

     time_bp   time_dp

In [None]:
overs = SMOTE()
unders = RandomUnderSampler(sampling_strategy = "majority")

x_train_OS2, y_train_OS2 = overs.fit_resample(train_x2, train_y)
x_train_US2, y_train_US2 = unders.fit_resample(train_x2, train_y)

print(y_train_OS2.value_counts())
print(y_train_US2.value_counts())

prefer
1    678
0    678
Name: count, dtype: int64
prefer
0    262
1    262
Name: count, dtype: int64


In [None]:
from sklearn.neighbors import KNeighborsClassifier
#list of different neighbors
neighbors = [1, 3, 5, 7, 9, 11, 13]

for k in neighbors:
  # original
  print("ORIGINAL")
  k1 = KNeighborsClassifier(n_neighbors = k)
  k1.fit(train_x2, train_y)
  predk1 = k1.predict(valid_x2)
  kDF1 = pd.DataFrame({"Actual": valid_y, "Predict": predk1})
  print(f"k: {k}")
  print(classification_report(valid_y, predk1))
  print()


  # Oversample
  print("OVERSAMPLE")
  k2 = KNeighborsClassifier(n_neighbors = k)
  k2.fit(x_train_OS2, y_train_OS2)
  predk2 = k2.predict(valid_x2)
  kDF2 = pd.DataFrame({"Actual": valid_y, "Predict": predk2})
  print(classification_report(valid_y, predk2))
  print()

  # undersample
  print("UNDERSAMPLE")
  k3 = KNeighborsClassifier(n_neighbors = k)
  k3.fit(x_train_US2, y_train_US2)
  predk3 = k3.predict(valid_x2)
  kDF3 = pd.DataFrame({"Actual": valid_y, "Predict": predk3})
  print(classification_report(valid_y, predk3))
  print()


ORIGINAL
k: 1
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       158
           1       0.93      0.86      0.89        77

    accuracy                           0.93       235
   macro avg       0.93      0.91      0.92       235
weighted avg       0.93      0.93      0.93       235


OVERSAMPLE
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       158
           1       0.89      0.86      0.87        77

    accuracy                           0.92       235
   macro avg       0.91      0.90      0.91       235
weighted avg       0.92      0.92      0.92       235


UNDERSAMPLE
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       158
           1       0.85      0.86      0.85        77

    accuracy                           0.90       235
   macro avg       0.89      0.89      0.89       235
weighted avg       0.90      0.90  

2. Decision Tree classifier


In [None]:
# Initialize and train the Decision Tree classifier
clf = DecisionTreeClassifier()
clf.fit(train_x, train_y)
predclf = clf.predict(valid_x)
DFCLF1 = pd.DataFrame({"Actual": valid_y,
                    "Predicted": predclf .round(2),
                    "Residual": (valid_y - predclf ).round(2)})


print(classification_report(valid_y, predclf))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       158
           1       0.85      0.82      0.83        77

    accuracy                           0.89       235
   macro avg       0.88      0.87      0.88       235
weighted avg       0.89      0.89      0.89       235



In [None]:
x_train_OS, y_train_OS = overs.fit_resample(train_x, train_y)
x_train_US, y_train_US = unders.fit_resample(train_x, train_y)

clf2 = DecisionTreeClassifier()
clf2.fit(x_train_OS, y_train_OS)
predclf2 = clf2.predict(valid_x)
DFCLF2 = pd.DataFrame({"Actual": valid_y,
                    "Predicted": predclf2 .round(2),
                    "Residual": (valid_y - predclf2).round(2)})


print(classification_report(valid_y, predclf2))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       158
           1       0.85      0.82      0.83        77

    accuracy                           0.89       235
   macro avg       0.88      0.87      0.88       235
weighted avg       0.89      0.89      0.89       235



In [None]:
clf3 = DecisionTreeClassifier()
clf3.fit(x_train_US, y_train_US)
predclf3 = clf3.predict(valid_x)
DFCLF3 = pd.DataFrame({"Actual": valid_y,
                    "Predicted": predclf3 .round(2),
                    "Residual": (valid_y - predclf3).round(2)})


print(classification_report(valid_y, predclf3))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       158
           1       0.80      0.83      0.82        77

    accuracy                           0.88       235
   macro avg       0.86      0.86      0.86       235
weighted avg       0.88      0.88      0.88       235



3. Pruned Decision Tree Classifier

In [None]:
#Let us fine-tune this model.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [None]:
parama = {'max_depth': [5, 10, 15, 20, 25],
          'min_samples_split': [50,100,150,200,250,300],
          'min_impurity_decrease': [0, 0.0005, 0.001, 0.005, 0.01]}

gsr = GridSearchCV(DecisionTreeRegressor(),
                   parama, cv = 5)
gsr.fit(train_x, train_y)

In [None]:
print(f"Score: {gsr.best_score_}")
print(f'Parameters: {gsr.best_params_}')

Score: 0.5864660930085552
Parameters: {'max_depth': 5, 'min_impurity_decrease': 0.0005, 'min_samples_split': 100}


In [None]:
# DecisionTreeClassifier with pruning
clf_pruned = DecisionTreeClassifier(max_depth=5,min_samples_split=100,
                           min_impurity_decrease=0.0005)
clf_pruned.fit(train_x, train_y)
predclf_pruned = clf_pruned.predict(valid_x)
DFCLF1_pruned = pd.DataFrame({"Actual": valid_y,
                              "Predicted": predclf_pruned.round(2),
                              "Residual": (valid_y - predclf_pruned).round(2)})

print(classification_report(valid_y, predclf_pruned))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93       158
           1       0.95      0.71      0.81        77

    accuracy                           0.89       235
   macro avg       0.91      0.85      0.87       235
weighted avg       0.90      0.89      0.89       235



In [None]:
clf_pruned2 = DecisionTreeClassifier(max_depth=5,min_samples_split=100,
                           min_impurity_decrease=0.0005)
clf_pruned2.fit(x_train_OS, y_train_OS)
predclf_pruned2 = clf_pruned2.predict(valid_x)
DFCLF1_pruned2 = pd.DataFrame({"Actual": valid_y,
                              "Predicted": predclf_pruned2.round(2),
                              "Residual": (valid_y - predclf_pruned2).round(2)})

print(classification_report(valid_y, predclf_pruned2))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       158
           1       0.86      0.79      0.82        77

    accuracy                           0.89       235
   macro avg       0.88      0.86      0.87       235
weighted avg       0.89      0.89      0.89       235



In [None]:
clf_pruned3 = DecisionTreeClassifier(max_depth=5,min_samples_split=100,
                           min_impurity_decrease=0.0005)
clf_pruned3.fit(x_train_US, y_train_US)
predclf_pruned3 = clf_pruned3.predict(valid_x)
DFCLF1_pruned3 = pd.DataFrame({"Actual": valid_y,
                              "Predicted": predclf_pruned3.round(2),
                              "Residual": (valid_y - predclf_pruned3).round(2)})

print(classification_report(valid_y, predclf_pruned3))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90       158
           1       0.90      0.61      0.73        77

    accuracy                           0.85       235
   macro avg       0.87      0.79      0.81       235
weighted avg       0.86      0.85      0.84       235



4. Naive Bayes

In [None]:
from scipy.stats import norm
train_x1 = norm.cdf(train_x)
valid_x1= norm.cdf(valid_x)

In [None]:
from sklearn.naive_bayes import GaussianNB
#Original
nb1 = GaussianNB()
nb1.fit(train_x1, train_y)
predNB1 = nb1.predict(valid_x1)
print(classification_report(valid_y,predNB1))

              precision    recall  f1-score   support

           0       0.83      0.82      0.83       158
           1       0.65      0.66      0.65        77

    accuracy                           0.77       235
   macro avg       0.74      0.74      0.74       235
weighted avg       0.77      0.77      0.77       235



In [None]:
#Oversample
x_train_OS1= norm.cdf(x_train_OS)
nb2 = GaussianNB()
nb2.fit(x_train_OS1, y_train_OS)
predNB2 = nb2.predict(valid_x1)
NBDF2 = pd.DataFrame({"Actual": valid_y, "Predict": predNB2})
print(classification_report(valid_y,predNB2))

              precision    recall  f1-score   support

           0       0.89      0.64      0.74       158
           1       0.53      0.83      0.65        77

    accuracy                           0.70       235
   macro avg       0.71      0.74      0.69       235
weighted avg       0.77      0.70      0.71       235



In [None]:
#Undersample
x_train_US1= norm.cdf(x_train_US)
nb3 = GaussianNB()
nb3.fit(x_train_US1, y_train_US)
predNB3 = nb3.predict(valid_x1)
NBDF3 = pd.DataFrame({"Actual": valid_y, "Predict": predNB3})
print(classification_report(valid_y,predNB3))

              precision    recall  f1-score   support

           0       0.89      0.72      0.80       158
           1       0.59      0.82      0.68        77

    accuracy                           0.75       235
   macro avg       0.74      0.77      0.74       235
weighted avg       0.79      0.75      0.76       235



5. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Create and fit the logistic regression model
#Original
LR = LogisticRegression(solver='lbfgs')
LR.fit(train_x, train_y)

# Predictions
predLR = LR.predict(valid_x)

# Classification report
print(classification_report(valid_y, predLR))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91       158
           1       0.90      0.70      0.79        77

    accuracy                           0.88       235
   macro avg       0.88      0.83      0.85       235
weighted avg       0.88      0.88      0.87       235



In [None]:
#Oversample
LR2 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
LR2.fit(x_train_OS, y_train_OS)

# Predictions
predLR2 = LR2.predict(valid_x)

# Classification report
print(classification_report(valid_y, predLR2))


              precision    recall  f1-score   support

           0       0.88      0.84      0.86       158
           1       0.69      0.77      0.73        77

    accuracy                           0.81       235
   macro avg       0.79      0.80      0.79       235
weighted avg       0.82      0.81      0.81       235



In [None]:
#Undersample
LR3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
LR3.fit(x_train_US, y_train_US)

# Predictions
predLR3 = LR3.predict(valid_x)

# Classification report
print(classification_report(valid_y, predLR3))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       158
           1       0.70      0.83      0.76        77

    accuracy                           0.83       235
   macro avg       0.80      0.83      0.81       235
weighted avg       0.84      0.83      0.83       235



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6. SVC

In [None]:
from sklearn.svm import SVC

#original
# Create and fit the SVM model
svm = SVC(kernel='linear')
svm.fit(train_x, train_y)

# Predictions
predsvm = svm.predict(valid_x)

# Classification report
print(classification_report(valid_y, predsvm))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       158
           1       0.96      0.62      0.76        77

    accuracy                           0.87       235
   macro avg       0.90      0.81      0.83       235
weighted avg       0.88      0.87      0.86       235



In [None]:
#oversample
# Create and fit the SVM model
svm2 = SVC(kernel='linear')
svm2.fit(x_train_OS, y_train_OS)

# Predictions
predsvm2 = svm2.predict(valid_x)

# Classification report
print(classification_report(valid_y, predsvm2))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       158
           1       0.78      0.77      0.77        77

    accuracy                           0.85       235
   macro avg       0.83      0.83      0.83       235
weighted avg       0.85      0.85      0.85       235



In [None]:
#undersample
svm3 = SVC(kernel='linear')
svm3.fit(x_train_US, y_train_US)

# Predictions
predsvm3 = svm3.predict(valid_x)

# Classification report
print(classification_report(valid_y, predsvm3))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       158
           1       0.81      0.79      0.80        77

    accuracy                           0.87       235
   macro avg       0.86      0.85      0.85       235
weighted avg       0.87      0.87      0.87       235



7. Bagging

In [None]:
#baggingoriginal
BG = BaggingClassifier(DecisionTreeClassifier(random_state = 1),
                       n_estimators = 100, random_state = 1)
BG.fit(train_x, train_y)

predBG1 = BG.predict(valid_x)
print(classification_report(valid_y, predBG1))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       158
           1       0.95      0.82      0.88        77

    accuracy                           0.93       235
   macro avg       0.94      0.90      0.91       235
weighted avg       0.93      0.93      0.93       235



In [None]:
#baggingoversampled
BG2 = BaggingClassifier(DecisionTreeClassifier(random_state = 1),
                       n_estimators = 100, random_state = 1)
BG2.fit(x_train_OS, y_train_OS)

predBG2 = BG2.predict(valid_x)
print(classification_report(valid_y, predBG2))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       158
           1       0.90      0.83      0.86        77

    accuracy                           0.91       235
   macro avg       0.91      0.89      0.90       235
weighted avg       0.91      0.91      0.91       235



In [None]:
#baggingundersample
BG3 = BaggingClassifier(DecisionTreeClassifier(random_state = 1),
                       n_estimators = 100, random_state = 1)
BG3.fit(x_train_US, y_train_US)

predBG3 = BG3.predict(valid_x)
print(classification_report(valid_y, predBG3))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       158
           1       0.83      0.83      0.83        77

    accuracy                           0.89       235
   macro avg       0.87      0.87      0.87       235
weighted avg       0.89      0.89      0.89       235



8. Boosting

In [None]:
#boostingoriginal
BT = AdaBoostClassifier(DecisionTreeClassifier(random_state=1),
                        n_estimators = 100, random_state=1)
BT.fit(train_x, train_y)

predBT1 = BT.predict(valid_x)
print(classification_report(valid_y,predBT1))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       158
           1       0.90      0.81      0.85        77

    accuracy                           0.91       235
   macro avg       0.90      0.88      0.89       235
weighted avg       0.91      0.91      0.90       235



In [None]:
#Boostingoversample
BT2 = AdaBoostClassifier(DecisionTreeClassifier(random_state=1),
                        n_estimators = 100, random_state=1)
BT2.fit(x_train_OS, y_train_OS)

predBT2 = BT2.predict(valid_x)
print(classification_report(valid_y,predBT2))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       158
           1       0.86      0.83      0.85        77

    accuracy                           0.90       235
   macro avg       0.89      0.88      0.89       235
weighted avg       0.90      0.90      0.90       235



In [None]:
#Boostingundersample
BT3 = AdaBoostClassifier(DecisionTreeClassifier(random_state=1),
                        n_estimators = 100, random_state=1)
BT3.fit(x_train_US, y_train_US)

predBT3 = BT3.predict(valid_x)
print(classification_report(valid_y,predBT3))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       158
           1       0.79      0.83      0.81        77

    accuracy                           0.87       235
   macro avg       0.85      0.86      0.86       235
weighted avg       0.87      0.87      0.87       235



9. Random Forest

In [None]:
#Random Forest original
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators = 500, random_state = 1)
RF.fit(train_x, train_y)

predRF = RF.predict(valid_x)
print(classification_report(valid_y, predRF))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94       158
           1       0.94      0.82      0.88        77

    accuracy                           0.92       235
   macro avg       0.93      0.90      0.91       235
weighted avg       0.92      0.92      0.92       235



In [None]:
#RandomForestOverSampled

RF2 = RandomForestClassifier(n_estimators = 500, random_state = 1)
RF2.fit(x_train_OS, y_train_OS)

predRF2 = RF2.predict(valid_x)
print(classification_report(valid_y, predRF2))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       158
           1       0.93      0.81      0.86        77

    accuracy                           0.91       235
   macro avg       0.92      0.89      0.90       235
weighted avg       0.92      0.91      0.91       235



In [None]:
#RandomForestUnderSampled

RF3 = RandomForestClassifier(n_estimators = 500, random_state = 1)
RF3.fit(x_train_US, y_train_US)

predRF3 = RF3.predict(valid_x)
print(classification_report(valid_y, predRF3))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       158
           1       0.87      0.84      0.86        77

    accuracy                           0.91       235
   macro avg       0.90      0.89      0.89       235
weighted avg       0.91      0.91      0.91       235



10. Neural Networks

In [None]:
#ALL THIS SHOULD BE RESAMPLED USING SMOTE AND RANDOMUNDERSAMPLER
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
train_X = scaler.fit_transform(train_x)
train_X = pd.DataFrame(train_X, columns = x.columns)
train_X.head()

Unnamed: 0,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,gender_Male,gender_Prefer not to say
0,0.625,0.125,0.4,0.25,0.5,0.75,0.75,0.75,0.75,0.75,0.75,1.0,0.0
1,0.625,0.625,0.0,0.25,0.0,0.75,0.25,0.25,0.75,0.5,0.5,0.0,0.0
2,0.125,0.125,0.0,0.0,0.5,0.5,0.5,0.75,0.75,0.25,0.75,1.0,0.0
3,0.125,0.375,0.4,0.75,0.75,0.25,0.5,0.25,0.75,0.25,0.5,0.0,0.0
4,0.625,0.375,0.4,0.75,0.75,0.0,0.0,0.75,0.75,0.75,0.75,0.0,0.0


In [None]:
valid_X = scaler.fit_transform(valid_x)
valid_X = pd.DataFrame(valid_X, columns = x.columns)
valid_X.head()



Unnamed: 0,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,gender_Male,gender_Prefer not to say
0,0.125,0.875,0.0,0.0,0.0,0.75,0.75,1.0,1.0,1.0,0.5,0.0,0.0
1,0.625,0.875,0.0,0.0,1.0,1.0,0.0,0.75,0.75,0.5,0.5,1.0,0.0
2,0.125,0.375,0.4,0.75,0.75,0.25,0.5,0.25,0.75,0.25,0.5,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625,0.625,0.8,0.25,0.25,0.5,0.5,0.75,0.75,0.75,0.75,0.0,0.0


In [None]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(
    hidden_layer_sizes=(50, 50,50),  # Increased size of hidden layers
    activation="relu",
    solver="adam",
    learning_rate="adaptive",
    learning_rate_init=0.01,  # Increased initial learning rate
    max_iter=1000,  # Increased maximum number of iterations
    alpha=0.001,  # Increased regularization strength
    batch_size=64,  # Decreased batch size
    warm_start=False
)


In [None]:
model = nn.fit(train_X, train_y)

In [None]:
predictions = model.predict(valid_X)
print(predictions)

[1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1
 1 1 1 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0
 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 1 0 0 0 1 0 0 0 0 0 1 0]


In [None]:
print(classification_report(valid_y, predictions))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       158
           1       0.91      0.75      0.82        77

    accuracy                           0.89       235
   macro avg       0.90      0.86      0.87       235
weighted avg       0.89      0.89      0.89       235



In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
overs = SMOTE()
unders = RandomUnderSampler()
x_train_OS, y_train_OS = overs.fit_resample(train_X, train_y)
x_train_US, y_train_US = unders.fit_resample(train_X, train_y)

In [None]:
x_train_OS = scaler.fit_transform(x_train_OS)
x_train_OS= pd.DataFrame(x_train_OS, columns = x.columns)


In [None]:
x_train_US = scaler.fit_transform(x_train_US)
x_train_US= pd.DataFrame(x_train_US, columns = x.columns)


In [None]:
model = nn.fit(x_train_OS, y_train_OS)

In [None]:
predictions = model.predict(valid_X)
print(predictions)

[1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1
 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0
 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 1 0 0 0 1 0 0 0 0 0 1 0]


In [None]:
print(classification_report(valid_y, predictions))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       158
           1       0.91      0.75      0.82        77

    accuracy                           0.89       235
   macro avg       0.90      0.86      0.87       235
weighted avg       0.89      0.89      0.89       235



In [None]:
model = nn.fit(x_train_US, y_train_US)

In [None]:
predictions = model.predict(valid_X)
print(predictions)

[1 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1
 0 0 0 1 1 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1
 1 0 1 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0
 1 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0
 1 1 0 0 1 1 0 0 0 0 0 1 0]


In [None]:
print(classification_report(valid_y, predictions))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       158
           1       0.79      0.81      0.80        77

    accuracy                           0.87       235
   macro avg       0.85      0.85      0.85       235
weighted avg       0.87      0.87      0.87       235



In [None]:
# Discriminant Analysis Classifier on original dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(train_x, train_y)

lda_pred = lda.predict(valid_x)

lda_pred_df = pd.DataFrame({"Actual": valid_y, "Predicted": lda_pred})
print(lda_pred_df.head(5))
print()
print(classification_report(valid_y, lda_pred))

      Actual  Predicted
361        1          1
772        0          0
1052       0          0
794        0          0
847        0          0

              precision    recall  f1-score   support

           0       0.86      0.97      0.91       158
           1       0.91      0.69      0.79        77

    accuracy                           0.88       235
   macro avg       0.89      0.83      0.85       235
weighted avg       0.88      0.88      0.87       235



In [None]:
# Linear Discriminant Analysis Classifier on oversampled dataset

lda.fit(x_train_OS, y_train_OS)

lda_OS_pred = lda.predict(valid_x2)

lda_OS_pred_df = pd.DataFrame({"Actual": valid_y, "Predicted": lda_OS_pred})
print(lda_OS_pred_df.head(5))
print()
print(classification_report(valid_y, lda_OS_pred))

      Actual  Predicted
361        1          1
772        0          1
1052       0          0
794        0          0
847        0          0

              precision    recall  f1-score   support

           0       0.88      0.71      0.79       158
           1       0.57      0.81      0.67        77

    accuracy                           0.74       235
   macro avg       0.73      0.76      0.73       235
weighted avg       0.78      0.74      0.75       235



In [None]:
# Linear Discriminant Analysis Classifier on undersampled dataset

lda.fit(x_train_US, y_train_US)

lda_US_pred = lda.predict(valid_x)

lda_US_pred_df = pd.DataFrame({"Actual": valid_y, "Predicted": lda_US_pred})
print(lda_US_pred_df.head(5))
print()
print(classification_report(valid_y, lda_US_pred))

      Actual  Predicted
361        1          0
772        0          0
1052       0          0
794        0          0
847        0          0

              precision    recall  f1-score   support

           0       0.67      1.00      0.80       158
           1       0.00      0.00      0.00        77

    accuracy                           0.67       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.45      0.67      0.54       235



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
