In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from imblearn.over_sampling import SMOTE
import joblib

In [2]:
df = pd.read_csv('../data/cleaning_step2.csv')

In [3]:
df.drop(['bullied_outside_school', 'cyberbullied'], axis=1, inplace=True)

* _downsampling_

In [4]:
cat0 = df[df['bullied_in_school'] == 0]
cat1 = df[df['bullied_in_school'] == 1]

c0_len = len(cat0)
c1_len = len(cat1)

c0_len, c1_len

(16502, 6264)

In [5]:
#downsample
cat0_down = cat0.sample(c1_len)

In [6]:
# reassemble
df = pd.concat([cat0_down, cat1], axis=0)

# shuffle
df = df.sample(frac=1) #frac = ratio of the shuffled output to the input size
df = df.reset_index(drop=True)

In [7]:
X = df.drop(['bullied_in_school'], axis=1)
y = df[['bullied_in_school']]

In [8]:
tt_ratio = 0.3
rand_seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=tt_ratio, random_state=rand_seed)
#X_train = pd.DataFrame(X_train)
#X_test = pd.DataFrame(X_test)

In [9]:
model = RandomForestClassifier(n_estimators=2000, min_samples_split=25, random_state=42)
model.fit(X_train, y_train.to_numpy().ravel())

y_pred = model.predict(X_test)
precision_score(y_test, y_pred)

0.6022988505747127

In [10]:
#save model
joblib.dump(model, 'model_inside.joblib', compress=3)

['model_inside.joblib']

* _upsampling_

In [11]:
df = pd.read_csv('../data/cleaning_step2.csv')
df.drop(['bullied_outside_school', 'cyberbullied'], axis=1, inplace=True)

In [12]:
cat0 = df[df['bullied_in_school'] == 0]
cat1 = df[df['bullied_in_school'] == 1]

c0_len = len(cat0)
c1_len = len(cat1)

c0_len, c1_len

(16502, 6264)

In [13]:
X = df.drop(['bullied_in_school'], axis=1)
y = df[['bullied_in_school']]

In [14]:
tt_ratio = 0.3
rand_seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=tt_ratio, random_state=rand_seed)

In [15]:
#upsample
smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
y_train_sm.value_counts()

bullied_in_school
0                    11513
1                    11513
dtype: int64

In [16]:
model = RandomForestClassifier(n_estimators=2000, min_samples_split=15, random_state=42)
model.fit(X_train_sm, y_train_sm.to_numpy().ravel())

y_pred = model.predict(X_test)
precision_score(y_test, y_pred)

0.34339457567804027