# Ensemble Learning for Cyberbullying Detection

In [32]:
#Importing libraries and dependencies 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


import shap


from hyperopt import fmin, hp, tpe

import matplotlib.pyplot as plt
import seaborn as sns

#Set display options to None 
pd.options.display.max_columns = None
pd.options.display.max_rows = None

#Load in the dataset 
dataset = pd.read_csv('../input/cyberbullying/all_features.csv', index_col=0)

# #Create X and y partition 
X = dataset.drop('cyberbullying_type',axis=1)
y = dataset['cyberbullying_type']

# Perform train-test split on the dataset 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=40)

train_df = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
train_x = list(train_df.drop(['cyberbullying_type'], axis=1).columns)


def CV_catboost(X_train,y_train,X_test,y_test):

    y_train = np.transpose(np.array(y_train)).ravel()
    y_test = np.transpose(np.array(y_test)).ravel()

    # Training
    train_dataset = Pool(data=X_train,label=y_train)
    #Test
    eval_dataset = Pool(data=X_test,label=y_test)

    # set parameters of model
    model = CatBoostClassifier(
        iterations=10000,
        random_strength=1, #set to one to prevent overfitting
        depth=6, #relatively low to prevent overfitting
        l2_leaf_reg=2, #from grid search
        border_count=20, #from grid search
        learning_rate=0.01,
        rsm=1, #from grid search
        loss_function='MultiClass',
        eval_metric='Accuracy',
        boosting_type = 'Plain',
        silent=True,
        task_type="GPU")

    # fit model and make predictions for cross-validation 
    # model.fit(train_dataset, plot=True, eval_set=eval_dataset)
    model.fit(train_dataset,verbose_eval=100,eval_set=eval_dataset,plot=True)

    #Make prediction on out-of-sample set 
    y_pred = model.predict(X_test)  # change to test_x if for real test

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)

    report = classification_report(y_test, y_pred)
    print(report)

    return [f1, accuracy]

In [21]:

x_cols = train_x

y_col = ['cyberbullying_type']

nb_splits = 5
splits =np.array_split(train_df, nb_splits)

# run the CV_catboost in a loop

f1_all_catboost = [0,0,0,0,0]
accuracy_all_catboost = [0,0,0,0,0]
scores_catboost = [0,0,0,0,0]
for i in range(nb_splits):
    not_test_indices = [x for x in range(nb_splits) if x != i]
    temp_df_X_y = (pd.concat([splits[x] for x in not_test_indices],axis=0))
    X_train = temp_df_X_y[x_cols]
    y_train = temp_df_X_y[y_col]
    X_test = splits[i][x_cols]
    y_test = splits[i][y_col]

    scores_catboost[i] = CV_catboost(X_train,y_train,X_test,y_test)

for i in range(len(scores_catboost)):
    f1_all_catboost[i] = scores_catboost[i][0]
    accuracy_all_catboost[i] = scores_catboost[i][1]

print("*"*100)
print("The cross validated f1 score is " + str(np.mean(f1_all_catboost)))
print("The cross validated accuracy is " + str(np.mean(accuracy_all_catboost)))


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6448248	test: 0.6473425	best: 0.6473425 (0)	total: 11.3ms	remaining: 1m 52s
100:	learn: 0.7220402	test: 0.7243946	best: 0.7254429 (99)	total: 964ms	remaining: 1m 34s
200:	learn: 0.7454984	test: 0.7516511	best: 0.7516511 (200)	total: 1.91s	remaining: 1m 33s
300:	learn: 0.7715776	test: 0.7772303	best: 0.7772303 (299)	total: 3.2s	remaining: 1m 43s
400:	learn: 0.7884308	test: 0.7941084	best: 0.7941084 (400)	total: 4.29s	remaining: 1m 42s
500:	learn: 0.7997798	test: 0.8022854	best: 0.8024950 (496)	total: 5.22s	remaining: 1m 38s
600:	learn: 0.8079574	test: 0.8099381	best: 0.8100430 (598)	total: 6.15s	remaining: 1m 36s
700:	learn: 0.8127015	test: 0.8130831	best: 0.8131880 (684)	total: 7.09s	remaining: 1m 34s
800:	learn: 0.8167379	test: 0.8153895	best: 0.8154943 (791)	total: 8.02s	remaining: 1m 32s
900:	learn: 0.8191492	test: 0.8167523	best: 0.8170668 (898)	total: 9.66s	remaining: 1m 37s
1000:	learn: 0.8218489	test: 0.8187441	best: 0.8192683 (996)	total: 10.6s	remaining: 1m 35s
110

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6435143	test: 0.6527938	best: 0.6527938 (0)	total: 10.9ms	remaining: 1m 48s
100:	learn: 0.7243467	test: 0.7203061	best: 0.7215641 (99)	total: 922ms	remaining: 1m 30s
200:	learn: 0.7490892	test: 0.7438935	best: 0.7438935 (200)	total: 1.88s	remaining: 1m 31s
300:	learn: 0.7725212	test: 0.7673760	best: 0.7673760 (297)	total: 2.81s	remaining: 1m 30s
400:	learn: 0.7908421	test: 0.7837300	best: 0.7837300 (400)	total: 4.43s	remaining: 1m 46s
500:	learn: 0.8009855	test: 0.7929552	best: 0.7933746 (495)	total: 5.96s	remaining: 1m 52s
600:	learn: 0.8085865	test: 0.8001887	best: 0.8001887 (592)	total: 6.85s	remaining: 1m 47s
700:	learn: 0.8118628	test: 0.8064787	best: 0.8064787 (699)	total: 7.78s	remaining: 1m 43s
800:	learn: 0.8154274	test: 0.8122445	best: 0.8122445 (798)	total: 8.71s	remaining: 1m 40s
900:	learn: 0.8186512	test: 0.8151798	best: 0.8151798 (900)	total: 9.66s	remaining: 1m 37s
1000:	learn: 0.8215868	test: 0.8169619	best: 0.8169619 (1000)	total: 10.6s	remaining: 1m 35s
1

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6428946	test: 0.6427972	best: 0.6427972 (0)	total: 11.1ms	remaining: 1m 51s
100:	learn: 0.7251664	test: 0.7239463	best: 0.7241560 (94)	total: 954ms	remaining: 1m 33s
200:	learn: 0.7471038	test: 0.7461732	best: 0.7461732 (200)	total: 2.26s	remaining: 1m 50s
300:	learn: 0.7698013	test: 0.7642063	best: 0.7642063 (300)	total: 3.41s	remaining: 1m 49s
400:	learn: 0.7871783	test: 0.7826588	best: 0.7826588 (400)	total: 4.49s	remaining: 1m 47s
500:	learn: 0.7994706	test: 0.7934577	best: 0.7936674 (498)	total: 5.51s	remaining: 1m 44s
600:	learn: 0.8054988	test: 0.7997484	best: 0.8001678 (598)	total: 6.41s	remaining: 1m 40s
700:	learn: 0.8110552	test: 0.8067729	best: 0.8068778 (699)	total: 7.32s	remaining: 1m 37s
800:	learn: 0.8156943	test: 0.8110715	best: 0.8110715 (765)	total: 8.24s	remaining: 1m 34s
900:	learn: 0.8188656	test: 0.8144265	best: 0.8146362 (889)	total: 9.17s	remaining: 1m 32s
1000:	learn: 0.8211983	test: 0.8152653	best: 0.8161040 (965)	total: 10.1s	remaining: 1m 30s
11

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6478482	test: 0.6356679	best: 0.6356679 (0)	total: 11.2ms	remaining: 1m 51s
100:	learn: 0.7266866	test: 0.7198574	best: 0.7198574 (100)	total: 1.19s	remaining: 1m 56s
200:	learn: 0.7470776	test: 0.7403020	best: 0.7406165 (194)	total: 2.65s	remaining: 2m 9s
300:	learn: 0.7706925	test: 0.7651499	best: 0.7651499 (300)	total: 3.55s	remaining: 1m 54s
400:	learn: 0.7888557	test: 0.7880059	best: 0.7882156 (396)	total: 4.46s	remaining: 1m 46s
500:	learn: 0.8004927	test: 0.8009017	best: 0.8009017 (499)	total: 5.37s	remaining: 1m 41s
600:	learn: 0.8061802	test: 0.8056196	best: 0.8056196 (578)	total: 6.27s	remaining: 1m 38s
700:	learn: 0.8109766	test: 0.8094988	best: 0.8097085 (699)	total: 7.31s	remaining: 1m 37s
800:	learn: 0.8157467	test: 0.8128538	best: 0.8131684 (797)	total: 8.47s	remaining: 1m 37s
900:	learn: 0.8188394	test: 0.8152653	best: 0.8155798 (895)	total: 9.38s	remaining: 1m 34s
1000:	learn: 0.8216963	test: 0.8162088	best: 0.8165234 (978)	total: 10.3s	remaining: 1m 32s
11

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6454893	test: 0.6451038	best: 0.6451038 (0)	total: 47.8ms	remaining: 7m 57s
100:	learn: 0.7234890	test: 0.7277207	best: 0.7277207 (100)	total: 1.2s	remaining: 1m 57s
200:	learn: 0.7476542	test: 0.7500524	best: 0.7500524 (200)	total: 2.16s	remaining: 1m 45s
300:	learn: 0.7702993	test: 0.7718599	best: 0.7718599 (300)	total: 3.11s	remaining: 1m 40s
400:	learn: 0.7889867	test: 0.7873768	best: 0.7875865 (399)	total: 4.06s	remaining: 1m 37s
500:	learn: 0.8004403	test: 0.7985951	best: 0.7985951 (500)	total: 5s	remaining: 1m 34s
600:	learn: 0.8077266	test: 0.8033131	best: 0.8040470 (582)	total: 5.95s	remaining: 1m 33s
700:	learn: 0.8133616	test: 0.8065632	best: 0.8065632 (698)	total: 6.89s	remaining: 1m 31s
800:	learn: 0.8164282	test: 0.8096037	best: 0.8096037 (799)	total: 7.84s	remaining: 1m 30s
900:	learn: 0.8188394	test: 0.8114909	best: 0.8118054 (888)	total: 8.81s	remaining: 1m 28s
1000:	learn: 0.8215652	test: 0.8132732	best: 0.8134829 (997)	total: 9.96s	remaining: 1m 29s
1100:

# CatBoost Prediction 

In [23]:
df = dataset.sample(frac=1,random_state=1).reset_index(drop=True)

section = round(len(df)*8/10)
train_df = df.iloc[:section]
test_df = df.iloc[section:]

train_cols_x = list(train_df.drop(['cyberbullying_type'], axis=1).columns)
test_cols_x = list(test_df.drop(['cyberbullying_type'], axis=1).columns)


train_x = train_df[train_cols_x]
test_x = test_df[test_cols_x]

train_y = train_df["cyberbullying_type"]
test_y = test_df["cyberbullying_type"]
test_y = test_y.reset_index(drop=True)
test_x = test_x.reset_index(drop=True)


train_y = np.transpose(np.array(train_y)).ravel()
test_y = np.transpose(np.array(test_y)).ravel()

#define dataset used to train the model
train_dataset = Pool(data=train_x,
                     label=train_y,
                     )

#define dataset used to test the model
eval_dataset = Pool(data=test_x)

#set model parameters
model = CatBoostClassifier(
        iterations=10000,
        random_strength=1, #set to one to prevent overfitting
        depth=6, #relatively low to prevent overfitting
        l2_leaf_reg=2, #from grid search
        border_count=20, #from grid search
        learning_rate=0.01,
        rsm=1, #from grid search
        loss_function='MultiClass',
        eval_metric='Accuracy',
        boosting_type = 'Plain',
        silent=True,
        task_type="GPU")


model.fit(train_dataset, plot=True)

importances = model.feature_importances_


df_feat_imp = pd.DataFrame()
df_feat_imp['Features'] = train_cols_x
df_feat_imp['Importance'] = importances.tolist()
df_feat_imp = df_feat_imp.sort_values(by='Importance',ascending=False)
print(df_feat_imp.head(10))
# df_feat_imp.to_csv("feature_importance_combo.csv")

y_pred = model.predict(test_x)



f1 = f1_score(test_y, y_pred, average='macro')
accuracy = accuracy_score(test_y, y_pred)
print("f1 score is " + str(f1))
print("accuracy is " + str(accuracy))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

                 Features  Importance
0    characters per tweet    7.031804
143                school    4.844225
64                      @    4.834997
118                 bully    4.758607
4          nb_capitalized    4.309246
83                 nigger    3.316220
109                 idiot    3.139793
34                      i    2.575371
125                muslim    2.417679
91                muslims    2.241238
f1 score is 0.8267433517741227
accuracy is 0.8250157265674145


# Random Forests 

In [42]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import skew

## reading in the training dataset - there are various training sets to consider
## based on how



## extract the column names from the dataframe.
train_x_columns = list(X.columns)


#Creating a Categorical Encoder pipeline 
c_encoder = LabelEncoder()
y = c_encoder.fit_transform(y) #Y is categorical, so it needs to be converted

## splitting the training data into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=40)

rf = RandomForestClassifier(criterion = "gini", max_depth=25, min_samples_leaf = 1, n_estimators=300, 
                            random_state=0, class_weight = 'balanced_subsample', bootstrap=True, n_jobs=-1)

## fitting the model to the train data split
fit_model = True
if fit_model:
    print('Training model')
    rf.fit(X_train, y_train)

# ## using the model to predict the y test values using the test data
print('Predicting on test set now')
y_pred = rf.predict(X_test)

## Test scores
print(classification_report(y_test, y_pred))
print('F1 score:',round(f1_score(y_test,y_pred, average = 'macro'),3))

runCV = True
## run cross validation on fitted model
scoring = {'acc': 'accuracy',
           'f1_macro': 'f1_macro'}
if runCV:
    scores = cross_validate(rf, X_train, y_train, cv=5, scoring=scoring)
    #print(scores)
    #print(np.average(scores))
    print("-----------------------------------")
    print("CV Accuracy:", round(np.average(scores['test_acc']),4))
    print("CV f1 Score:", round(np.average(scores['test_f1_macro']),4))
    print("-----------------------------------")


## grid search for random forest.
gs = True
if gs:
    print('Starting grid search for optimal parameters')
    rf_gs = RandomForestClassifier()
    param_grid = {
        'class_weight': ["balanced",'balanced_subsample'],
        'max_depth': [10,20,30],
        'n_estimators': [100,200],
        'min_samples_leaf': [1,2,4],
    }
    
    grid_search = GridSearchCV(rf_gs, param_grid, cv=5, scoring='f1_macro', verbose = 10,n_jobs = -1)
    grid_search.fit(X_train, y_train)

    print(("best RF from grid search: %.3f"
           % grid_search.score(X_train, y_train)))
    
print('Grid Search best parameters: {}'.format(grid_search.best_params_))

# code to extract the most important features
get_imp = True
if get_imp:    
    importances = rf.steps[1][1].feature_importances_
    feature_names = rf['preprocessor'].transformers_[1][1]['onehot']\
                   .get_feature_names(categorical_features)               
    df_feat_imp = pd.DataFrame()
    df_feat_imp['Features'] = numeric_features + list(feature_names)
    df_feat_imp['Importance'] = importances.tolist()
    df_feat_imp = df_feat_imp.sort_values(by='Importance',ascending=False)
    print(df_feat_imp.head(10))
    df_feat_imp.to_csv("feature_importance.csv")    
    

Training model
Predicting on test set now
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1582
           1       0.99      0.90      0.94      1628
           2       0.93      0.79      0.85      1593
           3       0.57      0.54      0.56      1569
           4       0.55      0.73      0.63      1551
           5       0.96      0.94      0.95      1616

    accuracy                           0.81      9539
   macro avg       0.83      0.81      0.82      9539
weighted avg       0.83      0.81      0.82      9539

F1 score: 0.818
-----------------------------------
CV Accuracy: 0.8078
CV f1 Score: 0.8132
-----------------------------------
Starting grid search for optimal parameters
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimators=100
[CV 1/5; 1/36] END class_weight=balanced, max_depth=10, min_samples_leaf=1, n_estimato

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[CV 3/5; 15/36] START class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=100
[CV 3/5; 15/36] END class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=100;, score=0.809 total time=   7.9s
[CV 5/5; 15/36] START class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=100
[CV 5/5; 15/36] END class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=100;, score=0.811 total time=   8.4s
[CV 2/5; 16/36] START class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=200
[CV 2/5; 16/36] END class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=200;, score=0.810 total time=  17.5s
[CV 4/5; 16/36] START class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=200
[CV 4/5; 16/36] END class_weight=balanced, max_depth=30, min_samples_leaf=2, n_estimators=200;, score=0.811 total time=  16.5s
[CV 1/5; 17/36] START class_weight=balanced, max_depth=30, min_samples_leaf=4, n_estimators=100
[CV 1/5; 17

AttributeError: 'RandomForestClassifier' object has no attribute 'steps'

(9539,)


[CV 3/5; 29/36] END class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=100;, score=0.810 total time=   8.1s
[CV 5/5; 29/36] START class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=100
[CV 5/5; 29/36] END class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=100;, score=0.814 total time=   8.3s
[CV 2/5; 30/36] START class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=200
[CV 2/5; 30/36] END class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=200;, score=0.816 total time=  17.0s
[CV 4/5; 30/36] START class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=200
[CV 4/5; 30/36] END class_weight=balanced_subsample, max_depth=20, min_samples_leaf=4, n_estimators=200;, score=0.815 total time=  16.5s
[CV 1/5; 31/36] START class_weight=balanced_subsample, max_depth=30, min_samples_leaf=1, n_estimators=100
[CV 1/5; 31/36] END class_w

In [48]:
#Training the best RF model with the gridsearch parameters 
rf_best = RandomForestClassifier(criterion = "gini", max_depth=20, min_samples_leaf = 1, n_estimators=200, 
                            random_state=0, class_weight = 'balanced_subsample', bootstrap=True, n_jobs=-1)

rf_best.fit(X_train,y_train)
y_pred = rf_best.predict(X_test)
print(classification_report(y_test, y_pred))
print('F1 score:',round(f1_score(y_test,y_pred, average = 'macro'),3))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1582
           1       0.99      0.90      0.94      1628
           2       0.94      0.78      0.85      1593
           3       0.59      0.52      0.55      1569
           4       0.55      0.78      0.64      1551
           5       0.96      0.93      0.94      1616

    accuracy                           0.82      9539
   macro avg       0.83      0.81      0.82      9539
weighted avg       0.84      0.82      0.82      9539

F1 score: 0.819


# XGBoost Classifier

In [55]:
from xgboost import XGBClassifier


#implementation of XGBoost classifier
model = XGBClassifier()
xg_model = model.fit(X_train, np.ravel(y_train))

#cv scores
scores = cross_val_score(xg_model, X_train, np.ravel(y_train), cv=5, scoring='f1_macro')
print(f'CV scores: {scores}')
print(f'Mean CV scores: {np.mean(scores)}')


#Perform prediction 
y_pred_xgboost = xg_model.predict(X_test)
print(classification_report(y_test,y_pred))





















CV scores: [0.8329006  0.83050686 0.82689238 0.82503561 0.81925705]
Mean CV scores: 0.8269185009679015
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1582
           1       0.99      0.90      0.94      1628
           2       0.94      0.78      0.85      1593
           3       0.59      0.52      0.55      1569
           4       0.55      0.78      0.64      1551
           5       0.96      0.93      0.94      1616

    accuracy                           0.82      9539
   macro avg       0.83      0.81      0.82      9539
weighted avg       0.84      0.82      0.82      9539

