In [None]:
import pandas as pd

In [None]:
# Load the tweets dataset (the file name suggests it has already been cleaned).
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/cyberbullying_tweets_clean.csv')

In [None]:
from sklearn.model_selection import train_test_split
# Label column on one side, every remaining column as features on the other.
y = df['cyberbullying_type']
X = df.drop(columns='cyberbullying_type')

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,       # stratify the split on the label column
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Sanity check: shapes of the four splits, in (X_train, y_train, X_test, y_test) order.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# Text branch: raw token counts, then TF-IDF re-weighting of those counts.
transformer = Pipeline([
    ('countVectorizer', CountVectorizer()),
    ('tfIdfTransformer', TfidfTransformer()),
])

# Route the 'tweet_text' column through the text branch; any other columns
# in the frame are passed through untouched.
preprocessor = ColumnTransformer(
    [('transformer', transformer, 'tweet_text')],
    remainder='passthrough',
)

# Baseline classifier, wrapped in its own single-step pipeline.
classifier = Pipeline([
    ('logisticRegressionClassifier', LogisticRegression(max_iter=5000)),
])

In [None]:
# Baseline end-to-end model: feature extraction followed by the classifier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [None]:
# Fit the baseline pipeline (preprocessing + logistic regression) on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import f1_score

# Evaluate the baseline on the held-out split with the weighted-average F1 score.
y_pred = model.predict(X=X_test)
f1_metric = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1_metric

In [None]:
# Hyperparameter optimisation

from skopt import BayesSearchCV
from sklearn.metrics import f1_score, make_scorer

# Weighted-F1 scorer used to rank hyperparameter candidates; this is the same
# metric the notebook reports on the held-out split.
f1_scorer = make_scorer(f1_score, average='weighted')

# LogisticRegression hyperparameters to explore, keyed by their plain
# parameter names.
tunable_parameters = {
    'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty' : ['l2'],
    'C' : [100, 10, 1.0, 0.1, 0.01]
}

# Same single-step layout as the baseline classifier so step names stay
# consistent across the notebook.
classifier = Pipeline(steps=[('logisticRegressionClassifier',
                              LogisticRegression(max_iter=1000))])

# The estimator being searched is the full pipeline, so each hyperparameter
# must be addressed through the nesting:
#   <outer step>__<inner step>__<parameter>
search_spaces = {
    f'classifier__logisticRegressionClassifier__{name}': values
    for name, values in tunable_parameters.items()
}

# Search over the *whole* pipeline rather than only the classifier.
# If the search wrapped just the classifier, the count/TF-IDF preprocessor
# would be fitted once on the entire training split before cross-validation,
# letting vocabulary and IDF statistics from each validation fold leak into
# the features and inflating the CV scores used to pick hyperparameters.
# With the pipeline inside the search, the preprocessor is re-fitted on the
# training part of every fold. The estimator is cloned by the search, so the
# already-fitted `preprocessor` object from the baseline is not reused as-is.
#
# `model` still exposes .fit() / .predict(); after fitting, the refitted best
# pipeline is available as `model.best_estimator_`.
model = BayesSearchCV(Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', classifier)]),
                      search_spaces,
                      n_iter=50,
                      cv=5,
                      scoring=f1_scorer,
                      n_jobs=5,
                      random_state=42)  # same seed as the train/test split, for reproducible searches

In [None]:
# Fit the tuned model on the training split. This runs the Bayesian
# hyperparameter search (50 iterations x 5 folds), so expect it to take
# noticeably longer than the baseline fit.
model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the tuned model on the held-out split with the weighted-average F1 score.
y_pred = model.predict(X=X_test)
f1_metric = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1_metric

In [None]:
# TODO: Try tree-based models and explain feature and word importances; finally, build an ensemble model using XGBoost and Logistic Regression.
# TODO: Deploy the model behind a local API that returns the result for a given sentence.
# TODO: Maybe build a web page that takes a Twitter URL and returns the result?