In [110]:
import pandas as pd
import numpy as np
import re

from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [91]:
df = pd.read_csv("../data/cyberbullying_tweets.csv")

In [92]:
df.shape

(47692, 2)

In [93]:
#https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    vTEXT = re.sub('@[^\s]+','', vTEXT)
    return(vTEXT)

In [94]:
#remove links 
df['no_links_text'] = [remove_urls(i) for i in df['tweet_text']]

#character and word length
df['char_len'] = [len(i) for i in df.no_links_text]
df['word_count'] = [len(i.split()) for i in df.no_links_text]


df['contains_link'] = np.where(df['tweet_text'].str.contains('http'),True,False)
df['is_retweet'] = np.where(df['tweet_text'].str.contains('RT'),True,False)


df.head(5)

Unnamed: 0,tweet_text,cyberbullying_type,no_links_text,char_len,word_count,contains_link,is_retweet
0,"In other words #katandandre, your food was cra...",not_cyberbullying,"In other words #katandandre, your food was cra...",61,9,False,False
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,Why is #aussietv so white? #MKR #theblock #ImA...,115,14,False,False
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,a classy whore? Or more red velvet cupcakes?,45,8,False,False
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,"meh. :P thanks for the heads up, but not too...",93,17,False,False
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,This is an ISIS account pretending to be a Ku...,89,17,False,False


In [95]:
df = pd.read_csv(r'C:\Users\Dask\Downloads\cleaned_tweets_sample.csv')

In [96]:
df = df.dropna()

In [97]:
X = df['no_links_text']
y = df['cyberbullying_type']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 42,
                                                    stratify=y,
                                                    test_size = 0.2)

In [113]:
cv = CountVectorizer()
tf = TfidfVectorizer()

In [None]:
#support vector

In [114]:
svc = SVC()
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('svc', SVC())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9078810020876826, 0.7693110647181628)

In [117]:
svc = SVC()
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('svc', SVC())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9903444676409185, 0.7891440501043842)

In [None]:
#random forests

In [118]:
rfc = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('rfc', RandomForestClassifier())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.994258872651357, 0.7964509394572025)

In [119]:
rfc = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('rfc', RandomForestClassifier())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.994258872651357, 0.8048016701461378)

In [None]:
#logistic regression

In [102]:
logreg = LogisticRegression()
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('logreg', LogisticRegression())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9856471816283925, 0.7985386221294363)

In [None]:
#KNN

In [103]:
knc = KNeighborsClassifier()
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('knc', KNeighborsClassifier())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6495302713987474, 0.4812108559498956)

In [None]:
#Decision trees

In [104]:
dtc = DecisionTreeClassifier()
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('dtc', DecisionTreeClassifier())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.994258872651357, 0.7745302713987474)

In [None]:
#ADAboosted

In [105]:
abc = AdaBoostClassifier()
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('abc', AdaBoostClassifier())
])
pipe.fit(X_train, y_train)

pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6829331941544885, 0.6691022964509394)

In [None]:
#Best model

In [143]:
tf_log = Pipeline([
    ('tf', TfidfVectorizer(stop_words = 'english', min_df=4)), 
    ('lr', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200))
])

tf_log.fit(X_train, y_train)
tf_log.score(X_train, y_train), tf_log.score(X_test, y_test)

(0.9099686847599165, 0.7995824634655533)

In [145]:
t_df = pd.read_csv(r'C:\Users\Dask\Documents\Code\dsi\project5\cyberbullying-classification\data\realdonaldtrump.csv')

In [144]:
preds = tf_log.predict(t_df['content'])
pred_df = pd.DataFrame(preds, columns=['predictions'])
pred_df.value_counts()

predictions        
not_cyberbullying      21152
other_cyberbullying    19700
religion                1499
gender                   421
ethnicity                396
age                      184
dtype: int64