In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression 
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, balanced_accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report, precision_score, f1_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, VotingRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [None]:
#%pip install scattertext

In [12]:
hate_10k = pd.read_csv('../Data/hate10k_clean.csv')
hate_10k.head(2)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,RT mayasolovely As a woman you shouldn't comp...
1,3,0,3,0,1,RT mleew17 boy dats cold...tyga dwn bad for c...


**Observation** For the ScatterText to work, you have to convert the classifying categories back into words. This is the code below to do so. 

In [22]:
tweeting = hate_10k.copy()

In [30]:
tweeting['class'] = tweeting['class'].replace(0, 'hate_speech')

In [32]:
tweeting['class'] = tweeting['class'].replace(1, 'offensive_language')

In [34]:
tweeting['class'] = tweeting['class'].replace(2, 'neither')

In [35]:
tweeting

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,neither,RT mayasolovely As a woman you shouldn't comp...
1,3,0,3,0,offensive_language,RT mleew17 boy dats cold...tyga dwn bad for c...
2,3,0,3,0,offensive_language,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...
3,3,0,2,1,offensive_language,RT C_G_Anderson viva_based she look like a tr...
4,6,0,6,0,offensive_language,RT ShenikaRoberts The shit you hear about me ...
...,...,...,...,...,...,...
9995,3,0,3,0,offensive_language,"I ain't trying to fuck, bitch. I just want wings."
9996,6,0,6,0,offensive_language,I aint mad at you bitches thats what hoes do
9997,3,0,3,0,offensive_language,"I aint mad at you, thats what hoes do"
9998,3,0,3,0,offensive_language,I aint never had a prob with no other bitch ov...


In [45]:
tweeting = tweeting.drop(columns=['count', 'hate_speech', 'offensive_language', 'neither'])

In [46]:
tweeting

Unnamed: 0,class,tweet
0,neither,RT mayasolovely As a woman you shouldn't comp...
1,offensive_language,RT mleew17 boy dats cold...tyga dwn bad for c...
2,offensive_language,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...
3,offensive_language,RT C_G_Anderson viva_based she look like a tr...
4,offensive_language,RT ShenikaRoberts The shit you hear about me ...
...,...,...
9995,offensive_language,"I ain't trying to fuck, bitch. I just want wings."
9996,offensive_language,I aint mad at you bitches thats what hoes do
9997,offensive_language,"I aint mad at you, thats what hoes do"
9998,offensive_language,I aint never had a prob with no other bitch ov...


In [None]:
import spacy
import spacy.cli
import scattertext as st

spacy.cli.download("en_core_web_md")

# Neither vs. Hate Speech

In [51]:
#  https://github.com/JasonKessler/scattertext
#  Some code taken from above with the help of Nate Cox


# This section of code generates an interactive scattertext plot and saves it to an html file.

nlp = spacy.load('en_core_web_md')

corpus = st.CorpusFromPandas(
    tweeting,                   # change this (data frame)
    category_col='class',     # change this (column)
    text_col='tweet',              # change this (what you want to look at)
    nlp=nlp
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='neither',          # change this(category_col value)
    category_name='neither',     # change this(category_col value)
    not_category_name='hate_speech', # change this(category_col value)
    width_in_pixels=1000,
)

open('../hate10k_clean.html', 'w', encoding='utf-8').write(html)

1734661

# Hate Speech vs. Offensive Language

In [52]:
#  https://github.com/JasonKessler/scattertext
#  Some code taken from above with the help of Nate Cox


# This section of code generates an interactive scattertext plot and saves it to an html file.

nlp = spacy.load('en_core_web_md')

corpus = st.CorpusFromPandas(
    tweeting,                   # change this (data frame)
    category_col='class',     # change this (column)
    text_col='tweet',              # change this (what you want to look at)
    nlp=nlp
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='hate_speech',          # change this(category_col value)
    category_name='hate_speech',     # change this(category_col value)
    not_category_name='offensive_language', # change this(category_col value)
    width_in_pixels=1000,
)

open('../hate10k_clean.html', 'w', encoding='utf-8').write(html)

1734524

# Neither vs. Offensive Language

In [53]:
#  https://github.com/JasonKessler/scattertext
#  Some code taken from above with the help of Nate Cox


# This section of code generates an interactive scattertext plot and saves it to an html file.

nlp = spacy.load('en_core_web_md')

corpus = st.CorpusFromPandas(
    tweeting,                   # change this (data frame)
    category_col='class',     # change this (column)
    text_col='tweet',              # change this (what you want to look at)
    nlp=nlp
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='neither',          # change this(category_col value)
    category_name='neither',     # change this(category_col value)
    not_category_name='offensive_language', # change this(category_col value)
    width_in_pixels=1000,
)

open('../hate10k_clean.html', 'w', encoding='utf-8').write(html)

1734668