# Import Libraries & Tools

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression 
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, balanced_accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report, precision_score, f1_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, VotingRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [69]:
#hate_10k['tweet'].str.contains('bitch').map(lambda x : int(x))
#hate_10k.head(55)

# Spacy Test

**Observation:** Checking which words are currently in spaCy's English stop-word list.

In [70]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Report the size of the stop-word set, then the set itself, one per line.
print(len(STOP_WORDS), STOP_WORDS, sep="\n")

326
{'former', 'other', 'sometimes', 'whole', 'third', 'whereas', 'its', 'himself', 'three', 'not', 'thence', 'what', 'be', 'nevertheless', 'does', 'whatever', 'whereafter', 'must', 'now', 'either', 'twelve', '‘s', 'besides', 'i', 'we', "'ll", 'twenty', 'side', 'call', 'nine', 'who', 'off', 'almost', 'every', 'anyway', 'below', 'that', 'to', 'from', 'do', 'everywhere', 'them', 'some', 'nobody', 'sixty', 'few', 'full', 'or', 'seeming', 'for', 'always', 'our', 'beyond', 'thereby', '‘m', 'too', 'go', 'neither', 'you', 'show', 'seem', 'ours', 'this', 'namely', 'wherein', 'afterwards', 'various', 'your', 'nothing', 'otherwise', 'just', 'front', 'herein', 'at', '’d', 'about', 'into', 'first', 'latterly', 'much', 'serious', 'me', 'around', 'next', 'with', '’ll', 'themselves', 'because', 'above', "n't", 'however', 'through', 'already', 'her', '’ve', 'a', 'been', 'something', 'using', 'will', 'get', 'whereby', 'but', 'across', 'they', 'and', 'several', 'should', 'last', "'ve", 'via', 'perhaps',

**Observation:** Just testing out spaCy on a sample sentence before applying it to my DataFrame.

In [96]:
import spacy
# Load the English pipeline and parse one sample sentence.
nlp = spacy.load('en_core_web_md')

sentence = nlp("We will go to movie after the dinner")
print(sentence)

# Split the token texts into two lists according to each token's is_stop flag.
notStopWords = []
stopWords = []
for token in sentence:
    if token.is_stop:
        stopWords.append(token.text)
    else:
        notStopWords.append(token.text)

print(notStopWords)
print(stopWords)

We will go to movie after the dinner
['movie', 'dinner']
['We', 'will', 'go', 'to', 'after', 'the']


In [97]:
import pandas as pd

In [98]:
# Read the cleaned tweet sample from disk and show it as the cell output.
hate10k = pd.read_csv(filepath_or_buffer='../Data/hate10k_clean.csv')
hate10k

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,RT mayasolovely As a woman you shouldn't comp...
1,3,0,3,0,1,RT mleew17 boy dats cold...tyga dwn bad for c...
2,3,0,3,0,1,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...
3,3,0,2,1,1,RT C_G_Anderson viva_based she look like a tr...
4,6,0,6,0,1,RT ShenikaRoberts The shit you hear about me ...
...,...,...,...,...,...,...
9995,3,0,3,0,1,"I ain't trying to fuck, bitch. I just want wings."
9996,6,0,6,0,1,I aint mad at you bitches thats what hoes do
9997,3,0,3,0,1,"I aint mad at you, thats what hoes do"
9998,3,0,3,0,1,I aint never had a prob with no other bitch ov...


In [99]:
# Strip the annotation/label columns so only the remaining columns (incl. the tweet text) are kept.
tweets_txt = hate10k.drop(
    ['count', 'hate_speech', 'offensive_language', 'neither', 'class'],
    axis=1,
)

In [100]:
# Check what kind of object tweets_txt is after the column drop.
type(tweets_txt)

pandas.core.frame.DataFrame

**Observation:** I need to export my tweet data to a text file, so I isolated the tweets and wrote them out using only tab spacing as the separator.

In [101]:
# Display tweets_txt as the cell output.
tweets_txt

Unnamed: 0,tweet
0,RT mayasolovely As a woman you shouldn't comp...
1,RT mleew17 boy dats cold...tyga dwn bad for c...
2,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...
3,RT C_G_Anderson viva_based she look like a tr...
4,RT ShenikaRoberts The shit you hear about me ...
...,...
9995,"I ain't trying to fuck, bitch. I just want wings."
9996,I aint mad at you bitches thats what hoes do
9997,"I aint mad at you, thats what hoes do"
9998,I aint never had a prob with no other bitch ov...


In [77]:
#Must convert tweets to txt file to use Spacy
#tweets_txt.to_csv('tweet_txt.txt', index=False)

# Testing Out Spacy

In [78]:
# Load the exported tweet text file (tab-delimited) back into a DataFrame and show it.
tweet = pd.read_csv('../Data/tweet_txt.txt', delimiter="\t")
tweet

Unnamed: 0,tweet
0,RT mayasolovely As a woman you shouldn't comp...
1,RT mleew17 boy dats cold...tyga dwn bad for c...
2,RT UrKindOfBrand Dawg RT 80sbaby4life You eve...
3,RT C_G_Anderson viva_based she look like a tr...
4,RT ShenikaRoberts The shit you hear about me ...
...,...
9995,"I ain't trying to fuck, bitch. I just want wings."
9996,I aint mad at you bitches thats what hoes do
9997,"I aint mad at you, thats what hoes do"
9998,I aint never had a prob with no other bitch ov...


In [105]:
# Build a blank English pipeline whose only component is an entity ruler,
# then register one exact-match pattern per tweet (plus lettered variants).

# NOTE(review): `stops` is defined but does not influence which patterns are
# built; kept so any other cell that references it still works.
stops = ['pussy', 'bitch']
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
patterns = []

# NOTE(review): `tweets` is not defined in any visible cell — presumably an
# iterable of tweet strings; confirm it is created before this cell on Run All.
# The loop variable is deliberately NOT called `tweet`, so it does not
# overwrite the `tweet` DataFrame loaded from tweet_txt.txt.
for tweet_text in tweets:
    # The tweet itself, labelled 'gay'.
    patterns.append({"label": 'gay', 'pattern': tweet_text})
    # The tweet followed by ".A" ... ".Z", same label.
    patterns.extend(
        {"label": 'gay', 'pattern': tweet_text + f".{letter}"}
        for letter in letters
    )

ruler.add_patterns(patterns)

In [106]:
# Collapse `tweets` into a single string via str() and preview its first 2000 characters.
# NOTE(review): if `tweets` is a Python list, str() yields the list's printed form,
# so the text includes brackets, quote characters and backslash escapes — confirm
# that this is the intended input for nlp().
text = str(tweets)
text[:2000]

'[" RT mayasolovely As a woman you shouldn\'t complain about cleaning up your house. &amp; as a man you should always take the trash out...", \' RT mleew17 boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place\', \' RT UrKindOfBrand Dawg RT 80sbaby4life You ever fuck a bitch and she start to cry? You be confused as shit\', \' RT C_G_Anderson viva_based she look like a tranny\', \' RT ShenikaRoberts The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;\', \'"T_Madison_x The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes &#128514;&#128514;&#128514;"\', \'"__BrighterDays I can not just sit up and HATE on another bitch .. I got too much shit going on"\', "&#8220;selfiequeenbri cause I\'m tired of you big bitches coming for us skinny girls&#8221;", \'" &amp; you might not get ya bitch back &amp; thats that "\', \'" rhythmixx_ hobbies include fighting Mariam"\\n\\nbitch\', \'" Keeks is a

In [107]:
from spacy import displacy

In [108]:
# Run the current pipeline over the text, list every entity it found together
# with its label, then render the entities inline with displaCy.
doc = nlp(text)
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")
displacy.render(doc, style="ent")

"T_Madison_x The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes &#128514;&#128514;&#128514;" gay
"__BrighterDays I can not just sit up and HATE on another bitch .. I got too much shit going on" gay
&#8220;selfiequeenbri cause I'm tired of you big bitches coming for us skinny girls&#8221; gay
" &amp; you might not get ya bitch back &amp; thats that " gay
" Keeks is a bitch she curves everyone " lol I walked into a conversation like this. Smh gay
" Murda Gang bitch its Gang Land " gay
" So hoes that smoke are losers ? " yea ... go on IG gay
" bad bitches is the only thing that i like " gay
" bitch get up off me " gay
" bitch nigga miss me with it " gay
" bitch plz whatever " gay
" bitch who do you love " gay
" bitches get cut off everyday B " gay
" black bottle &amp; a bad bitch " gay
" broke bitch cant tell me nothing " gay
" cancel that bitch like Nino " gay
" cant you see these hoes wont change " gay
" fuck no that bitch dont even suck dick

# Matcher

In [84]:
from spacy.matcher import Matcher

In [109]:
# Rebind `nlp` to the en_core_web_md pipeline loaded from disk.
nlp = spacy.load("en_core_web_md")

In [None]:
# Describe what to find in the text.
# A pattern is a list of dicts, one dict per token; each key must be a token
# attribute the Matcher understands (e.g. LOWER, LEMMA, POS).
matcher = Matcher(nlp.vocab)
# LOWER compares against the lower-cased token text, so casing is ignored.
pattern = [{"LOWER": "gay"}]
matcher.add("gay", [pattern])

In [None]:
# Parse a one-word document and run the matcher over it.
doc = nlp("gay")
# Call the Matcher object itself; `matches` only exists once this assignment has run.
matches = matcher(doc)


In [None]:
# Show the raw contents of `matches`.
print(matches)

In [None]:
# Take the first element of the first match, look it up in the pipeline's
# vocabulary, and print the text stored there.
# NOTE(review): presumably that element is the match ID of the rule that fired — confirm.
# NOTE(review): assuming `matches` is a list, this raises IndexError when it is empty.
print(nlp.vocab[matches[0][0]].text)

In [92]:
# Coerce `text` to str and preview its first 2000 characters.
# NOTE(review): presumably a no-op if `text` is already the string built from
# `tweets` — confirm whether this cell is still needed.
text = str(text)
text[:2000]

'[" RT mayasolovely As a woman you shouldn\'t complain about cleaning up your house. &amp; as a man you should always take the trash out...", \' RT mleew17 boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place\', \' RT UrKindOfBrand Dawg RT 80sbaby4life You ever fuck a bitch and she start to cry? You be confused as shit\', \' RT C_G_Anderson viva_based she look like a tranny\', \' RT ShenikaRoberts The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;\', \'"T_Madison_x The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes &#128514;&#128514;&#128514;"\', \'"__BrighterDays I can not just sit up and HATE on another bitch .. I got too much shit going on"\', "&#8220;selfiequeenbri cause I\'m tired of you big bitches coming for us skinny girls&#8221;", \'" &amp; you might not get ya bitch back &amp; thats that "\', \'" rhythmixx_ hobbies include fighting Mariam"\\n\\nbitch\', \'" Keeks is a

In [93]:
# Rebind `nlp` to the en_core_web_md pipeline loaded from disk.
nlp = spacy.load("en_core_web_md")

In [112]:
# The extraction
# Match every single token whose lemma is "gay".
matcher = Matcher(nlp.vocab)
pattern = [{'LEMMA': "gay"}]
matcher.add("gay", [pattern])

doc = nlp(text)
matches = matcher(doc)

# Total number of hits, followed by up to the first 1000 hits: each is printed
# as its (id, start, end) tuple next to the slice of the doc it covers.
print(len(matches))
for match_id, start, end in matches[:1000]:
    print((match_id, start, end), doc[start:end])


65
(5754201015754630361, 964, 965) Gay
(5754201015754630361, 2164, 2165) gay
(5754201015754630361, 8964, 8965) gay
(5754201015754630361, 12926, 12927) gay
(5754201015754630361, 14749, 14750) gay
(5754201015754630361, 17061, 17062) gay
(5754201015754630361, 23020, 23021) gay
(5754201015754630361, 26854, 26855) gay
(5754201015754630361, 31171, 31172) gay
(5754201015754630361, 31628, 31629) gay
(5754201015754630361, 37781, 37782) gay
(5754201015754630361, 41279, 41280) gay
(5754201015754630361, 49181, 49182) gay
(5754201015754630361, 49706, 49707) gays
(5754201015754630361, 54496, 54497) gay
(5754201015754630361, 57593, 57594) gay
(5754201015754630361, 62176, 62177) gay
(5754201015754630361, 62824, 62825) gay
(5754201015754630361, 71823, 71824) gay
(5754201015754630361, 72235, 72236) gay
(5754201015754630361, 74291, 74292) gay
(5754201015754630361, 75638, 75639) gay
(5754201015754630361, 77572, 77573) gay
(5754201015754630361, 82647, 82648) gay
(5754201015754630361, 85268, 85269) gay
(575

In [113]:
# Multi-token phrases: the "+" operator lets the pattern cover one or more
# consecutive tokens whose lemma is "gay".
matcher = Matcher(nlp.vocab)
pattern = [
    {'LEMMA': "gay", "OP": "+"},
]
matcher.add("gay", [pattern])

doc = nlp(text)
matches = matcher(doc)

print(len(matches))
# Print up to the first 1000 hits with the doc slice each one covers.
for hit in matches[:1000]:
    _, start, end = hit
    print(hit, doc[start:end])


68
(5754201015754630361, 964, 965) Gay
(5754201015754630361, 2164, 2165) gay
(5754201015754630361, 8964, 8965) gay
(5754201015754630361, 12926, 12927) gay
(5754201015754630361, 14749, 14750) gay
(5754201015754630361, 17061, 17062) gay
(5754201015754630361, 23020, 23021) gay
(5754201015754630361, 26854, 26855) gay
(5754201015754630361, 31171, 31172) gay
(5754201015754630361, 31628, 31629) gay
(5754201015754630361, 37781, 37782) gay
(5754201015754630361, 41279, 41280) gay
(5754201015754630361, 49181, 49182) gay
(5754201015754630361, 49706, 49707) gays
(5754201015754630361, 54496, 54497) gay
(5754201015754630361, 57593, 57594) gay
(5754201015754630361, 62176, 62177) gay
(5754201015754630361, 62824, 62825) gay
(5754201015754630361, 71823, 71824) gay
(5754201015754630361, 72235, 72236) gay
(5754201015754630361, 74291, 74292) gay
(5754201015754630361, 75638, 75639) gay
(5754201015754630361, 77572, 77573) gay
(5754201015754630361, 82647, 82648) gay
(5754201015754630361, 85268, 85269) gay
(575

In [None]:
# Extra tuning layer: same "+" pattern, registered with greedy="LONGEST".
matcher = Matcher(nlp.vocab)
pattern = [{'LEMMA': "gay", "OP": "+"}]
matcher.add("gay", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)

print(len(matches))
# Print up to the first 1000 hits with the doc slice each one covers.
for match_id, start, end in matches[:1000]:
    print((match_id, start, end), doc[start:end])


In [None]:
# Same greedy="LONGEST" matcher, with the hits ordered by their start index.
matcher = Matcher(nlp.vocab)
pattern = [{'LEMMA': "gay", "OP": "+"}]
matcher.add("gay", [pattern], greedy="LONGEST")

doc = nlp(text)
# Order by the second tuple element (the start of the hit).
matches = sorted(matcher(doc), key=lambda hit: hit[1])

print(len(matches))
for hit in matches[:1000]:
    print(hit, doc[hit[1]:hit[2]])

In [None]:
# Tuning with linguistic features: match on part of speech instead of a lemma.
# The pattern is one or more VERB tokens followed by one further VERB token.
matcher = Matcher(nlp.vocab)
pattern = [
    {'POS': "VERB", "OP": "+"},
    {"POS":"VERB"},
]
matcher.add("VERB", [pattern], greedy="LONGEST")

doc = nlp(text)
# Order the hits by their start index.
matches = sorted(matcher(doc), key=lambda hit: hit[1])

print(len(matches))
for match_id, start, end in matches[:1000]:
    print((match_id, start, end), doc[start:end])

In [None]:
# For spaCy, clean your text as much as possible first:
# quotation marks and other symbols can throw off the results.

In [114]:
# Tuned extraction: a multi-part token pattern anchored on proper nouns.
matcher = Matcher(nlp.vocab)
pattern = [
    {'POS': "PROPN", "OP": "+"},    # one or more proper nouns
    {"POS":"PROPN"},                # followed by one more proper noun
    {"IS_ALPHA": True, "OP": "+"},  # then one or more alphabetic tokens
    {"IS_PUNCT": True, "OP": "*"},  # then any amount of punctuation (possibly none)
    {"ORTH": "'"},                  # ending on a token that is exactly an apostrophe
]
matcher.add("PROPN", [pattern], greedy="LONGEST")

doc = nlp(text)
# Order the hits by their start index.
matches = sorted(matcher(doc), key=lambda hit: hit[1])

print(len(matches))
# Print up to the first 1000 hits with the doc slice each one covers.
for hit in matches[:1000]:
    _, start, end = hit
    print(hit, doc[start:end])

606
(96, 250, 260) Murda Gang bitch its Gang Land "', '
(96, 562, 572) Murder Game pussy nigga shut up "', '
(96, 928, 939) Pill Chamberlain these bitches love my music "', '
(96, 944, 960) Teanna Trump probably cleaner than most of these twitter hoes but........."', '
(96, 1282, 1294) emoji doe?"y he say she looked like scream lmao', '
(96, 1407, 1413) Damn Skippy lol', '
(96, 1543, 1551) niggas act like bitches..', '
(96, 3220, 3238) KelsieBelsi You\'re not a man if you refer to every girl as a bitch"', '
(96, 3392, 3407) KissMySmilee Don\'t got time for bitches to be actin iffy."', '
(96, 4006, 4012) south america bitch', '
(96, 4022, 4031) toto santi is like nasty pussy', '
(96, 4629, 4640) dis bitch in da wrong tree bro.', '
(96, 5087, 5101) SmeegBaby Dese hoes be LYIN to all of us nigga"', '
(96, 6004, 6018) marackaf aye b you pullin hoes just like o taught you', '
(96, 6160, 6167) ya homeboy fucc nicca', '
(96, 8834, 8841) JUDYANN\'S SO PRETTY', '
(96, 9539, 9547) fuckin yo bitc