In [140]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn. model_selection import train_test_split
from sklearn. tree import DecisionTreeClassifier

In [141]:
def get_top_n_words(corpus, n=None):
    '''
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    Args:
        corpus (list): a list of text documents.
        n (int): number of top words to return.
    '''
    assert isinstance(corpus, list), "This must be a list!"
    assert isinstance(n, int), "This must be an integer!"

    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(corpus)
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[1]
    df_tfidfvectorizer = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])

    commentsTF_IDF = df_tfidfvectorizer.sort_values(by=["tfidf"],ascending=False)
    return commentsTF_IDF.head(n)



In [143]:

comments = pd.read_csv('data/data.csv', encoding='utf-8')
df = pd.DataFrame(comments)
df.drop(['Number'], axis=1, inplace=True) # Drop the Number column (cleaning up the data)
vid1 = vid2 = vid3 = vid4 = vid5 = df


## Let's Find the Top 15 Words in Each Video

### Video 1: "Women Should Not Be in Combat Roles: Change My Mind"

In [144]:
vid1 = vid1[vid1.Video == 1]
vid1List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid1List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 2: "The Problem With Modern Women"

In [145]:
vid2 = df[df.Video == 2]
vid2List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid2List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 3: "Tucker Carlson Gives CNN Some Tips About Sexism in Hilarious Segment"

In [146]:
vid3 = df[df.Video == 3]
vid3List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid3List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 4: "WOMAN DEFENDS ANDREW TATE AND ARGUES WITH FEMINISTS AND TRANGENDERS"

In [147]:
vid4 = df[df.Video == 4]
vid4List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid4List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 5: "Massive Feminist March Against Gender Violence in Rome"

In [148]:
vid5 = df[df.Video == 5]
vid5List = vid5["Comment"].values.tolist()
print(get_top_n_words(vid5List, 15))

               tfidf
their        0.37430
truckers     0.20114
leaders      0.20114
freezing     0.20114
efforts      0.20114
govt         0.20114
ottawa       0.20114
bank         0.20114
canada       0.20114
least        0.20114
arresting    0.20114
fundraising  0.20114
isn          0.20114
accounts     0.20114
associated   0.20114




### Top 15 Words Overall:

In [149]:
df.drop(['Video'], axis=1, inplace=True) # Drop the video column (cleaning up the data)
commentsList = df["Comment"].values.tolist()
print(get_top_n_words(commentsList, 15))

             tfidf
be        0.245173
injured   0.243024
affected  0.243024
more      0.234565
would     0.223824
woman     0.211882
in        0.198536
their     0.189318
even      0.185259
those     0.185259
being     0.175961
than      0.161941
by        0.151469
combat    0.149216
it        0.145994




## Now Let's Detect Hate Speech

In [150]:
import nltk
import re
import string
nltk. download('stopwords')
from nltk. corpus import stopwords
from sklearn. metrics import accuracy_score
stopword=set(stopwords.words('english'))
stemmer = nltk. SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ymorsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [152]:
data = pd. read_csv("data/labeled_data.csv")
print(data. head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


### Let's Pre-Process the Data

In [153]:
data["labels"] = data["class"]. map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
data = data[["tweet", "labels"]]
print(data. head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No Hate and Offensive Speech  
1              Offensive Speech  
2              Offensive Speech  
3              Offensive Speech  
4              Offensive Speech  


In [154]:
def clean(text):
    text = str(text). lower()
    text = re.sub('[.?]', '', text) 
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re. escape(string. punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer. stem(word) for word in text. split(' ')]
    text = " ".join(text)
    return text

In [155]:
data["tweet"].apply(clean)

0         rt mayasolov woman shouldnt complain clean ho...
1         rt mlee boy dat coldtyga dwn bad cuffin dat h...
2         rt urkindofbrand dawg rt babif ever fuck bitc...
3                   rt cganderson vivabas look like tranni
4         rt shenikarobert shit hear might true might f...
                               ...                        
24778    yous muthafin lie 0lifeask earl coreyemanuel r...
24779    youv gone broke wrong heart babi drove redneck...
24780    young buck wanna eat dat nigguh like aint fuck...
24781                       youu got wild bitch tellin lie
24782    ruffl  ntac eileen dahlia  beauti color combin...
Name: tweet, Length: 24783, dtype: object

In [156]:
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [97]:
#Model building
model = DecisionTreeClassifier()
#Training the model
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [98]:
#Testing the model
y_pred = model.predict(X_test)
y_pred


array(['Offensive Speech', 'Offensive Speech', 'Offensive Speech', ...,
       'No Hate and Offensive Speech', 'No Hate and Offensive Speech',
       'Offensive Speech'], dtype=object)

In [99]:
#Accuracy Score of our model
print(accuracy_score(y_test,y_pred))


0.8911847414109304


### The Accuracy Score Tells Us Our Model is 89% Accurate

## Now Let's Run the Model on Our Comments

In [134]:
binClfr = []
numHate = 0
for i in range(len(commentsList)):
    inp = cv.transform([commentsList[i]]).toarray()
    if (model.predict(inp) == ['Offensive Speech']):
        binClfr.append(1) # add one if offensive
        numHate += 1
    elif (model.predict(inp) == ['No Hate and Offensive Speech']):
        binClfr.append(0) # add zero if comment is not hate speech
    else:
        binClfr.append(9) # Add 9 if output it neither (shouldn't happen; means that there's an error)
    print(model.predict(inp))

['Offensive Speech']
['Offensive Speech']
['Offensive Speech']
['Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['Offensive Speech']
['Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['Hate Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No Hate and Offensive Speech']
['No H

In [135]:
print(binClfr)

[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 9, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 9, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 9, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]


In [139]:
print(numHate)
print(len(binClfr))

print("Percentage of hate speech comments: " + str(numHate/len(binClfr)))

65
250
Percentage of hate speech comments: 0.26


#### This Allows Us to Conclude That Over One Quarter of the Comments of the Top 50 Comments on Each Video Contain Hate Speech