In [1]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn. model_selection import train_test_split
from sklearn. tree import DecisionTreeClassifier

In [3]:
def get_top_n_words(corpus, n=None):
    '''
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    Args:
        corpus (list): a list of text documents.
        n (int): number of top words to return.
    '''
    assert isinstance(corpus, list), "This must be a list!"
    assert isinstance(n, int), "This must be an integer!"

    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(corpus)
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[1]
    df_tfidfvectorizer = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])

    commentsTF_IDF = df_tfidfvectorizer.sort_values(by=["tfidf"],ascending=False)
    return commentsTF_IDF.head(n)



In [4]:

comments = pd.read_csv('data.csv', encoding='utf-8')
df = pd.DataFrame(comments)
df.drop(['Number'], axis=1, inplace=True) # Drop the Number column (cleaning up the data)
vid1 = vid2 = vid3 = vid4 = vid5 = df


## Let's Find the Top 15 Words in Each Video

### Video 1: "Women Should Not Be in Combat Roles: Change My Mind"

In [5]:
vid1 = vid1[vid1.Video == 1]
vid1List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid1List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 2: "The Problem With Modern Women"

In [6]:
vid2 = df[df.Video == 2]
vid2List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid2List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 3: "Tucker Carlson Gives CNN Some Tips About Sexism in Hilarious Segment"

In [7]:
vid3 = df[df.Video == 3]
vid3List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid3List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 4: "WOMAN DEFENDS ANDREW TATE AND ARGUES WITH FEMINISTS AND TRANGENDERS"

In [8]:
vid4 = df[df.Video == 4]
vid4List = vid1["Comment"].values.tolist()
print(get_top_n_words(vid4List, 15))

             tfidf
more      0.270853
be        0.241720
injured   0.230368
affected  0.230368
would     0.214345
those     0.208332
their     0.206575
woman     0.206575
in        0.190632
being     0.180569
even      0.180569
by        0.180569
than      0.155024
it        0.150067
men       0.124610




### Video 5: "Massive Feminist March Against Gender Violence in Rome"

In [9]:
vid5 = df[df.Video == 5]
vid5List = vid5["Comment"].values.tolist()
print(get_top_n_words(vid5List, 15))

               tfidf
their        0.37430
truckers     0.20114
leaders      0.20114
freezing     0.20114
efforts      0.20114
govt         0.20114
ottawa       0.20114
bank         0.20114
canada       0.20114
least        0.20114
arresting    0.20114
fundraising  0.20114
isn          0.20114
accounts     0.20114
associated   0.20114




### Top 15 Words Overall:

In [10]:
df.drop(['Video'], axis=1, inplace=True) # Drop the video column (cleaning up the data)
commentsList = df["Comment"].values.tolist()
print(get_top_n_words(commentsList, 15))

             tfidf
be        0.245173
injured   0.243024
affected  0.243024
more      0.234565
would     0.223824
woman     0.211882
in        0.198536
their     0.189318
even      0.185259
those     0.185259
being     0.175961
than      0.161941
by        0.151469
combat    0.149216
it        0.145994




## Now Let's Detect Hate Speech

In [11]:
import nltk
import re
nltk. download('stopwords')
from nltk. corpus import stopwords
stopword=set(stopwords.words('english'))
stemmer = nltk. SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ymorsi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
