# Project to predict the popularity of Political leaders and detect hate speech and tribalism from Facebook and Twitter comments

## Imports

In [14]:
from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup

import pandas as pd

import time

# for tweets collection
import tweepy

# text cleaning
import preprocessor as p

# text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet

#lemmatization
from nltk.stem import WordNetLemmatizer

#sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag

## visualization
import matplotlib.pyplot as plt
%matplotlib inline

# get the language of the text
import langdetect
#for ner recognition
import spacy

from langdetect.lang_detect_exception import LangDetectException

#Hatespeech imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#to data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#train split and fit models
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
  
#model selection
from sklearn.metrics import confusion_matrix, accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# GET FACEBOOK COMMENTS

In [15]:
# scroll down
def scroll():
    pop_out_btn = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "popup_xout")))
    pop_out_btn.click()

In [16]:
# funtion to click the view more button
def view_more_click(driver):
    try: 
        view_more_comments = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "View more comments…")))
        view_more_comments.click()
    
    except TimeoutException:
        view_more_comments = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.LINK_TEXT, "View previous comments…")))
        view_more_comments.click()
        
    except ElementClickInterceptedException:
        scroll()
    
    except (NoSuchElementException,ElementClickInterceptedException, StaleElementReferenceException) as es :
        print(es)
    html = driver.page_source
    return html

# function to click the button 50 times
def view_more_comm(driver):
    i = 0
    while i < 50:
        html = view_more_click(driver)
        time.sleep(3)
        i += 1
    return html

In [17]:
# function to get a list of all the comment elements
def get_comments(html):
    soup = BeautifulSoup(html, 'html.parser')
    comment_section = soup.find("div", {"class":"_59e9 _1-ut _2a_g _34oh" })
    comment_els = comment_section.find_all("div", {"class":"_2a_i"})
    
    master_list = []
    for c in comment_els:
        data_dict = {}
        data_dict["User"] = c.find("div", {"class":"_2b05"}).text
        data_dict["Comment"] = c.find("div", {"data-sigil":"comment-body"}).text
        master_list.append(data_dict)
    return master_list

    
    

# GETTING TWITTER DATA

## Twitter Authentication

In [18]:
# read the keys from the file 
def authorize_twitter():
    keys = []
    with open('../Keys.txt') as f:
        for line in f:
            keys.append(line.strip())

    API_KEY = keys[1]
    API_KEY_SECRET = keys [4]
    ACCESS_TOKEN = keys[10]
    ACCESS_TOKEN_SECRET = keys[13]
    
    # initialize the api
    auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    
    return (api)

## Get the replies from the posts

In [19]:
def get_replies(url):
    name = url.split("/")[-3]
    tweet_id = url.split("/")[-1]

    #empty list for the replies
    replies = []
    
    api = authorize_twitter()
    # get replies from the tweet
    for tweet in tweepy.Cursor(api.search_tweets,q='to:'+name, result_type='recent').items(1000):
        if hasattr(tweet, 'in_reply_to_status_id_str'):
            if (tweet.in_reply_to_status_id_str==tweet_id):
                replies.append(tweet)
    
    #master list to hold all the data needed
    master_list = []
    for reply in replies:
        data_dict = {}
        data_dict["User"] = reply.author.screen_name
        data_dict["Comment"] = reply.text

        master_list.append(data_dict)
        
    return (master_list)

# TEXT PREPROCESSING

## Tokenization and POS tagging + removing stop words

In [20]:
#perform tokenization and pos tagging and remove stop words
pos_dict = {'J': wordnet.ADJ, 'v': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
            
    return newlist

## Lemmatization

In [21]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew


## Language detection

In [22]:
def get_language(text):
    try:
        lang = langdetect.detect(text)
        
        return lang
    except LangDetectException:
        print(text, "is invalid")

## SENTIMENT ANALYSIS USING VADER

In [23]:
def vaderSentimentAnalysis(text):
    analyzer = SentimentIntensityAnalyzer()
    vs= analyzer.polarity_scores(text)
    return vs['compound']

# function to analyse 
def vader_analysis(compound):
    if compound > 0:
        return 'Positive'
    elif compound < 0 :
        return 'Negative'
    else:
        return 'Neutral'

## Get names and parties from the text

In [24]:
def get_parties(text):
    for i in range(len(text)):
        parties=[]
        tags = text[i][1]
        if tags == "POLITICAL_PARTY":
            party = text[i][0]
            if party not in parties:
                parties.append(party)
        else:
            continue
        return parties

In [25]:
def get_names(text):
    names=[]
    for i in range(len(text)):
        tags = text[i][1]
        if tags == "POLITICIAN":
            name = text[i][0]
            if name not in names:
                names.append(name)
        else:
            continue
    return names

# MAIN FILE

In [26]:

choice = input("Please select the choice of your data. \n 1. for facebook. \n 2. For twitter")
if choice == "1":
    # install webdriver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    url = input("Paste the Facebook mobile url: ")
    driver.get(url)

    print("Getting comments please wait...\n")
    html = view_more_comm(driver)

    master_list = get_comments(html)

    print(str(len(master_list)) + " Comments found")
        
else:
    url = input("Paste the tweet url: ")
    print("----Getting replies please wait------")
    master_list = get_replies(url)
    print(str(len(master_list)) + " Comments found") 


Please select the choice of your data. 
 1. for facebook. 
 2. For twitter 1




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [C:\Users\User\.wdm\drivers\chromedriver\win32\99.0.4844.51\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


Paste the Facebook mobile url:  https://mobile.facebook.com/story.php?story_fbid=10160054567584430&id=88201339429&m_entstream_source=timeline


Getting comments please wait...

1470 Comments found


In [27]:
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [28]:
    #create a dataframe and clean the text
df = pd.DataFrame(master_list)
df["clean_tweets"] = df["Comment"].apply(lambda x: clean_text(x))
df

Unnamed: 0,User,Comment,clean_tweets
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,must special helicopt like standard gaug railw...
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...",tell bird ignor post cost narrat
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,media love overblow stuff😅 blade peke chopper ...
3,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,si waliiba condom pale kemsa wauz na watumi hi...
4,Simon Kanari,"If the insurance company is refusing to pay, t...",insur compani refus pay deep state
...,...,...,...
1465,James Adeti,Kindly visit me at grogoni I fix it with less ...,kind visit grogoni fix less million
1466,Jamaa Jama,Weka polythene paper while you look for money .,weka polythen paper look money
1467,Cindy Nyar-Alego Siaya,"Hamna any news ,am tired with this ya helicopt...",hamna news tire ya helicopt whole stori
1468,Pepe Danson Ongondi,How Kalenjingas dare to throw stones to a chop...,kalenjinga dare throw stone chopper exchang t...


## Text Preprocessing

In [29]:
nltk.download('averaged_perceptron_tagger')
df['POS tagged'] = df['clean_tweets'].apply(lambda x: token_stop_pos(x))
df

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,User,Comment,clean_tweets,POS tagged
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,must special helicopt like standard gaug railw...,"[(must, None), (special, a), (helicopt, n), (l..."
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...",tell bird ignor post cost narrat,"[(tell, None), (bird, n), (ignor, n), (post, n..."
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,media love overblow stuff😅 blade peke chopper ...,"[(media, n), (love, None), (overblow, a), (stu..."
3,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,si waliiba condom pale kemsa wauz na watumi hi...,"[(si, n), (waliiba, n), (condom, n), (pale, a)..."
4,Simon Kanari,"If the insurance company is refusing to pay, t...",insur compani refus pay deep state,"[(insur, n), (compani, n), (refus, a), (pay, n..."
...,...,...,...,...
1465,James Adeti,Kindly visit me at grogoni I fix it with less ...,kind visit grogoni fix less million,"[(kind, n), (visit, n), (grogoni, n), (fix, n)..."
1466,Jamaa Jama,Weka polythene paper while you look for money .,weka polythen paper look money,"[(weka, n), (polythen, None), (paper, n), (loo..."
1467,Cindy Nyar-Alego Siaya,"Hamna any news ,am tired with this ya helicopt...",hamna news tire ya helicopt whole stori,"[(hamna, a), (news, n), (tire, n), (ya, n), (h..."
1468,Pepe Danson Ongondi,How Kalenjingas dare to throw stones to a chop...,kalenjinga dare throw stone chopper exchang t...,"[(kalenjinga, n), (dare, None), (throw, None),..."


In [30]:
# Lemmatization
df['Lemma'] = df['POS tagged'].apply(lambda x: lemmatize(x))
df.head()

Unnamed: 0,User,Comment,clean_tweets,POS tagged,Lemma
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,must special helicopt like standard gaug railw...,"[(must, None), (special, a), (helicopt, n), (l...",must special helicopt like standard gaug rai...
1,Fieldmarshal Newton Mwaura,"Tell that to the birds, very ignorant post and...",tell bird ignor post cost narrat,"[(tell, None), (bird, n), (ignor, n), (post, n...",tell bird ignor post cost narrat
2,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,media love overblow stuff😅 blade peke chopper ...,"[(media, n), (love, None), (overblow, a), (stu...",medium love overblow stuff😅 blade peke chopp...
3,Mutaih Vinneous,Si waliiba condoms pale KEMSA. Wauze na watumi...,si waliiba condom pale kemsa wauz na watumi hi...,"[(si, n), (waliiba, n), (condom, n), (pale, a)...",si waliiba condom pale kemsa wauz na watumi ...
4,Simon Kanari,"If the insurance company is refusing to pay, t...",insur compani refus pay deep state,"[(insur, n), (compani, n), (refus, a), (pay, n...",insur compani refus pay deep state


## Get only english text

In [31]:
df['language'] = df['clean_tweets'].apply(lambda x: get_language(x) if x.strip()!="" else "")
df = df[df['language']=='en'].reset_index(drop=True)
df.drop('language', axis=1)
df

😮😮😮😮 is invalid
🤣😂🤣 is invalid
☑️ is invalid
🙌🙌 is invalid
😂😂😂😂😍😍😍 is invalid


Unnamed: 0,User,Comment,clean_tweets,POS tagged,Lemma,language
0,Jibril Iron Shaffi,It must be a special helicopter just like the ...,must special helicopt like standard gaug railw...,"[(must, None), (special, a), (helicopt, n), (l...",must special helicopt like standard gaug rai...,en
1,Khalif Kairo,Media loves overblowing stuff😅 Blades pekee fo...,media love overblow stuff😅 blade peke chopper ...,"[(media, n), (love, None), (overblow, a), (stu...",medium love overblow stuff😅 blade peke chopp...,en
2,Tony Ways,That should not be an issue think of improvin...,issu think improv high standard live economi...,"[(issu, n), (think, None), (improv, r), (high,...",issu think improv high standard live economi...,en
3,Grace Waithira,"Insurance does not pay for Campaign incidents,...",insur pay campaign incid 🔥 stone throw reckle...,"[(insur, a), (pay, n), (campaign, n), (incid, ...",insur pay campaign incid 🔥 stone throw reckl...,en
4,James Koech,The INSURANCE company were refusing to PAY for...,insur compani refus pay replac say fault raila...,"[(insur, n), (compani, n), (refus, a), (pay, n...",insur compani refus pay replac say fault rai...,en
...,...,...,...,...,...,...
408,James D'mosh,You mean the price of Baba's chopper wind wind...,mean price baba chopper wind wind shield wort...,"[(mean, a), (price, n), (baba, n), (chopper, n...",mean price baba chopper wind wind shield wor...,en
409,Paul Gayo,All most the same price with Abdulnasir car wi...,price abdulnasir car windscreen,"[(price, n), (abdulnasir, n), (car, n), (winds...",price abdulnasir car windscreen,en
410,Jamaa Jama,Weka polythene paper while you look for money .,weka polythen paper look money,"[(weka, n), (polythen, None), (paper, n), (loo...",weka polythen paper look money,en
411,Cindy Nyar-Alego Siaya,"Hamna any news ,am tired with this ya helicopt...",hamna news tire ya helicopt whole stori,"[(hamna, a), (news, n), (tire, n), (ya, n), (h...",hamna news tire ya helicopt whole stori,en


## Geta clean dataframe for text analysis

In [32]:
fin_data= pd.DataFrame(df[['Comment', 'Lemma']])

## Sentiment Analysis

In [33]:
fin_data['Vader_Sentiment'] = fin_data['Comment'].apply(lambda x: vaderSentimentAnalysis(x))
fin_data['Vader_Analysis'] = fin_data['Vader_Sentiment'].apply(lambda x: vader_analysis(x))
fin_data[1:50]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis
1,Media loves overblowing stuff😅 Blades pekee fo...,medium love overblow stuff😅 blade peke chopp...,0.7351,Positive
2,That should not be an issue think of improvin...,issu think improv high standard live economi...,0.6083,Positive
3,"Insurance does not pay for Campaign incidents,...",insur pay campaign incid 🔥 stone throw reckl...,-0.6254,Negative
4,The INSURANCE company were refusing to PAY for...,insur compani refus pay replac say fault rai...,-0.656,Negative
5,Most of these insurance companies are fake. T...,insur compani fake even belong fellow chopper,-0.4767,Negative
6,"While we condemn this act of foolishness, plea...",condemn act foolish plea motorist ensur year...,-0.34,Negative
7,"I think they have robbed this country enough, ...",think rob countri enough small money nwei al...,-0.1027,Negative
8,They now feel the pain and they never felt the...,feel pain never felt pain use could useful c...,-0.1526,Negative
9,"Arrest those who threw stones, present them in...",arrest threw stone present court pay cost re...,-0.8271,Negative
10,The way you've put a lot of zeal to highlight ...,way youv put lot zeal highlight stori till s...,0.6705,Positive


In [34]:
vd_counts = fin_data["Vader_Analysis"].value_counts()
vd_counts

Neutral     178
Negative    128
Positive    107
Name: Vader_Analysis, dtype: int64

## Named Entity Recognition

In [35]:
nlp = spacy.load("political_ner_model")
fin_data["tags"] = df["clean_tweets"].apply(lambda x: [(tag.text, tag.label_) for tag in nlp(x).ents])
fin_data[1:10]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis,tags
1,Media loves overblowing stuff😅 Blades pekee fo...,medium love overblow stuff😅 blade peke chopp...,0.7351,Positive,[]
2,That should not be an issue think of improvin...,issu think improv high standard live economi...,0.6083,Positive,[]
3,"Insurance does not pay for Campaign incidents,...",insur pay campaign incid 🔥 stone throw reckl...,-0.6254,Negative,[]
4,The INSURANCE company were refusing to PAY for...,insur compani refus pay replac say fault rai...,-0.656,Negative,[]
5,Most of these insurance companies are fake. T...,insur compani fake even belong fellow chopper,-0.4767,Negative,[]
6,"While we condemn this act of foolishness, plea...",condemn act foolish plea motorist ensur year...,-0.34,Negative,[]
7,"I think they have robbed this country enough, ...",think rob countri enough small money nwei al...,-0.1027,Negative,[]
8,They now feel the pain and they never felt the...,feel pain never felt pain use could useful c...,-0.1526,Negative,[]
9,"Arrest those who threw stones, present them in...",arrest threw stone present court pay cost re...,-0.8271,Negative,[]


## Get names and political parties

In [36]:
fin_data["Parties"] = fin_data["tags"].apply(lambda x: get_parties(x))
fin_data["Politicians"] = fin_data["tags"].apply(lambda x: get_names(x))
fin_data[0:10]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis,tags,Parties,Politicians
0,It must be a special helicopter just like the ...,must special helicopt like standard gaug rai...,0.4588,Positive,[],,[]
1,Media loves overblowing stuff😅 Blades pekee fo...,medium love overblow stuff😅 blade peke chopp...,0.7351,Positive,[],,[]
2,That should not be an issue think of improvin...,issu think improv high standard live economi...,0.6083,Positive,[],,[]
3,"Insurance does not pay for Campaign incidents,...",insur pay campaign incid 🔥 stone throw reckl...,-0.6254,Negative,[],,[]
4,The INSURANCE company were refusing to PAY for...,insur compani refus pay replac say fault rai...,-0.656,Negative,[],,[]
5,Most of these insurance companies are fake. T...,insur compani fake even belong fellow chopper,-0.4767,Negative,[],,[]
6,"While we condemn this act of foolishness, plea...",condemn act foolish plea motorist ensur year...,-0.34,Negative,[],,[]
7,"I think they have robbed this country enough, ...",think rob countri enough small money nwei al...,-0.1027,Negative,[],,[]
8,They now feel the pain and they never felt the...,feel pain never felt pain use could useful c...,-0.1526,Negative,[],,[]
9,"Arrest those who threw stones, present them in...",arrest threw stone present court pay cost re...,-0.8271,Negative,[],,[]


In [37]:
## Get names and political parties

In [38]:
import os
#to get the current working directory
directory = os.getcwd()
print(directory)
#Importing the dataset
dataset = pd.read_csv(directory +'\data_sets\\hate-speech-and-offensive-language-dataset\\labeled_data.csv')
dataset.info()
ke_dataset = pd.read_csv(directory +'\data_sets\\hate-speech-kenya\\HateSpeechKEN.csv')
data_train = pd.read_csv(directory +'\data_sets\\twitter-sentiment-analysis-hatred-speech\\train.csv')

C:\Users\User\python_projects\scripty
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [39]:
dataset.drop(['Unnamed: 0','count','hate_speech','offensive_language','neither'],axis=1,inplace=True)
ke_dataset.drop(['count','hate_speech','offensive_language','neither'],axis=1,inplace=True)
data_train.drop('id', axis=1,inplace=True)

In [40]:
dataset["class"].replace({0: 1}, inplace=True)
ke_dataset["class"].replace({0: 1}, inplace=True)

In [41]:
dataset["class"].replace({2: 0}, inplace=True)
ke_dataset["class"].replace({2: 0}, inplace=True)
dataset.rename(columns ={'class':'label'}, inplace = True)
ke_dataset.rename(columns ={'class':'label'}, inplace = True)

In [42]:
frame=[data_train,dataset,ke_dataset[25500::]]
df = pd.concat(frame)
df.head(5)

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [43]:
df['tweet']=df['tweet'].apply(clean_text)
df.head()
df.shape

(81420, 2)

In [44]:
X = df['tweet']
y = df['label']

In [45]:
#splitting the dataset into Trainign and testing data set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [46]:
count = CountVectorizer(stop_words='english', ngram_range=(1,5),max_features=2000)
x_train_vectorizer=count.fit_transform(X_train)
x_test_vectorizer=count.transform(X_test)
x_train_vectorizer.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
#
tfidf = TfidfTransformer()

x_train_tfidf = tfidf.fit_transform(x_train_vectorizer)

x_train_tfidf.toarray()
x_test_tfidf = tfidf.transform(x_test_vectorizer)

In [48]:
#using Naive Bayes model
classifier_np = MultinomialNB()
classifier_np.fit(x_train_vectorizer, y_train)

MultinomialNB()

In [49]:
#Naive Bayes
y_pred_np = classifier_np.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_np)
print(cm)

[[9346 2162]
 [1049 7798]]


In [50]:
#Accuracy of the naive bayes prediction model.
np_score = accuracy_score(y_test, y_pred_np)

print('Naive Bayes Accuracy: ', str(np_score))

Naive Bayes Accuracy:  0.8422500614099729


In [51]:
def detect_hatespeech(text):
    text=[clean_text(text)]
    text_vectorizer=count.transform(text)
    test_tfidf = tfidf.transform(text_vectorizer)
    prediction=classifier_np.predict(test_tfidf)
    category="none"
    if prediction[0]==0:
        category="not hateful"
    else:
        category="hateful and offensive"
    return category
    

In [60]:
fin_data['Hatespeech Category']=fin_data['Comment'].apply(lambda x:detect_hatespeech(x))
fin_data[350:400]

Unnamed: 0,Comment,Lemma,Vader_Sentiment,Vader_Analysis,tags,Parties,Politicians,Hatespeech Category
350,#lower food price before this.kwan tunakula hiyo,low food price thiskwan tunakula hiyo,-0.296,Negative,[],,[],not hateful
351,Handshake brother atalipa hiyo.,handshak brother atalipa hiyo,0.0,Neutral,[],,[],not hateful
352,Ama wanitafute niwape hiyo Windshield at a ch...,ama wanitafut niwap hiyo windshield cheap pr...,0.7184,Positive,[],,[],not hateful
353,Ya KEMSA inyonywe polepole...AND THAT'S AN OR...,ya kemsa inyonyw polepoleand order,0.0,Neutral,[],,[],hateful and offensive
354,Whatever the cost aende huko,whatev cost aend huko,0.0,Neutral,[],,[],not hateful
355,"That's too exargerrated ,This things hapo rive...",exargerr thing hapo riverroad ikienda sana ni,0.0,Neutral,[],,[],hateful and offensive
356,Plastic? 10m? Shuwally,plastic shuwal,0.0,Neutral,[],,[],not hateful
357,So tufanye nini? Switch your attention to some...,tufany nini switch attent someth el hii imek...,0.0,Neutral,[],,[],hateful and offensive
358,Baba is a chopper himself...I'm sure he flew b...,baba chopper himselfim sure flew back home d...,0.4919,Positive,[],,[],not hateful
359,Mko sure hakuna windscreen ya river road?,mko sure hakuna windscreen ya river road,0.3182,Positive,[],,[],hateful and offensive
