# Hate Speech Detection

In [82]:
import re
import string
import warnings

import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [83]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import seaborn as sns
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [85]:
## Reading the dataset

In [86]:
# Load the labelled tweet dataset from the current working directory.
data = pd.read_csv("Hatespeech.csv")
# Echo the frame for a quick visual check of the columns.
data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [87]:
# Character count of every raw tweet, stored alongside the text.
data['text length'] = data['tweet'].map(len)

In [88]:
# Keep a handle on just the raw tweet text column of the csv data.
tweet = data['tweet']

### Preprocessing of the major 'x' variable

In [89]:
## 1. Removal of punctuation and capitalization
## 2. Tokenizing
## 3. Removal of stopwords
## 4. Stemming

# The stopword corpus is fetched here because this cell is the first to need
# it; without it the lookup below raises LookupError on a fresh environment.
nltk.download('stopwords', quiet=True)

# NOTE: this rebinds the name `stopwords` (imported earlier as the nltk corpus
# reader) to a plain list of English stopwords; preprocess() reads this list.
stopwords = nltk.corpus.stopwords.words("english")

#extending the stopwords to include other words used in twitter such as retweet(rt) etc.
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

# Shared Porter stemmer instance used by preprocess().
stemmer = PorterStemmer()

def preprocess(tweet):
    """Clean, tokenise, filter and stem a Series of raw tweets.

    Steps, in order: collapse whitespace, drop @mentions, drop URLs, replace
    every non-letter character with a space, collapse whitespace again, strip,
    lower-case, split on whitespace, remove the module-level ``stopwords`` and
    stem each remaining token with the module-level ``stemmer``.

    Parameters
    ----------
    tweet : pandas.Series of str
        Raw tweet texts.

    Returns
    -------
    pandas.Series of str
        One space-joined string of stemmed tokens per input tweet, carrying
        the same index as ``tweet``.
    """
    mention_pattern = r'@[\w\-]+'
    # Raw strings keep the backslashes literal without relying on Python
    # passing unknown escape sequences through unchanged.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                   r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    # regex=True is spelled out on every call: the default of Series.str.replace
    # changed to literal matching in pandas 2.0, which would silently turn the
    # patterns below into no-ops.
    cleaned = (
        tweet
        .str.replace(r'\s+', ' ', regex=True)            # collapse whitespace
        .str.replace(mention_pattern, '', regex=True)    # drop @name mentions
        .str.replace(url_pattern, '', regex=True)        # drop links
        .str.replace(r'[^a-zA-Z]', ' ', regex=True)      # punctuation and digits -> space
        .str.replace(r'\s+', ' ', regex=True)            # collapse whitespace again
        .str.strip()                                     # leading / trailing whitespace
        .str.lower()
    )
    # The former "replace numbers with 'numbr'" step was removed: every digit
    # has already been replaced by a space above, so it could never match.

    # Set membership is O(1) per token; the module-level stopword list is O(n).
    stop_set = set(stopwords)

    def _normalise(text):
        # Tokenise on whitespace, drop stopwords, stem, and re-join.
        return ' '.join(
            stemmer.stem(token) for token in text.split() if token not in stop_set
        )

    # apply() keeps the caller's index and also works for an empty Series,
    # unlike a positional `series[i] = ...` loop over range(len(series)).
    return cleaned.apply(_normalise)

# Run the cleaning pipeline over every raw tweet and keep the result next to
# the original text so the two can be compared.
processed_tweets = preprocess(tweet)
data['processed_tweets'] = processed_tweets

# Show the raw column first, then its cleaned counterpart.
for column in ('tweet', 'processed_tweets'):
    print(data[column])

0        !!! RT @mayasolovely: As a woman you shouldn't...
1        !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2        !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3        !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4        !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
                               ...                        
24778    you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779    you've gone and broke the wrong heart baby, an...
24780    young buck wanna eat!!.. dat nigguh like I ain...
24781                youu got wild bitches tellin you lies
24782    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: tweet, Length: 24783, dtype: object
0        woman complain clean hous amp man alway take t...
1        boy dat cold tyga dwn bad cuffin dat hoe st place
2               dawg ever fuck bitch start cri confus shit
3                                         look like tranni
4           shit hear might true might faker bitch told ya
              

### Feature Engineering

In [90]:
### TFIDF

In [91]:
# Unigram TF-IDF over the cleaned tweets: terms appearing in more than 75% of
# the tweets or in fewer than 5 tweets are dropped, and the vocabulary is
# capped at 10000 terms.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split performed later in the notebook, so its vocabulary and IDF weights are
# influenced by the rows that end up in the test set.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.75,
    min_df=5,
    max_features=10000,
)
tfidf = tfidf_vectorizer.fit_transform(data['processed_tweets'])
print(tfidf)

  (0, 3256)	0.23595912062624097
  (0, 3093)	0.2986258937133739
  (0, 82)	0.31566747710288195
  (0, 1911)	0.277055162080436
  (0, 92)	0.25434849363318784
  (0, 1500)	0.3598645693124405
  (0, 590)	0.41202797006381253
  (0, 644)	0.4189537059692329
  (0, 3513)	0.37316943838034783
  (1, 2355)	0.31439885687688546
  (1, 2962)	0.3216411481533663
  (1, 1456)	0.13049776995721835
  (1, 747)	0.4039321354558617
  (1, 208)	0.22271869008556353
  (1, 3315)	0.38111954961355066
  (1, 620)	0.3216411481533663
  (1, 782)	0.5034641143537215
  (1, 378)	0.256368141454617
  (2, 2774)	0.25540207934700226
  (2, 656)	0.4816691891659384
  (2, 721)	0.4052353351174557
  (2, 2977)	0.36255541929118634
  (2, 310)	0.11739394282752598
  (2, 1230)	0.22248822640573884
  (2, 1019)	0.3540533390645287
  :	:
  (24779, 1305)	0.31572476341184663
  (24779, 409)	0.32774262342050303
  (24780, 427)	0.4462495739813302
  (24780, 2154)	0.34611714787005704
  (24780, 846)	0.3400462611693226
  (24780, 1233)	0.29675145045626455
  (24780, 9

In [92]:
# Densify the sparse TF-IDF matrix so it can later be concatenated with the
# dense sentiment and doc2vec feature arrays.
tfidf_a = tfidf.toarray()
tfidf_a.shape

(24783, 3597)

In [93]:
# Sentiment Analysis, using polarity scores as features....

In [94]:
# nltk is already imported near the top of the notebook; repeating it is harmless.
import nltk
# 'punkt' is presumably for word_tokenize, which is imported but not called in
# the cells shown here — TODO confirm it is still needed.
nltk.download('punkt')
# Lexicon required by the VADER SentimentIntensityAnalyzer (imported as `VS`).
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sasha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Sasha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [95]:
# Single shared VADER analyzer instance; sentiment_analysis() reuses it for every tweet.
sentiment_analyzer = VS()

# Method to count the twitter-specific tags (URLs, mentions, hashtags)
def count_tags(tweet_c):
    """Count the URLs, @mentions and #hashtags in a single tweet.

    Whitespace is collapsed first, then URLs, mentions and hashtags are
    replaced by placeholder tokens in that order, so text that belongs to a
    URL is never counted again as a mention or hashtag.

    Parameters
    ----------
    tweet_c : str
        Raw tweet text.

    Returns
    -------
    tuple of int
        ``(url_count, mention_count, hashtag_count)``.
    """
    # Raw strings: '\s', '\w' and '\(' are invalid escape sequences in normal
    # string literals and trigger warnings on recent Python versions.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'

    parsed_text = re.sub(space_pattern, ' ', tweet_c)
    # re.subn reports how many substitutions were made. Using that number,
    # rather than counting placeholder tokens afterwards, stays correct when
    # the tweet already contains a word such as "URLHERE" or when a later
    # substitution swallows an earlier placeholder.
    parsed_text, url_count = re.subn(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text, mention_count = re.subn(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text, hashtag_count = re.subn(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return (url_count, mention_count, hashtag_count)

# Method to capture and analyse sentiment
def sentiment_analysis(tweet):
    """Build the sentiment / tag-count feature list for one tweet.

    Returns a 7-element list: the 'neg', 'pos', 'neu' and 'compound' polarity
    scores from the shared ``sentiment_analyzer``, followed by the three
    counts returned by ``count_tags`` in their original order.
    """
    scores = sentiment_analyzer.polarity_scores(tweet)
    first_count, second_count, third_count = count_tags(tweet)
    return [
        scores['neg'],
        scores['pos'],
        scores['neu'],
        scores['compound'],
        first_count,
        second_count,
        third_count,
    ]

# Method to store the sentimental analysis features
def sentiment_analysis_array(tweets):
    """Stack the per-tweet feature lists of ``sentiment_analysis`` into one array.

    ``tweets`` is any iterable of tweet strings; the result has one row per
    tweet, in iteration order.
    """
    return np.array([sentiment_analysis(text) for text in tweets])

# Features are computed on the raw tweets (`tweet`), not the preprocessed
# text, because count_tags() needs the URLs, @mentions and #hashtags that
# preprocess() strips out.
final_features = sentiment_analysis_array(tweet)
final_features

array([[0.   , 0.12 , 0.88 , ..., 0.   , 1.   , 0.   ],
       [0.237, 0.   , 0.763, ..., 0.   , 1.   , 0.   ],
       [0.538, 0.   , 0.462, ..., 0.   , 2.   , 0.   ],
       ...,
       [0.   , 0.219, 0.781, ..., 0.   , 0.   , 0.   ],
       [0.573, 0.   , 0.427, ..., 0.   , 0.   , 0.   ],
       [0.   , 0.218, 0.782, ..., 1.   , 0.   , 0.   ]])

In [96]:
# Converting the array to Dataframe

# Column labels in the same order as the columns of `final_features`.
feature_names = ['-ve', '+ve', 'Neutral', 'Compound', 'url_tag', 'mention_tag', 'hash_tag']
new_features = pd.DataFrame(
    {name: final_features[:, position] for position, name in enumerate(feature_names)}
)
new_features

Unnamed: 0,-ve,+ve,Neutral,Compound,url_tag,mention_tag,hash_tag
0,0.000,0.120,0.880,0.4563,0.0,1.0,0.0
1,0.237,0.000,0.763,-0.6876,0.0,1.0,0.0
2,0.538,0.000,0.462,-0.9550,0.0,2.0,0.0
3,0.000,0.344,0.656,0.5673,0.0,2.0,0.0
4,0.249,0.081,0.669,-0.7762,0.0,1.0,1.0
...,...,...,...,...,...,...,...
24778,0.000,0.000,1.000,0.0000,0.0,3.0,3.0
24779,0.454,0.000,0.546,-0.8074,0.0,0.0,0.0
24780,0.000,0.219,0.781,0.4738,0.0,0.0,0.0
24781,0.573,0.000,0.427,-0.7717,0.0,0.0,0.0


In [97]:
### Doc2Vec

In [98]:
# Vectorization using Doc2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [99]:
# Each processed tweet becomes a TaggedDocument whose tag is its row position.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data["processed_tweets"].apply(lambda x: x.split(" ")))]

# Usage of Doc2Vec
# A fixed seed together with a single worker thread makes the learned vectors
# repeatable between runs; multi-threaded training reorders updates
# nondeterministically. (Across interpreter launches PYTHONHASHSEED must also
# be fixed.) The seed matches the random_state used for the train/test split.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=1, seed=10)

In [100]:
# Turn every processed tweet into its inferred Doc2Vec vector, then spread the
# vector components over separate, prefixed columns.
token_lists = data["processed_tweets"].apply(lambda text: text.split(" "))
inferred_vectors = token_lists.apply(lambda tokens: model.infer_vector(tokens))
doc2vec_df = inferred_vectors.apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(component) for component in doc2vec_df.columns]
doc2vec_df

Unnamed: 0,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,-0.108189,0.121618,0.075379,-0.126867,0.096271
1,0.211395,-0.001300,0.208813,0.010700,-0.174618
2,0.022844,0.069357,0.035946,0.051808,-0.003198
3,0.042248,0.169579,-0.025842,0.112476,-0.021298
4,0.075771,0.068321,-0.047348,-0.057166,-0.066041
...,...,...,...,...,...
24778,0.314745,0.071614,0.314652,-0.208930,-0.072123
24779,0.000051,-0.033308,0.138106,0.052521,-0.144314
24780,0.009314,0.112151,0.124378,0.172187,-0.337893
24781,-0.036085,0.056860,0.204542,-0.023488,-0.073174


In [101]:
### Combining tf-idf scores, sentiment scores and doc2vec columns

In [102]:
# Column-wise stack: TF-IDF terms first, then the sentiment / tag counts,
# then the doc2vec components.
feature_blocks = [tfidf_a, final_features, doc2vec_df]
modelling_features = np.concatenate(feature_blocks, axis=1)
modelling_features.shape

(24783, 3609)

In [122]:
#Running the models Using TFIDF with additional features from sentiment analysis and doc2vec
### Independent variable

In [104]:
# Wrap the combined feature matrix in a DataFrame; columns get default integer labels.
X = pd.DataFrame(modelling_features)
X.shape

(24783, 3609)

In [105]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3599,3600,3601,3602,3603,3604,3605,3606,3607,3608
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.880,0.4563,0.0,1.0,0.0,-0.108189,0.121618,0.075379,-0.126867,0.096271
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.763,-0.6876,0.0,1.0,0.0,0.211395,-0.001300,0.208813,0.010700,-0.174618
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.462,-0.9550,0.0,2.0,0.0,0.022844,0.069357,0.035946,0.051808,-0.003198
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.656,0.5673,0.0,2.0,0.0,0.042248,0.169579,-0.025842,0.112476,-0.021298
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.669,-0.7762,0.0,1.0,1.0,0.075771,0.068321,-0.047348,-0.057166,-0.066041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.000,0.0000,0.0,3.0,3.0,0.314745,0.071614,0.314652,-0.208930,-0.072123
24779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.546,-0.8074,0.0,0.0,0.0,0.000051,-0.033308,0.138106,0.052521,-0.144314
24780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.781,0.4738,0.0,0.0,0.0,0.009314,0.112151,0.124378,0.172187,-0.337893
24781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427,-0.7717,0.0,0.0,0.0,-0.036085,0.056860,0.204542,-0.023488,-0.073174


In [106]:
### Dependent variable

In [107]:
# Target label per tweet. Judging by the vote columns in the data preview,
# 1 = offensive language and 2 = neither; 0 is presumably hate speech —
# TODO confirm against the dataset documentation.
y = data['class'].astype(int)

In [108]:
### Splitting the dataset

In [109]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [110]:
X_train.shape

(19826, 3609)

In [111]:
X_test.shape

(4957, 3609)

In [112]:
y_train.shape

(19826,)

In [113]:
y_test.shape

(4957,)

In [114]:
### Model training and prediction

In [115]:
# NOTE: this rebinds `model`, which previously held the trained Doc2Vec model.
# Re-running the Doc2Vec inference cell after this one will fail because a
# GaussianNB estimator has no `infer_vector` method.
model =GaussianNB()
# Fit the Gaussian Naive Bayes classifier on the training split.
model.fit(X_train,y_train)

In [116]:
# Predict a class label for every row of the held-out split.
y_pred = model.predict(X_test)
y_pred.shape

(4957,)

In [117]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3599,3600,3601,3602,3603,3604,3605,3606,3607,3608
13665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.455,-0.4215,0.0,0.0,0.0,0.123404,-0.017615,0.016505,-0.149585,-0.105121
6754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.761,-0.7845,0.0,1.0,0.0,0.127012,0.075007,0.125141,-0.100401,0.060806
6417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.604,-0.802,0.0,1.0,0.0,-0.161177,0.043822,0.067374,0.078275,-0.159729
10324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.444,0.8126,0.0,0.0,0.0,0.169851,-0.103933,0.196621,-0.241811,-0.044548
4265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.632,-0.5423,0.0,1.0,0.0,0.040192,0.150314,-0.061168,-0.000833,0.005765


In [118]:
# Per-class precision / recall / F1 of the predictions on the test split.
report = classification_report( y_test, y_pred )
print(report)

              precision    recall  f1-score   support

           0       0.07      0.51      0.12       285
           1       0.84      0.31      0.46      3815
           2       0.38      0.61      0.47       857

    accuracy                           0.38      4957
   macro avg       0.43      0.48      0.35      4957
weighted avg       0.71      0.38      0.44      4957



In [119]:
# Overall accuracy of the predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
# The label is derived from the fitted estimator so it cannot drift from the
# model actually trained (the previous hard-coded text said "Logistic
# Regression" although the classifier is GaussianNB) and it lists all three
# feature groups that make up X.
print(type(model).__name__ + " with TF-IDF, sentiment analysis and Doc2Vec features, Accuracy Score:", acc)

Logistic Regression with sentimenrtal analysis and TFIDF, Accuracy Score: 0.3770425660681864


In [120]:
import pickle

In [121]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): only the classifier is saved here; the fitted TF-IDF vectorizer
# and the Doc2Vec model that produce its input features are not — confirm
# whether they need to be persisted as well for later inference.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)