In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#to data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#train split and fit models
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
  
#model selection
from sklearn.metrics import confusion_matrix, accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
import os
#to get the current working directory
directory = os.getcwd()
#Importing the dataset
dataset = pd.read_csv(directory +'\data_sets\hate-speech-and-offensive-language-dataset\labeled_data.csv')
dataset.info()
ke_dataset = pd.read_csv(directory +'\data_sets\\hate-speech-kenya\\HateSpeechKEN.csv')
data_train = pd.read_csv(directory +'\data_sets\\twitter-sentiment-analysis-hatred-speech\\train.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [3]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [4]:
dataset.shape

(24783, 7)

In [5]:
dataset.head(10)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [6]:
dataset.drop(['Unnamed: 0','count','hate_speech','offensive_language','neither'],axis=1,inplace=True)
ke_dataset.drop(['count','hate_speech','offensive_language','neither'],axis=1,inplace=True)
data_train.drop('id', axis=1,inplace=True)

In [7]:
dataset["class"].replace({0: 1}, inplace=True)
ke_dataset["class"].replace({0: 1}, inplace=True)

In [8]:
dataset['class'].unique()

array([2, 1], dtype=int64)

In [9]:
dataset["class"].replace({2: 0}, inplace=True)
ke_dataset["class"].replace({2: 0}, inplace=True)
dataset.rename(columns ={'class':'label'}, inplace = True)
ke_dataset.rename(columns ={'class':'label'}, inplace = True)

In [10]:
frame=[data_train,dataset,ke_dataset[25500::]]
df = pd.concat(frame)
df.head(5)

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [11]:
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [12]:
df['tweet']=df['tweet'].apply(clean_text)
df.head()
df.shape

(81420, 2)

In [13]:
X = df['tweet']
y = df['label']

In [14]:
#splitting the dataset into Trainign and testing data set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [15]:
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

61065 61065
20355 20355


In [16]:
count = CountVectorizer(stop_words='english', ngram_range=(1,5),max_features=2000)
x_train_vectorizer=count.fit_transform(X_train)
x_test_vectorizer=count.transform(X_test)
x_train_vectorizer.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
#
tfidf = TfidfTransformer()

x_train_tfidf = tfidf.fit_transform(x_train_vectorizer)

x_train_tfidf.toarray()
x_test_tfidf = tfidf.transform(x_test_vectorizer)

In [34]:
#Using the svm model to predict hate speech and offensive language
classifier_svm = svm.SVC()
classifier_svm.fit(x_train_vectorizer, y_train)

SVC()

In [18]:
#using Naive Bayes model
classifier_np = MultinomialNB()
classifier_np.fit(x_train_vectorizer, y_train)

MultinomialNB()

In [19]:
#using Decision tree model
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(x_train_vectorizer, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [20]:
#using KNN model
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(x_train_vectorizer, y_train)

KNeighborsClassifier()

In [21]:
#using logistic regression model
classifier_lr = LogisticRegression(random_state = 0)
classifier_lr.fit(x_train_vectorizer, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [22]:
#using random forest model
classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf.fit(x_train_vectorizer, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [40]:
#Making the Confusion matrix for each model
#SVM
y_pred_sv = classifier_svm.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_sv)
print(cm)

[[10609   899]
 [ 1606  7241]]


In [23]:
#Naive Bayes
y_pred_np = classifier_np.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_np)
print(cm)

[[9346 2162]
 [1049 7798]]


In [24]:
#Decision Tree
y_pred_dt = classifier_dt.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)

[[9839 1669]
 [1675 7172]]


In [25]:
#Linear Regression
y_pred_lr = classifier_lr.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

[[10556   952]
 [ 1706  7141]]


In [26]:
#Random Florest
y_pred_rf = classifier_rf.predict(x_test_vectorizer)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

[[10195  1313]
 [ 1613  7234]]


In [27]:
#comparing the acurracy of the different
# models to pick the best
#sv_score = accuracy_score(y_test, y_pred_sv)
rf_score = accuracy_score(y_test, y_pred_rf)
lr_score = accuracy_score(y_test, y_pred_lr)
dt_score = accuracy_score(y_test, y_pred_dt)
np_score = accuracy_score(y_test, y_pred_np)


#print('SVM: ', str(sv_score))
print('Random Forest Accuracy: ', str(rf_score))
print('Linear Regression Accuracy: ', str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))
print('Naive Bayes Accuracy: ', str(np_score))

Random Forest Accuracy:  0.8562515352493245
Linear Regression Accuracy:  0.8694178334561533
Decision Tree Accuracy:  0.8357160402849423
Naive Bayes Accuracy:  0.8422500614099729


In [28]:
#Import the data we are going to predict
data=pd.read_csv(directory+'\data_files\Tweet_replies.csv')
data.info()
n=len(data)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465 entries, 0 to 464
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User      465 non-null    object
 1   Reply     465 non-null    object
 2   Likes     465 non-null    int64 
 3   Retweets  465 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 14.7+ KB


In [44]:
def detect_hatespeech(text):
    text=[clean_text(text)]
    text_vectorizer=count.transform(text)
    test_tfidf = tfidf.transform(text_vectorizer)
    prediction=classifier_np.predict(test_tfidf)
    category="none"
    if prediction[0]==0:
        category="not hateful"
    else:
        category="hateful and offensive"
    return category
    

In [45]:
text1="unakaa kama ngombe"#Swahili text translating to "you look like a cow"
text2="We shall revange all the death they have caused"
text3="the only way we move forward is together"
text4="yet you still want to participate in his ignorant interpretation of the law..?? @iebckenya #noreformsnoelections"
text5="I wish for ounce if there were no killings"
text6="I wish for more killing"
detect_hatespeech(text1)

'hateful and offensive'

In [41]:
data['Hatespeech Category']=data['Reply'].apply(lambda x:detect_hatespeech(x))
data[0:50]

Unnamed: 0,User,Reply,Likes,Retweets,Hatespeech Category
0,atienooloo1,@StandardKenya 🤣🤣🤣🤣🤣🤣,0,0,not hateful
1,ngarijoseph802,@StandardKenya The 5th we have been loking for...,0,0,hateful and offensive
2,Governorjaymo,@StandardKenya Perfect the president we need t...,0,0,not hateful
3,MNyabende,@StandardKenya Is presidential candidature thi...,0,0,hateful and offensive
4,yours_sincereIy,@StandardKenya Finally we got someone who's se...,0,0,not hateful
5,KiumbeSammy,@StandardKenya Look for another country not k...,0,0,not hateful
6,binraccooon,@StandardKenya https://t.co/9AqYnLtCEs,0,0,not hateful
7,arap_chesulut,@StandardKenya Bhangi ni mbaya......,0,0,hateful and offensive
8,GoodContentHere,@StandardKenya @sain4847 Following the lead of...,1,0,not hateful
9,JosephKimanga3,@StandardKenya Bhang working in your head my f...,0,0,not hateful


In [43]:
data[34:35]

Unnamed: 0,User,Reply,Likes,Retweets,Hatespeech Category
34,Quiplane_1,"@StandardKenya Planting mine already, by the t...",0,0,not hateful
