Importing Dependencies

In [45]:
import sklearn
import numpy as np
import pandas as pd

Import Data

In [46]:
# Read both competition files from the working directory:
# `train` carries the labels, `test` only has the tweets.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

Data Exploration

In [47]:
# Preview the first ten labelled tweets.
train.head(n=10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [48]:
# Preview the last five rows of the unlabelled test set.
test.tail(n=5)

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [49]:
# Number of non-hate tweets (label == 0).
# Use the vectorised Series.sum() instead of the builtin sum(), which iterates the
# Series element by element in Python; int() keeps the displayed value a plain int.
int((train["label"] == 0).sum())

29720

In [50]:
# Number of hate tweets (label == 1).
# Use the vectorised Series.sum() instead of the builtin sum(), which iterates the
# Series element by element in Python; int() keeps the displayed value a plain int.
int((train["label"] == 1).sum())

2242

In [51]:
# Per-column count of missing entries in the training frame.
train.isna().sum()

id       0
label    0
tweet    0
dtype: int64

Data Cleaning

In [52]:
# Install tweet-preprocessor into the kernel's environment; the cleaning cell further down
# uses it (presumably it is what provides the `preprocessor` module imported there).
# NOTE(review): the version is not pinned, so a fresh run may pull a different release —
# consider pinning it for reproducibility.
%pip install tweet-preprocessor

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [53]:
# Regular expressions used to strip special characters from tweets.
import re

# Raw strings are used so that regex escapes such as `\s` or `\[` are not parsed as
# (invalid) Python string escapes, which emit warnings on current interpreters.

# Punctuation that is deleted outright (replaced by the empty string).
# One character class covering exactly the characters the original alternation listed:
# . ; : ! ' ? , " | ( ) [ ] % $ > < { }
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"|()\[\]%$><{}]""")

# Sequences that are replaced by a single space: a doubled <br /> tag, a hyphen,
# a slash, or a colon together with the one character that follows it.
# NOTE(review): the trailing `.` after `(:)` consumes the character after a colon; the
# pattern is kept as-is to preserve behaviour — confirm whether that is intended.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

In [54]:
import preprocessor as p

def clean_tweets(df):
    """Return a list with one cleaned string per tweet in ``df``.

    ``df`` is iterated directly, so despite its name it is any iterable of
    tweet strings (the notebook passes a DataFrame column). Each tweet is
    passed through ``p.clean``, lower-cased, has the ``REPLACE_NO_SPACE``
    matches deleted, and finally has the ``REPLACE_WITH_SPACE`` matches
    replaced by a single space.
    """
    def _scrub(raw_tweet):
        # Order matters: punctuation is deleted before separators become spaces.
        lowered = p.clean(raw_tweet).lower()
        without_punct = REPLACE_NO_SPACE.sub("", lowered)
        return REPLACE_WITH_SPACE.sub(" ", without_punct)

    return [_scrub(raw_tweet) for raw_tweet in df]

In [55]:
# Clean every training tweet and wrap the resulting list in a single-column DataFrame.
train_tweet = pd.DataFrame(clean_tweets(train["tweet"]))

In [56]:
# Attach the cleaned text to the training frame as a new `clean_tweet` column.
# NOTE(review): `train_tweet` is a one-column DataFrame at this point, so pandas presumably
# aligns it to `train` by index label — this relies on both having the same default index.
train["clean_tweet"] = train_tweet
# Preview the first ten rows to compare raw and cleaned tweets side by side.
train.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
5,6,0,[2/2] huge fan fare and big talking before the...,2 2 huge fan fare and big talking before they ...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so


In [57]:
# Apply the same cleaning pass to the test set, store the result in a
# single-column DataFrame, attach it as `clean_tweet`, and preview the last rows.
test_tweet = pd.DataFrame(clean_tweets(test["tweet"]))
test["clean_tweet"] = test_tweet
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


Test & Train Split

In [58]:
from sklearn.model_selection import train_test_split

# Target labels as an array; also used below to stratify the split.
y = train["label"].values

# Hold out 30% of the cleaned tweets for evaluation. The split is shuffled,
# seeded, and stratified on the label so both parts keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    train["clean_tweet"].values,
    y,
    test_size=0.3,
    random_state=1,
    shuffle=True,
    stratify=y,
)

Vectorising Tweets

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
# Toy corpus showing what CountVectorizer produces: one row per document,
# one column per vocabulary term, each cell holding that term's count.
documents = [
    "This is a NLP project designed to recognaise the hate speech data on twitter.",
    "Data science project.",
    "There is test and training data.",
]

vectorizer = CountVectorizer()
document_term_matrix = vectorizer.fit_transform(documents)

# Densify the sparse matrix and label the columns with the learned terms.
term_counts = document_term_matrix.toarray()
vocabulary = vectorizer.get_feature_names_out()
pd.DataFrame(term_counts, columns=vocabulary)

Unnamed: 0,and,data,designed,hate,is,nlp,on,project,recognaise,science,speech,test,the,there,this,to,training,twitter
0,0,1,1,1,1,1,1,1,1,0,1,0,1,0,1,1,0,1
1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0


In [61]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words features (term present / absent) with English stop words removed.
vectorizer = CountVectorizer(binary=True, stop_words='english')

# Learn the vocabulary from the training split ONLY. Fitting on the held-out tweets as
# well would leak information about the evaluation data into the feature space.
x_train_vec = vectorizer.fit_transform(x_train)

# Encode the held-out tweets with the training vocabulary; terms never seen
# during training are simply ignored.
x_test_vec = vectorizer.transform(x_test)

Model Building
Using support vector classifier (SVC)

In [62]:
# Import the estimator class directly so the name `svm` is not first a module and then
# rebound to a model instance within the same cell.
from sklearn.svm import SVC

# Linear-kernel support vector classifier. `probability=True` enables predict_proba;
# the probability calibration involves random shuffling, so `random_state` is fixed
# (same seed as the train/test split) to make `prob` reproducible across runs.
# The variable is still called `svm` so later cells that reference it keep working.
svm = SVC(kernel='linear', probability=True, random_state=1)

# Fit on the vectorised training tweets.
svm.fit(x_train_vec, y_train)

# Class-membership probabilities for the held-out tweets.
prob = svm.predict_proba(x_test_vec)

# Hard class predictions for the held-out tweets.
y_pred_svm = svm.predict(x_test_vec)

Accuracy Calculation Of SVC

In [63]:
from sklearn.metrics import accuracy_score, f1_score

# Overall share of held-out tweets classified correctly.
print("The accuracy of the model is: ", accuracy_score(y_test, y_pred_svm)*100, "%")

# The classes are heavily imbalanced (29720 non-hate vs 2242 hate tweets in the training
# file, i.e. about 93% label 0), so always predicting 0 already scores about 93% accuracy.
# Report the F1 score of the hate class as well to show how well it is actually detected.
print("The F1 score for the hate class (label 1) is: ", f1_score(y_test, y_pred_svm))

The accuracy of the model is:  94.86912086766085 %
