In [1]:
import pandas as pd
import numpy as np
import re
import tqdm

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

In [2]:
dataset=pd.read_csv('data/dataset.csv')

In [3]:
dataset.head()

Unnamed: 0,id,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
stop_words= set(stopwords.words('english'))

In [5]:

def clean_tweet(tweet):
    tweet = re.sub("#", "",tweet) # Removing '#' from hashtags
    tweet = re.sub("[^a-zA-Z#]", " ",tweet) # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub('http','',tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = word_tokenize(tweet)
    return_tweet=[]
    for word in tweet:
        if word not in stop_words:
            return_tweet.append(word)
    return return_tweet

In [6]:
dataset["tweet"]=dataset["tweet"].apply(clean_tweet)

## Word2Vec model to get the word embedings.

In [7]:
model = Word2Vec(dataset["tweet"].values, size=50, window=5, min_count=1, workers=4)

In [8]:
def get_features(tweet):
    features=[]
    for word in tweet:
        features.append(model.wv[word])
    return np.mean(features,0)


In [9]:
dataset["features"]=dataset["tweet"].apply(get_features)


In [10]:
data=[]
for i in dataset["features"].values:
    temp=[]
    for j in i:
        temp.append(j)
    data.append(temp)
data=np.array(data)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, dataset["class"].values, test_size=0.2, random_state=42)

# MODEL

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

## LOGISTIC REGRESSION MODEL

In [20]:
lr_clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
lr_clf.fit(X_train,y_train)
y_pred = lr_clf.predict(X_train)
f = f1_score(y_train, y_pred, average='micro')
print("F1 Score: ", f)
p = precision_score(y_train, y_pred, average='micro')
print("Precision Score: ", p)
r = recall_score(y_train, y_pred, average='micro')
print("Recall Score: ", r)
print("Accuracy: ", lr_clf.score(X_test,y_test))

F1 Score:  0.8434379098153939
Precision Score:  0.8434379098153939
Recall Score:  0.8434379098153939
Accuracy:  0.8396207383498083




## SVM MODEL

In [22]:
svm_clf = svm.SVC(gamma='scale')
svm_clf.fit(X_train,y_train)
y_pred = svm_clf.predict(X_train)
f = f1_score(y_train, y_pred, average='micro')
print("F1 Score: ", f)
p = precision_score(y_train, y_pred, average='micro')
print("Precision Score: ", p)
r = recall_score(y_train, y_pred, average='micro')
print("Recall Score: ", r)
print("Accuracy: ", svm_clf.score(X_test,y_test))

F1 Score:  0.8159991929789165
Precision Score:  0.8159991929789165
Recall Score:  0.8159991929789165
Accuracy:  0.8146056082307848
