**Dataset**
Labeled dataset collected from Twitter.

**Objective**
Classify tweets to separate those containing hate speech from all other tweets.
0 -> no hate speech
1 -> contains hate speech

**Total Estimated Time = 90 Mins**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import nltk
from collections import Counter
import random
from termcolor import colored
#nltk.download('all')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

from tqdm.auto import tqdm

### Load Dataset

In [None]:
# Read the tweets CSV into a DataFrame and preview the first rows.
# NOTE(review): 'data_cleaned.csv' is also the file name this notebook writes after
# cleaning, and the next cell drops an 'id' column that this file's preview does not
# show - presumably the raw labelled CSV should be loaded here; confirm the path.
# NOTE(review): '/content/...' is an absolute path; TODO confirm it exists outside the
# environment the notebook was authored in.
data = pd.read_csv('/content/data_cleaned.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,label,tweet
0,0,0,when a father is dysfunctional and is so sel...
1,1,0,thanks for lyft credit i cant use cause they...
2,2,0,bihday your majesty
3,3,0,model i love u take with u all the time in u...
4,4,0,factsguide society now motivation


### EDA

Look at the label distribution.
ROC-AUC is a suitable metric for unbalanced data; use macro averaging for the per-class metrics.

In [None]:
# Keep only the two columns the analysis uses. This discards 'id' as well as any
# leftover index columns that were saved into the CSV (e.g. 'Unnamed: 0').
# Selecting by name (instead of dropping 'id') makes the cell idempotent: it can be
# re-run, or run on a file without an 'id' column, without raising a KeyError.
data = data[['label', 'tweet']]
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


- check NaNs

In [None]:
# Summarise columns, dtypes and non-null counts; a non-null count below the row
# count would reveal missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   31962 non-null  int64 
 1   tweet   31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 499.5+ KB


- check duplicates

In [None]:
# Count rows that are exact repeats (same label and tweet) of an earlier row.
data.duplicated().sum()

2432

In [None]:
# Remove the repeated rows, keeping the first occurrence of each.
# The original index labels are kept, so the index is no longer contiguous.
data = data.drop_duplicates()

In [None]:
# Re-check the row count and dtypes after de-duplication.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29530 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   29530 non-null  int64 
 1   tweet   29530 non-null  object
dtypes: int64(1), object(1)
memory usage: 692.1+ KB


- show samples of data texts to find out required preprocessing steps

In [None]:
# Print a handful of raw tweets, looked up by index label, each followed by a
# blank line, to see which cleaning steps the text needs.
for row_label in (0, 1, 2, 3, 4, 50):
    print(data.loc[row_label, 'tweet'], '\n')

 @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run 

@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked 

  bihday your majesty 

#model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦   

 factsguide: society now    #motivation 

#abc2020 getting ready 2 remove the victums frm #pulseclub #prayfororlando   



- check dataset balancing

In [None]:
# Number of tweets per class (0 = no hate speech, 1 = hate speech) to see how
# imbalanced the labels are.
data['label'].value_counts()

0    27517
1     2013
Name: label, dtype: int64

- Cleaning and preprocessing steps:
    - 1 lowercase the text
    - 2 remove @mentions, URLs and #
    - 3 remove punctuation and symbols

### Cleaning and Preprocessing

In [None]:
# Work on a copy so the cleaning steps below never modify `data`.
data_clean = data.copy()

In [None]:
# Normalise case: convert every tweet to lowercase
data_clean['tweet'] = data_clean['tweet'].map(str.lower)

# Spot-check the tweet with index label 51
data_clean.loc[51, 'tweet']

'for her #bihday we got her a #nose #job @user  ð\x9f\x8e\x88ð\x9f\x90¶ð\x9f\x8e\x89ð\x9f\x8e\x82ð\x9f\x8e\x81    #bihday #petunia we love you ð\x9f\x99\x83 '

In [None]:
# Remove URLs, @mentions and the '#' marker of hashtags (the hashtag word is kept).
# The patterns contain no literal spaces, so they also match at the very start or
# end of a tweet (a trailing "@user" or a leading "#tag" used to be missed).
# Every match becomes a single space so that neighbouring words are never glued
# together. URLs are stripped first so an '@' or '#' inside a link cannot split it.
url_pattern = re.compile(r'\b(?:https?|www)\S+')
mention_hash_pattern = re.compile(r'@\S+|#')

data_clean['tweet'] = data_clean['tweet'].apply(lambda w: url_pattern.sub(' ', w))
data_clean['tweet'] = data_clean['tweet'].apply(lambda w: mention_hash_pattern.sub(' ', w))

data_clean['tweet'][51]

'for her bihday we got her a nose job   ð\x9f\x8e\x88ð\x9f\x90¶ð\x9f\x8e\x89ð\x9f\x8e\x82ð\x9f\x8e\x81    bihday petunia we love you ð\x9f\x99\x83 '

In [None]:
# Strip punctuation: delete every character that is not a word character,
# a '+' or whitespace.
punct_pattern = re.compile(r'[^\w+\s]')
data_clean['tweet'] = data_clean['tweet'].map(lambda text: punct_pattern.sub('', text))

# Spot-check the tweet with index label 51
data_clean.loc[51, 'tweet']

'for her bihday we got her a nose job   ððððð    bihday petunia we love you ð '

In [None]:
# Remove symbols: replace every character outside a-z with a space. This also
# drops digits, underscores and the mis-encoded emoji characters.
non_letter_pattern = re.compile(r'[^a-z]')
data_clean['tweet'] = data_clean['tweet'].map(lambda text: non_letter_pattern.sub(' ', text))

# Spot-check the tweet with index label 51
data_clean.loc[51, 'tweet']

'for her bihday we got her a nose job            bihday petunia we love you   '

In [None]:
# Lemmatization (disabled: every line of this cell is commented out)
# NOTE(review): if this is re-enabled, `lemmatizing` as written returns its input
# `data` unchanged - the lemmatized list `tweet` is built but never returned.
# lemmatizer = WordNetLemmatizer()
# w_tokenizer = WhitespaceTokenizer()

# def lemmatizing(data):
#   tweet = [lemmatizer.lemmatize(word) for word in w_tokenizer.tokenize(data)]
#   return data

# #lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

# data_clean['tweet'] = data_clean['tweet'].apply(lambda w : lemmatizing(w))

In [None]:
# Spot-check one fully cleaned tweet (index label 5).
data_clean['tweet'][5]

'   huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo  '

In [None]:
# Persist the cleaned tweets. index=False keeps the DataFrame index out of the
# file; otherwise it is written as an extra unnamed column that comes back as
# 'Unnamed: 0' (and 'Unnamed: 0.1' after a second round trip) when the CSV is read.
data_clean.to_csv('data_cleaned.csv', index=False)


**If it takes 60 Mins till here, you are doing Great** <br>
**If not! You also are doing Great**

### Modelling

In [None]:
# Model input is the cleaned tweet text; the target is the 0/1 hate-speech label
X, Y = data_clean['tweet'], data_clean['label']

In [None]:
# Hold out 20% of the tweets for testing. The split is stratified on the label so
# both parts keep the same class ratio, and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    stratify=Y,
    random_state=42,
)

# Report the size of each part
for caption, subset in (
    ("Size of x_train:", x_train),
    ("Size of y_train:", y_train),
    ("Size of x_test: ", x_test),
    ("Size of y_test: ", y_test),
):
    print(caption, subset.shape)

Size of x_train: (23624,)
Size of y_train: (23624,)
Size of x_test:  (5906,)
Size of y_test:  (5906,)


In [None]:
# Bag-of-words features. The vocabulary is learned from the training split only,
# so no information from the test tweets leaks into the features.
vectorizer = CountVectorizer(stop_words='english')
x_train_v = vectorizer.fit_transform(x_train)
x_test_v = vectorizer.transform(x_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns an array, not a list).
feature_names = vectorizer.get_feature_names_out()
print("Number of features: {}\n".format(len(feature_names)))



Number of features: 33961



In [None]:
# Baseline: logistic regression on the bag-of-words counts
logreg = LogisticRegression()
logreg.fit(x_train_v, y_train)

logreg_predict = logreg.predict(x_test_v)
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but the documented order is used so this call matches the precision and
# classification-report calls and stays correct if the metric is ever swapped.
logreg_acc = accuracy_score(y_test, logreg_predict)

In [None]:
# Report the test-set accuracy as a percentage with two decimals
print(f"Test accuarcy: {logreg_acc * 100:.2f}%")

Test accuarcy: 95.85%


In [None]:
# Precision for the positive class (label 1 = contains hate speech)
logreg_perc = precision_score(y_test, logreg_predict)
print(f"Test percision: {logreg_perc * 100:.2f}%")

Test percision: 86.57%


#### Evaluation

In [None]:
# Score the bag-of-words logistic regression on the held-out tweets
y_pred = logreg.predict(x_test_v)
report = classification_report(y_test, y_pred)
logreg_perc = precision_score(y_test, y_pred)

# Per-class precision/recall/F1, then overall accuracy and positive-class precision
print(report)
print(f"accuracy: {accuracy_score(y_test, y_pred):0.3f}")
print(f"Test percision: {logreg_perc * 100:.2f}%")

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5503
           1       0.87      0.46      0.60       403

    accuracy                           0.96      5906
   macro avg       0.91      0.73      0.79      5906
weighted avg       0.96      0.96      0.95      5906

accuracy: 0.959
Test percision: 86.57%


### Enhancement

- Using different N-grams
- Using different text representation technique

> Try different N_grams

In [None]:
# TF-IDF over unigrams and bigrams, with English stop words removed
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# LinearSVC's solver draws random numbers; seeding it (same seed as the
# train/test split) makes the fitted model and the reported scores reproducible.
clf = LinearSVC(random_state=42)

# Chaining vectorizer and classifier means raw text can be passed to fit/predict
# and the vectorizer is only ever fitted on the training tweets.
pipe_tf = make_pipeline(vec, clf)
pipe_tf.fit(x_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('linearsvc', LinearSVC())])

In [None]:
# Score the TF-IDF + LinearSVC pipeline on the held-out tweets (raw text in)
y_pred = pipe_tf.predict(x_test)
report = classification_report(y_test, y_pred)
# The variable keeps its existing name, but here it holds the pipeline's
# positive-class precision.
logreg_perc = precision_score(y_test, y_pred)

print(report)
print(f"accuracy: {accuracy_score(y_test, y_pred):0.3f}")
print(f"Test percision: {logreg_perc * 100:.2f}%")

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5503
           1       0.90      0.61      0.72       403

    accuracy                           0.97      5906
   macro avg       0.93      0.80      0.85      5906
weighted avg       0.97      0.97      0.97      5906

accuracy: 0.968
Test percision: 89.71%


> Try word embeddings

In [None]:
# Download the spaCy model 'en_core_web_md' (shell command), which is loaded in the next cell.
! python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[K     |████████████████████████████████| 33.5 MB 327 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Load the 'en_core_web_md' spaCy pipeline; its per-document vectors are used
# as features in the cells below.
word_embed = spacy.load('en_core_web_md')

In [None]:
def _embed(texts):
    """Return one 300-column row per text, filled with the spaCy document vector."""
    matrix = np.zeros((len(texts), 300))
    for row, doc in tqdm(enumerate(word_embed.pipe(texts)), total=len(texts)):
        matrix[row, :] = doc.vector
    return matrix


# Replace the bag-of-words matrices with document-embedding features
x_train_v = _embed(x_train)
x_test_v = _embed(x_test)

  0%|          | 0/23624 [00:00<?, ?it/s]

  0%|          | 0/5906 [00:00<?, ?it/s]

In [None]:
# Logistic regression on the document-embedding features; the iteration limit is
# raised to 500 from the library default.
logreg = LogisticRegression(max_iter=500)
logreg.fit(x_train_v, y_train)

logreg_predict = logreg.predict(x_test_v)
# scikit-learn metrics take (y_true, y_pred); both calls now use the same order.
logreg_acc = accuracy_score(y_test, logreg_predict)
logreg_perc = precision_score(y_test, logreg_predict)

In [None]:
# Report accuracy and positive-class precision of the embedding model as percentages
print(f"Test accuarcy: {logreg_acc * 100:.2f}%")
print(f"Test percision: {logreg_perc * 100:.2f}%")

Test accuarcy: 94.15%
Test percision: 68.14%


#### Done!