## Project: Help Twitter Combat Hate Speech Using NLP and Machine Learning

### Submitted: Yanda Sebega
### Date: 
### Module: Natural Language Processing

<hr>

### Libraries

In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from collections import Counter

from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV, StratifiedKFold)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, f1_score, accuracy_score)
from sklearn.feature_extraction.text import TfidfVectorizer 

<hr>

### 1. Data

In [2]:
#1.1 Load the tweets file using read_csv function from Pandas package. 
df = pd.read_csv('TwitterHate.csv')

In [3]:
len(df)

31962

<hr>

### 2.Tweets in list

In [4]:
#2.1 Get the tweets into a list for easy text cleanup and manipulation.
df.tweet.sample().values

array(['good days with good friends ð\x9f\x98\x98 @user #summer    #friendsâ\x80¦ '],
      dtype=object)

In [5]:
tweets = df.tweet.values

<hr>

### 3.Cleanup

In [6]:
#3.1 Normalize the casing.
tweets_normalized = [tweet.lower() for tweet in tweets]

In [7]:
print(tweets_normalized[0:3])

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run', "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked", '  bihday your majesty']


In [8]:
#3.2 Using regular expressions, remove user handles. These begin with '@’. 
pattern = "(@\w+)"
tweets_no_handles = [re.sub(pattern, '', tweet) for tweet in tweets_normalized]

In [9]:
print(tweets_no_handles[0:3])

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run', "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked", '  bihday your majesty']


In [10]:
#3.3 Using regular expressions, remove URLs.
pattern = '(\w+:\/\/\S+)' 
tweets_no_urls = [re.sub(pattern, '', tweet) for tweet in tweets_no_handles]

In [11]:
#3.4 Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.
twtkr = TweetTokenizer()
tweets_tokenized = [twtkr.tokenize(tweet) for tweet in tweets_no_urls]

In [12]:
print(tweets_tokenized[0:2])

[['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run'], ['thanks', 'for', '#lyft', 'credit', 'i', "can't", 'use', 'cause', 'they', "don't", 'offer', 'wheelchair', 'vans', 'in', 'pdx', '.', '#disapointed', '#getthanked']]


In [13]:
#3.5 Remove stop words.
stop_words = stopwords.words('english')
tweets_no_stopwords = [word for tweet in tweets_tokenized for word in tweet if word not in stop_words]

In [14]:
print(tweets_no_stopwords[0:30])

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', '.', '#run', 'thanks', '#lyft', 'credit', "can't", 'use', 'cause', 'offer', 'wheelchair', 'vans', 'pdx', '.', '#disapointed', '#getthanked', 'bihday', 'majesty', '#model', 'love', 'u', 'take', 'u', 'time', 'urð']


In [15]:
#3.6 Remove redundant terms like ‘amp’, ‘rt’, etc.
pattern = ' amp | rt '
tweets_no_ = [re.sub(pattern, '', word) for word in tweets_no_stopwords]

In [16]:
#3.7 Remove ‘#’ symbols from the tweet while retaining the term.
hashtags = [word for word in tweets_no_stopwords if '#' in word]

In [17]:
print(hashtags[0:15])

['#run', '#lyft', '#disapointed', '#getthanked', '#model', '#motivation', '#allshowandnogo', '#school', '#exams', '#hate', '#imagine', '#actorslife', '#revolutionschool', '#girl', '#allin']


In [18]:
hashtags_cleaned = [re.sub('#', '', word) for word in hashtags]

In [19]:
print(hashtags_cleaned[0:20])

['run', 'lyft', 'disapointed', 'getthanked', 'model', 'motivation', 'allshowandnogo', 'school', 'exams', 'hate', 'imagine', 'actorslife', 'revolutionschool', 'girl', 'allin', 'cavs', 'champions', 'cleveland', 'clevelandcavaliers', 'gr8']


<hr>

### 4. Extra cleanup

In [20]:
#Extra cleanup by removing terms with a length of 1.
single_letter = " *\\b[[:alpha:]]{1}\\b *" #CHECK THIS ?????????????
extra_cleaned_tweets =[word for word in hashtags_cleaned if len(word)>1]

In [21]:
print(extra_cleaned_tweets[0:30])

['run', 'lyft', 'disapointed', 'getthanked', 'model', 'motivation', 'allshowandnogo', 'school', 'exams', 'hate', 'imagine', 'actorslife', 'revolutionschool', 'girl', 'allin', 'cavs', 'champions', 'cleveland', 'clevelandcavaliers', 'gr8', 'ireland', 'blog', 'silver', 'gold', 'forex', 'orlando', 'standwithorlando', 'pulseshooting', 'orlandoshooting', 'biggerproblems']


<hr>

### 5. Top terms in tweets
Check out the top terms in the tweets

In [22]:
#5.1 First, get all the tokenized terms into one large list.
large_dataset = ' '.join(extra_cleaned_tweets)

In [23]:
print(large_dataset[0:400])

run lyft disapointed getthanked model motivation allshowandnogo school exams hate imagine actorslife revolutionschool girl allin cavs champions cleveland clevelandcavaliers gr8 ireland blog silver gold forex orlando standwithorlando pulseshooting orlandoshooting biggerproblems selfish heabreaking values love 80days gettingfed cnn michigan tcot australia opkillingbay seashepherd helpcovedolphins th


In [24]:
#5.1 Use the counter and find the 10 most common terms.
hashtag_counts = Counter(extra_cleaned_tweets)

In [25]:
hashtag_counts.most_common(10)

[('love', 1542),
 ('positive', 874),
 ('healthy', 571),
 ('smile', 548),
 ('thankful', 491),
 ('fun', 434),
 ('life', 406),
 ('summer', 367),
 ('model', 365),
 ('affirmation', 363)]

<hr>

### 6. Data formatting

In [26]:
#6.1. Join the tokens back to form strings. This will be required for the vectorizers.
tokens =  ' '.join(hashtag_counts)

In [27]:
#6.2 Assign x and y.
X = df['tweet']
y = df['label']

In [28]:
#6.3 Perform train_test_split using sklearn.
x_train, x_test, y_train, y_test = train_test_split(X, y)

<hr>

### 7. TF-IDF

In [29]:
#7.1 Import TF-IDF  vectorizer from sklearn.
# Already imported above...

In [30]:
#7.2 Instantiate with a maximum of 5000 terms in your vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)

In [31]:
#7.3 Fit and apply on the train set.
X_train = vectorizer.fit_transform(x_train)

In [32]:
#7.4 Apply on the test set.
X_test = vectorizer.transform(x_test)

In [33]:
x_train.shape

(23971,)

<hr>

### 8. Ordinary Least Square

In [34]:
#8.1 Instantiate Logistic Regression from sklearn with default parameters.
logistic_regression = LogisticRegression()

In [35]:
#8.2 Fit into the train data.
logistic_regression.fit(X_train, y_train)

LogisticRegression()

In [36]:
#8.3 Make predictions for the train and the test set.
y_pred = logistic_regression.predict(X_test)


<hr>

### 9. Accuracy, recall, f1 score

In [37]:
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7440
           1       0.93      0.33      0.49       551

    accuracy                           0.95      7991
   macro avg       0.94      0.66      0.73      7991
weighted avg       0.95      0.95      0.94      7991



In [38]:
#9.1 Report the accuracy on the train set.
accuracy_score(y_test, y_pred)

0.9521962207483419

In [39]:
#9.2 Report the recall on the train set: decent, high, or low.


In [40]:
#9.3 Get the f1 score on the train set.
f1_score(y_test, y_pred)

0.4879356568364611

<hr>

### 10. Class imbalance

In [41]:
#10.1 Adjust the appropriate class in the LogisticRegression model.
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

As we can see, the dataset is entirely imbalanced.

In [42]:
lr_balanced = LogisticRegression(class_weight='balanced')

<hr>

### 11. Train again and evaluate

In [43]:
#11.1 Train the model on the train set.
lr_balanced.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [44]:
y_pred2 = lr_balanced.predict(X_test)

In [45]:
#11.2 Evaluate the predictions on the train set: accuracy, recall, and f_1 score.
accuracy_score(y_test, y_pred2)

0.9167813790514329

In [46]:
f1_score(y_test, y_pred2)

0.573444515715202

<hr>

### 12. Regularization and hyperparameter tuning

In [47]:
#12.1 Import GridSearch and StratifiedKFold because of class imbalance.
# Already imported above...

In [48]:
#12.2 Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.
params = {'C':[0.01, 0.1, 1, 10, 100], 'penalty':('l2', 'none'), 'max_iter': [100, 200, 300, 400]}
stkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)

In [49]:
#12.3 Use a balanced class weight while instantiating the logistic regression.
log_reg_balanced = LogisticRegression(class_weight='balanced') #get the value of None

In [50]:
clf = GridSearchCV(log_reg_balanced, param_grid=params, cv = stkfold)

In [51]:
clf.fit(X_train, y_train)

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=40, shuffle=True),
             estimator=LogisticRegression(class_weight='balanced'),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'max_iter': [100, 200, 300, 400],
                         'penalty': ('l2', 'none')})

In [52]:
clf.best_params_

{'C': 0.01, 'max_iter': 100, 'penalty': 'none'}

In [53]:
clf.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', penalty='none')

<hr>

### 13. Find the parameters with the best recall in cross validation

In [54]:
#13.1 Choose ‘recall’ as the metric for scoring.
sc = 'recall'

In [55]:
#13.2 Choose stratified 4 fold cross validation scheme.
str_fold = 4

In [56]:
#13.3 Fit into  the train set.
model = cross_val_score(log_reg_balanced, X_train, y_train, scoring=sc, cv=str_fold)

In [57]:
print(model)

[0.78250591 0.80378251 0.80851064 0.81753555]


<hr>

### 14. What are the best parameters?

In [58]:
best_param = model.mean()

In [59]:
best_param

0.8030836498493048

<hr>

### 15 Predict and evaluate using the best estimator

In [60]:
#15.1 Use the best estimator from the grid search to make predictions on the test set.
# best estimator grid: LogisticRegression(C=0.01, class_weight='balanced', penalty='none')
best_estimator = LogisticRegression(C=0.01, class_weight='balanced', penalty='none')
best_estimator.fit(X_train, y_train)
y_pred = best_estimator.predict(X_test)

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [61]:
#15.2 What is the recall on the test set for the toxic comments?
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7440
           1       0.57      0.64      0.60       551

    accuracy                           0.94      7991
   macro avg       0.77      0.80      0.79      7991
weighted avg       0.95      0.94      0.94      7991



In [62]:
# recall on the test for toxic comments
print('The recall on toxic comments is 64%')

The recall on toxic comments is 64%


In [63]:
#15.3 What is the f_1 score?
print('The recall on toxic comments is 60%')

The recall on toxic comments is 60%
