In [57]:
# Reading and preprocessing data
import pandas as pd, numpy as np, re
df = pd.read_csv('D:/USF/Text Analytics/Class Presentations and Python Code Files/Week10-Deep Learning and Miscellanous Topics/Hate_Speech.csv', encoding ='latin1')
df.dtypes
df = df[['tweet', 'label']]
df.shape
np.unique(df['label'], return_counts=True)

(array([0, 1, 2], dtype=int64), array([ 4160, 19145,  1430], dtype=int64))

In [58]:
# pip install contractions
import contractions # Note: contractions is a library for converting words like "I'm" to "I am"
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
stop_words.extend(["&amp;", "&gt;", "&lt;", "RT"])
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

#### Text Pre-Processing

In [59]:
tokenized_tweets = []
for t in df['tweet']:
    t = t.translate(t.maketrans('\n\t\r', '   '))
    t = contractions.fix(t)
    t = re.sub(r'http\S+', ' ', t)              # Drop URLs
    t = re.sub(r'@\S+', ' ', t)                 # Drop Twitter handles
    t = re.sub(r'#\S+', ' ', t)                 # Drop hashtags
    t = re.sub(r'[^a-zA-Z0-9\s]', '', t)        
    t = re.sub(' +', ' ', t)                    # Multiple spaces to single space
    words = t.lower().split()
    words = [w for w in words if len(w)>2 and w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    t = ' '.join(words)
    tokenized_tweets.append(t)

In [60]:
df["tokens"] = tokenized_tweets
df['tokens'].replace('', np.nan, inplace=True)  # Drop rows with no tokens
df.dropna(subset=['tokens'], inplace=True)
df.shape

(24711, 3)

In [61]:
np.unique(df['label'], return_counts=True)

(array([0, 1, 2], dtype=int64), array([ 4152, 19134,  1425], dtype=int64))

In [62]:
#Train and test split with 80:20 ratio
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.25) 
train.shape, test.shape

((18533, 3), (6178, 3))

18533 rows as training data set and 6178 as testing data set.

In [63]:
# Feature engineering
# Define embedding dimensions: max sequence lengths, max number of words
MAX_SEQ_LENGTH = 50        # If tweet seq length > MAX_SEQ_LENGTH, truncate; if less, pad with zeros.
MAX_NB_WORDS = 20000       # If word count is exceeded, take most frequent words
EMBEDDING_DIM = 100        
import tensorflow as tf
 # Convert tweets to features using keras Tokenizer since keras cannot processor words
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
tokenizer.fit_on_texts(train['tokens'])    
train_sequences = tokenizer.texts_to_sequences(train['tokens'])
test_sequences = tokenizer.texts_to_sequences(test['tokens'])

In [35]:
train_sequences

[[15, 6, 1543, 2],
 [703, 1],
 [7, 6, 65, 6685],
 [2993, 1681],
 [87, 1, 13, 4665, 80, 6686, 170],
 [134, 2594, 38, 42, 3, 9, 186, 80],
 [656, 6, 4666, 192],
 [745, 1149, 2595, 511, 2994, 101, 1, 387, 987, 49, 1226],
 [3, 39, 940, 218, 770, 310, 4667],
 [252, 988, 2, 497, 175],
 [6, 31, 88, 1],
 [1, 304, 3, 323, 4668, 6687, 2596, 311, 25],
 [2995, 3650, 295, 114, 812, 1150, 6688, 6689, 248, 771, 1422],
 [229, 173, 20, 77, 57, 202, 1, 388, 42, 223, 430, 9],
 [498, 389, 129, 2597],
 [41, 6690, 2598, 3651, 1318, 150, 404, 48, 13, 29, 3652],
 [219, 2, 17, 304, 538, 410, 582],
 [302, 5, 90, 525, 15, 5, 2996, 187, 138, 2997, 6, 2],
 [58, 1],
 [1423, 3, 4669, 269, 1, 6691, 941, 6692],
 [36, 112, 1042, 2599, 4670],
 [21, 379, 405, 4671, 1319, 1424, 6693, 1682, 71, 942, 3653],
 [2024, 102, 7, 1683, 4672, 115, 1],
 [1227, 346, 124, 564, 1151, 1151, 1684, 23, 564],
 [772, 4673, 2998, 4, 3654, 3655, 6694, 6695, 1685],
 [28, 423, 1, 6696, 355],
 [1152, 850, 1, 850, 87],
 [1686, 1, 4674, 6697],
 [37

The numbers returned above represents which words are present in each tweet

In [64]:
# Build dictionary of words and their index
word_index = tokenizer.word_index 
len(word_index)

# Get top frequent words in train and test data sets
from keras.preprocessing.sequence import pad_sequences
train_features = pad_sequences(train_sequences, maxlen=MAX_SEQ_LENGTH) 
test_features  = pad_sequences(test_sequences, maxlen=MAX_SEQ_LENGTH) 
train_features.shape, test_features.shape

((18533, 50), (6178, 50))

In [37]:
train_features[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   15,    6, 1543,    2])

In [65]:
# Shallow NN: sklearn's MLPclassifier

# Encoding labels for deep learning
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder() 
encoder.fit(train['label'])
train_labels = encoder.transform(train['label'])
test_labels = encoder.transform(test['label'])

Data is ready

In [54]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='adam', alpha=1e-5, learning_rate='adaptive', 
    early_stopping=True, activation = 'relu', hidden_layer_sizes=(512), 
    random_state=42)
#Hidden layer with 512 nodes.
model.fit(train_features, train_labels)
predicted = model.predict(test_features)
predicted

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [42]:
from sklearn.metrics import classification_report, confusion_matrix
print('Confusion Matrix:\n', confusion_matrix(test_labels, predicted.round()))
print(classification_report(test_labels, predicted.round()))

Confusion Matrix:
 [[  50  985   11]
 [  87 4635   48]
 [   8  350    4]]
              precision    recall  f1-score   support

           0       0.34      0.05      0.08      1046
           1       0.78      0.97      0.86      4770
           2       0.06      0.01      0.02       362

    accuracy                           0.76      6178
   macro avg       0.39      0.34      0.32      6178
weighted avg       0.66      0.76      0.68      6178



We will add three layers instead of a single layer

In [43]:
# Deep NN with 3 layers: sklearn's MLPclassifier

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='adam', alpha=1e-5, learning_rate='adaptive', 
    early_stopping=True, activation = 'relu', hidden_layer_sizes=(512, 128, 12), 
    random_state=42)
model.fit(train_features, train_labels)
predicted = model.predict(test_features)
predicted
print(classification_report(test_labels, predicted.round()))
# When you have insufficient data, you choice of classifier won't matter

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1046
           1       0.77      1.00      0.87      4770
           2       0.00      0.00      0.00       362

    accuracy                           0.77      6178
   macro avg       0.26      0.33      0.29      6178
weighted avg       0.60      0.77      0.67      6178



  'precision', 'predicted', average, warn_for)


Deep Neural network peformed worst in this scenario

In [45]:
# Deep NN with tensorflow keras
import keras.utils
# Keras requires categorical label classes
# Hence, labels must be transformed from 0, 1, 2 to three dummy variables
from keras.utils import to_categorical
train_labels = to_categorical(np.asarray(train['label']))
test_labels  = to_categorical(np.asarray(test['label']))
train_labels.shape, test_labels.shape
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Dense
from keras.layers import Dense, Dropout, Flatten

model = Sequential()                                  # Build model
#Adding embedding layers
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
model.add(Dropout(0.5))                               # Input layer
model.add(Flatten()) 


model.add(Dense(512, activation='relu'))              # Hidden layer 1
model.add(Dense(128, activation='relu'))              # Hidden layer 2
model.add(Dense(12, activation='relu'))               # Hidden layer 3

model.add(Dense(3, activation='softmax'))             # Output layer
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

#Three Layers of Neural network --->Input Layer,Hidden Layer and Output Layer
model.fit(train_features, train_labels, batch_size=64, epochs=5,
    validation_data=(test_features, test_labels))    # Train model

predicted = model.predict(test_features)             # Evaluate model
predicted
print(classification_report(test_labels, predicted.round()))


Train on 18533 samples, validate on 6178 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1046
           1       0.93      0.96      0.94      4770
           2       0.61      0.24      0.35       362

   micro avg       0.91      0.90      0.90      6178
   macro avg       0.80      0.69      0.72      6178
weighted avg       0.90      0.90      0.89      6178
 samples avg       0.90      0.90      0.90      6178



  'precision', 'predicted', average, warn_for)


Precision and recall are pretty low for hate speech since we have very few rows in that category

In [46]:

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder() 
encoder.fit(train['label'])
train_labels = encoder.transform(train['label'])
test_labels = encoder.transform(test['label'])

from sklearn.metrics import confusion_matrix, classification_report
  
def run_model(classifier, train_x, train_y, test_x, test_y):
    classifier.fit(train_x, train_y)  
    predict_y = classifier.predict(test_x)
    print('Confusion Matrix:\n', confusion_matrix(test_y, predict_y))
    print('Classification Report:\n', classification_report(test_y, predict_y))
    return 

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
run_model(MultinomialNB(), train_features, train_labels, test_features, test_labels)

Confusion Matrix:
 [[ 222  792   32]
 [ 876 3711  183]
 [  72  278   12]]
Classification Report:
               precision    recall  f1-score   support

           0       0.19      0.21      0.20      1046
           1       0.78      0.78      0.78      4770
           2       0.05      0.03      0.04       362

    accuracy                           0.64      6178
   macro avg       0.34      0.34      0.34      6178
weighted avg       0.63      0.64      0.64      6178



In [47]:
# Logistic Regression & Stochastic Gradient Descent
from sklearn.linear_model import LogisticRegression, SGDClassifier
run_model(LogisticRegression(), train_features, train_labels, test_features, test_labels)
run_model(SGDClassifier(), train_features, train_labels, test_features, test_labels)

  'precision', 'predicted', average, warn_for)


Confusion Matrix:
 [[  12 1034    0]
 [  48 4722    0]
 [   7  355    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.01      0.02      1046
           1       0.77      0.99      0.87      4770
           2       0.00      0.00      0.00       362

    accuracy                           0.77      6178
   macro avg       0.32      0.33      0.30      6178
weighted avg       0.63      0.77      0.67      6178

Confusion Matrix:
 [[  82  956    8]
 [ 384 4349   37]
 [  33  324    5]]
Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.08      0.11      1046
           1       0.77      0.91      0.84      4770
           2       0.10      0.01      0.02       362

    accuracy                           0.72      6178
   macro avg       0.35      0.33      0.32      6178
weighted avg       0.63      0.72      0.67      6178



In [48]:
# Support Vector Machines
from sklearn.svm import LinearSVC
run_model(LinearSVC(), train_features, train_labels, test_features, test_labels)

Confusion Matrix:
 [[ 487  391  168]
 [1661 2550  559]
 [ 117  190   55]]
Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.47      0.29      1046
           1       0.81      0.53      0.65      4770
           2       0.07      0.15      0.10       362

    accuracy                           0.50      6178
   macro avg       0.37      0.38      0.35      6178
weighted avg       0.67      0.50      0.55      6178





In [49]:
# Random Forest (Bagging Model)
from sklearn.ensemble import RandomForestClassifier
run_model(RandomForestClassifier(), train_features, train_labels, test_features, test_labels)



Confusion Matrix:
 [[ 602  430   14]
 [ 245 4490   35]
 [  59  276   27]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.58      0.62      1046
           1       0.86      0.94      0.90      4770
           2       0.36      0.07      0.12       362

    accuracy                           0.83      6178
   macro avg       0.63      0.53      0.55      6178
weighted avg       0.80      0.83      0.81      6178



In [50]:
# XGBoost (Boosting Model)
# pip install xgboost
from xgboost import XGBClassifier
run_model(XGBClassifier(), train_features, train_labels, test_features, test_labels)

Confusion Matrix:
 [[ 753  289    4]
 [ 197 4540   33]
 [  53  272   37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      1046
           1       0.89      0.95      0.92      4770
           2       0.50      0.10      0.17       362

    accuracy                           0.86      6178
   macro avg       0.71      0.59      0.61      6178
weighted avg       0.84      0.86      0.84      6178



In [70]:
from keras.utils import to_categorical
train_labels = to_categorical(np.asarray(train['label']))
test_labels  = to_categorical(np.asarray(test['label']))
# Convolutional Neural Network: ConvID
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D
from keras.layers import BatchNormalization, Flatten, Dense

model = Sequential()                                # Build model
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
model.add(Dropout(0.5))                             # Input layer

#Convulational layers used to reduce number of features
model.add(Conv1D(128, 5, activation='relu'))        # Convolutional layer 1
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Conv1D(128, 5, activation='relu'))        # Convolutional layer 2
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Flatten())                     
model.add(Dense(128, activation='relu'))            # Hidden layer
model.add(Dense(3, activation='softmax'))           # Output layer
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [71]:
model.fit(train_features, train_labels, batch_size=64, epochs=5,
    validation_data=(test_features, test_labels))       # Train model

Train on 18533 samples, validate on 6178 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1ee1d2c22c8>

In [72]:
predicted = model.predict(test_features)
predicted
print(classification_report(test_labels, predicted.round()))

              precision    recall  f1-score   support

           0       0.78      0.63      0.70      1038
           1       0.88      0.96      0.92      4792
           2       0.00      0.00      0.00       348

   micro avg       0.86      0.85      0.86      6178
   macro avg       0.55      0.53      0.54      6178
weighted avg       0.81      0.85      0.83      6178
 samples avg       0.85      0.85      0.85      6178



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [73]:

# Recurrent Neural Network: SimpleRNN

from keras.layers.recurrent import SimpleRNN

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH)) #Sequential layer
model.add(SimpleRNN(2, input_shape=(None,1)))
model.add(Dense(3,activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_features, train_labels, batch_size=16, epochs=5,
          validation_data=(test_features, test_labels))

predicted = model.predict(test_features)
predicted
print(classification_report(test_labels, predicted.round()))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 18533 samples, validate on 6178 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.76      0.72      0.74      1038
           1       0.92      0.94      0.93      4792
           2       0.27      0.05      0.09       348

   micro avg       0.88      0.86      0.87      6178
   macro avg       0.65      0.57      0.59      6178
weighted avg       0.85      0.86      0.85      6178
 samples avg       0.86      0.86      0.86      6178



  'precision', 'predicted', average, warn_for)


In [74]:
# RNN: LSTM

from keras.layers import LSTM

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
model.add(LSTM(output_dim=16, activation='relu', 
               inner_activation='hard_sigmoid',return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(3, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

model.fit(train_features, train_labels, batch_size=16, epochs=3,
          validation_data=(test_features, test_labels))

predicted = model.predict(test_features)
predicted
print(classification_report(test_labels, predicted.round()))

  


Train on 18533 samples, validate on 6178 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      1038
           1       0.91      0.96      0.93      4792
           2       0.40      0.19      0.26       348

   micro avg       0.89      0.88      0.88      6178
   macro avg       0.72      0.63      0.66      6178
weighted avg       0.87      0.88      0.87      6178
 samples avg       0.88      0.88      0.88      6178



  'precision', 'predicted', average, warn_for)


In [75]:
# RNN: Bidirectional LSTM

from keras.layers import Bidirectional, GlobalMaxPool1D

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(3, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

model.fit(train_features, train_labels, batch_size=16, epochs=3,
          validation_data=(test_features, test_labels))

predicted = model.predict(test_features)
predicted
print(classification_report(test_labels, predicted.round()))

Train on 18533 samples, validate on 6178 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1038
           1       0.93      0.96      0.94      4792
           2       0.56      0.20      0.30       348

   micro avg       0.90      0.90      0.90      6178
   macro avg       0.78      0.67      0.69      6178
weighted avg       0.89      0.90      0.89      6178
 samples avg       0.90      0.90      0.90      6178



  'precision', 'predicted', average, warn_for)
