In [17]:
## This notebook is just a collection of some baseline sentiment models
# Most code is coming from https://github.com/abdulfatir/twitter-sentiment-analysis
# Dataset is from Kaggle: https://www.kaggle.com/arkhoshghalb/twitter-sentiment-analysis-hatred-speech
# Use small portion of data for quick test, no feature engineering included

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Sklearn
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

# Keras
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences

from xgboost import XGBClassifier

In [71]:
# Use small portion of data for quick test
df = pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv")

train_df = df.iloc[:3000]
valid_df = df.iloc[3000:4000].reset_index(drop=True)

In [72]:
tfidf = TfidfVectorizer()

tfidf_train = tfidf.fit_transform(train_df.tweet.values)
tfidf_valid = tfidf.transform(valid_df.tweet.values)

# SVM

In [73]:
model = LinearSVC(C=0.1)
model.fit(tfidf_train, train_df.label)

y_pred = model.predict(tfidf_valid)
print(classification_report(valid_df.label, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       925
           1       1.00      0.05      0.10        75

    accuracy                           0.93      1000
   macro avg       0.96      0.53      0.53      1000
weighted avg       0.93      0.93      0.90      1000



# Naive Bayes

In [74]:
model = MultinomialNB()
model.partial_fit(tfidf_train, train_df.label, classes=[0, 1])

y_pred = model.predict(tfidf_valid)
print(classification_report(valid_df.label, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       925
           1       1.00      0.07      0.12        75

    accuracy                           0.93      1000
   macro avg       0.96      0.53      0.54      1000
weighted avg       0.93      0.93      0.90      1000



# Random Forest

In [75]:
model = RandomForestClassifier()
model.fit(tfidf_train, train_df.label)

y_pred = model.predict(tfidf_valid)
print(classification_report(valid_df.label, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       925
           1       1.00      0.15      0.26        75

    accuracy                           0.94      1000
   macro avg       0.97      0.57      0.61      1000
weighted avg       0.94      0.94      0.91      1000



# Logistic Regression

In [76]:
model = LogisticRegression()
model.fit(tfidf_train, train_df.label)

y_pred = model.predict(tfidf_valid)
print(classification_report(valid_df.label, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       925
           1       1.00      0.05      0.10        75

    accuracy                           0.93      1000
   macro avg       0.96      0.53      0.53      1000
weighted avg       0.93      0.93      0.90      1000



# Logistic Regression (keras)

In [77]:
def build_model(input_dim):
    model = Sequential()
    model.add(Dense(1, input_dim=input_dim, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = build_model(len(tfidf.vocabulary_))
model.fit(tfidf_train.toarray(), train_df.label, epochs=1)

y_pred = model.predict(tfidf_valid.toarray())
print(classification_report(valid_df.label, y_pred.ravel() > 0.5))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       925
           1       0.00      0.00      0.00        75

    accuracy                           0.93      1000
   macro avg       0.46      0.50      0.48      1000
weighted avg       0.86      0.93      0.89      1000



  _warn_prf(average, modifier, msg_start, len(result))


# XGB

In [78]:
model = XGBClassifier(max_depth=25, silent=False, n_estimators=400)
model.fit(tfidf_train, train_df.label)

y_pred = model.predict(tfidf_valid)
print(classification_report(valid_df.label, y_pred))



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


              precision    recall  f1-score   support

           0       0.94      0.99      0.96       925
           1       0.67      0.16      0.26        75

    accuracy                           0.93      1000
   macro avg       0.80      0.58      0.61      1000
weighted avg       0.92      0.93      0.91      1000



# DenseNet

In [79]:
# def build_model():
#     model = Sequential()
#     model.add(Dense(500, input_dim=VOCAB_SIZE, activation='sigmoid'))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy',
#                   optimizer='adam', metrics=['accuracy'])
#     return model

# CNN

In [80]:
## https://github.com/abdulfatir/twitter-sentiment-analysis/blob/master/code/cnn.py

GLOVE_FILE = "/kaggle/input/glove6b/glove.6B.50d.txt"

def get_glove_vectors(vocab):
    """
    Extracts glove vectors from seed file only for words present in vocab.
    """
    glove_vectors = {}
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
    return glove_vectors

def build_model(vocab_size, kernel_size, dim, embedding_matrix, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size + 1, dim, weights=[embedding_matrix], input_length=max_length))
    model.add(Dropout(0.4))
    model.add(Conv1D(600, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(300, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(150, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(75, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Flatten())
    model.add(Dense(600))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def get_feature_vector(text, vocab):
    """
    Generates a feature vector for each tweet where each word is
    represented by integer index based on rank in vocabulary.
    """
    words = text.split()
    feature_vector = []
    for i in range(len(words) - 1):
        word = words[i]
        if vocab.get(word) is not None:
            feature_vector.append(vocab.get(word))
    if len(words) >= 1:
        if vocab.get(words[-1]) is not None:
            feature_vector.append(vocab.get(words[-1]))
    return feature_vector

def process_texts(texts, vocab):
    vecs = []
    for text in texts:
        feature_vector = get_feature_vector(text, vocab)
        vecs.append(feature_vector)
    return vecs

In [81]:
kernel_size = 3
dim = 50
max_length = 64

vocab = tfidf.vocabulary_
vocab_size = len(vocab)

glove_vectors = get_glove_vectors(vocab)
embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01

for word, i in vocab.items():
    glove_vector = glove_vectors.get(word)
    if glove_vector is not None:
        embedding_matrix[i] = glove_vector

In [82]:
train_vecs = process_texts(train_df.tweet.values, vocab)
valid_vecs = process_texts(valid_df.tweet.values, vocab)

train_vecs = pad_sequences(train_vecs, maxlen=max_length, padding='post')
valid_vecs = pad_sequences(valid_vecs, maxlen=max_length, padding='post')

model = build_model(vocab_size, kernel_size, dim, embedding_matrix, max_length)

# filepath = "./models/4cnn-{epoch:02d}-{loss:0.3f}-{acc:0.3f}-{val_loss:0.3f}-{val_acc:0.3f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)
model.fit(train_vecs, train_df.label, batch_size=128, epochs=8, validation_split=0.1, shuffle=True, callbacks=[reduce_lr])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f90471f1910>

In [83]:
y_pred = model.predict(valid_vecs)
print(classification_report(valid_df.label, y_pred.ravel() > 0.5))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       925
           1       0.38      0.07      0.11        75

    accuracy                           0.92      1000
   macro avg       0.66      0.53      0.54      1000
weighted avg       0.89      0.92      0.90      1000



# LSTM

In [84]:
def build_model(vocab_size, dim, embedding_matrix, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size + 1, dim, weights=[embedding_matrix], input_length=max_length))
    model.add(Dropout(0.4))
    model.add(LSTM(128))
    model.add(Dense(64))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [85]:
model = build_model(vocab_size, dim, embedding_matrix, max_length)

In [86]:
## Val_accuracy does not go up???
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)
model.fit(train_vecs, train_df.label, batch_size=32, epochs=8, validation_split=0.1, shuffle=True, callbacks=[reduce_lr])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f8f4437b8d0>

In [None]:
y_pred = model.predict(valid_vecs)
print(classification_report(valid_df.label, y_pred.ravel() > 0.5))

# BERT's family

In [36]:
!pip install simpletransformers



You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [37]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [38]:
train_df = train_df.rename(columns={'tweet': 'text', 'label': 'labels'})
valid_df = valid_df.rename(columns={'tweet': 'text', 'label': 'labels'})

# DistilBERT

In [68]:
model_args = ClassificationArgs(num_train_epochs=3, overwrite_output_dir=True, train_batch_size=16)
model = ClassificationModel(
    "distilbert", "distilbert-base-uncased", args=model_args
)
model.train_model(train_df)
predictions, raw_outputs = model.predict(valid_df.text.values)
print(classification_report(valid_df.labels, predictions))

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       925
           1       0.67      0.56      0.61        75

    accuracy                           0.95      1000
   macro avg       0.82      0.77      0.79      1000
weighted avg       0.94      0.95      0.94      1000



# Bert

In [55]:
model_args = ClassificationArgs(num_train_epochs=3, overwrite_output_dir=True, train_batch_size=16)
model = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args
)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [56]:
model.train_model(train_df)
predictions, raw_outputs = model.predict(valid_df.text.values)
print(classification_report(valid_df.labels, predictions))

  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       925
           1       0.67      0.51      0.58        75

    accuracy                           0.94      1000
   macro avg       0.81      0.74      0.77      1000
weighted avg       0.94      0.94      0.94      1000



# Roberta-base

In [39]:
model_args = ClassificationArgs(num_train_epochs=3, overwrite_output_dir=True, train_batch_size=16)
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [49]:
# Train the model
model.train_model(train_df)

  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/188 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

(564, 0.14492184107127049)

In [52]:
predictions, raw_outputs = model.predict(valid_df.text.values)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

In [53]:
print(classification_report(valid_df.labels, predictions))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       925
           1       0.77      0.65      0.71        75

    accuracy                           0.96      1000
   macro avg       0.87      0.82      0.84      1000
weighted avg       0.96      0.96      0.96      1000



# XLNET

In [67]:
model_args = ClassificationArgs(num_train_epochs=3, overwrite_output_dir=True, train_batch_size=16)
model = ClassificationModel(
    "xlnet", "xlnet-base-cased", args=model_args
)
model.train_model(train_df)
predictions, raw_outputs = model.predict(valid_df.text.values)
print(classification_report(valid_df.labels, predictions))

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/188 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       925
           1       0.65      0.40      0.50        75

    accuracy                           0.94      1000
   macro avg       0.80      0.69      0.73      1000
weighted avg       0.93      0.94      0.93      1000

