### Set Up

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [2]:
ROOT_DIR = "/content/drive/MyDrive/content/drive"

In [None]:
!pip install transformers

In [None]:
!pip install tweet-preprocessor

In [None]:
!pip install ekphrasis

In [None]:
!pip install imbalanced-learn

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
nltk.download('omw-1.4')

In [None]:
!pip install sentence-transformers

In [None]:
!pip install tokenizers

### Loading Data

In [None]:
import pandas as pd
import os
from os.path import join
col_list = ["text", "label"]

def load_dataframe(folder, name, separator, encod):
    """Read one delimited dataset file into a DataFrame.

    The file is looked up at ``ROOT_DIR/datasets/<folder>/<name>`` and only
    the columns named in the module-level ``col_list`` are loaded.

    Args:
        folder: Sub-directory of ``datasets`` that holds the file.
        name: File name, including its extension.
        separator: Column delimiter, forwarded to ``pandas.read_csv`` as ``sep``.
        encod: Text encoding, forwarded to ``pandas.read_csv`` as ``encoding``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    csv_location = join(ROOT_DIR, 'datasets', folder, name)
    return pd.read_csv(
        csv_location,
        sep=separator,
        encoding=encod,
        engine='python',
        usecols=col_list,
    )

In [None]:
data_train = load_dataframe("data1", "train.csv", "¦", "utf-8")
data_test = load_dataframe("data1", "test.csv", "¦", "utf-8")
data_validation = load_dataframe("data1", "validation.csv", "¦", "utf-8")

### Save

In [None]:
import pickle
def save_model(model, name):
  """Pickle `model` to ``ROOT_DIR/models/<name>.pickle``.

  Args:
    model: Any picklable object (the notebook passes fitted estimators).
    name: File stem; ``.pickle`` is appended.
  """
  path = ROOT_DIR + '/models/' + name + ".pickle"
  # Use a context manager so the file is flushed and closed even if
  # pickling raises part-way through.
  with open(path, 'wb') as model_file:
    pickle.dump(model, model_file)

### Preprocessing

In [None]:
from ekphrasis.classes.segmenter import Segmenter
seg_tw = Segmenter(corpus="twitter")

In [None]:
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string
from string import punctuation, digits
import preprocessor as p
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet


tokenizer1 = TweetTokenizer()



stop = set(stopwords.words('english'))
punct = list(string.punctuation)
stop.update(punct)

def preprocess_tweets(text):
    """Return `text` after passing it through tweet-preprocessor's ``clean``."""
    cleaned = p.clean(text)
    return cleaned


def remove_stopwords(text):
    """Drop every whitespace-separated token that is in the module-level `stop` set.

    The comparison is exact (case-sensitive); the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in stop]
    return " ".join(kept_tokens)


def delete_digits(text):
    """Return `text` with every character of ``string.digits`` removed."""
    # Mapping a character to None in a translation table deletes it.
    digit_remover = str.maketrans(dict.fromkeys(digits))
    return text.translate(digit_remover)


def delete_punctuation(text):
    """Replace each character of ``string.punctuation`` in `text` with one space.

    The length of the string is preserved; punctuation is blanked out rather
    than removed.
    """
    blank_out = str.maketrans({mark: ' ' for mark in string.punctuation})
    return text.translate(blank_out)


def denoise_text(text):
    """Normalise one tweet.

    Steps, in order: tweet-preprocessor cleaning, lower-casing, punctuation
    replaced by spaces, digits removed.
    """
    lowered = preprocess_tweets(text).lower()
    return delete_digits(delete_punctuation(lowered))

def get_pos(word):
    """Map a single word to a WordNet part-of-speech constant.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    tagged_word = nltk.pos_tag([word])[0]
    initial = tagged_word[1][0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN


def lemm(text):
    """Lemmatise an iterable of tokens and return them joined by spaces.

    Each token is lemmatised with the part of speech reported by `get_pos`.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, get_pos(token)))
    return " ".join(lemmas)



def get_hashtags(text):
    """Split hashtags in `text` into their component words.

    The text is tokenised with the module-level `tokenizer1`; tokens that
    start with ``#`` are passed to `seg_tw.segment` and the resulting words
    are inserted in place of the hashtag. All other tokens are kept as they
    are. The tokens are returned joined by single spaces.
    """
    pieces = []
    for token in tokenizer1.tokenize(text):
        if token.startswith("#"):
            pieces.extend(seg_tw.segment(token).split())
        else:
            pieces.append(token)
    return " ".join(pieces)

def get_vocab(text):
    """Run the full preprocessing pipeline over a Series of raw texts.

    Pipeline per element: hashtag segmentation, denoising, stop-word removal,
    word tokenisation, lemmatisation.

    Args:
        text: pandas Series of strings (``.apply`` is called on it).

    Returns:
        A list with one preprocessed string per input element.
    """
    # The original made an unused ``text.copy()``; ``apply`` already returns
    # a new Series, so the input is not modified.
    vocab = (
        text.apply(get_hashtags)
        .apply(denoise_text)
        .apply(remove_stopwords)
        .apply(word_tokenize)
        .apply(lemm)
    )
    return vocab.tolist()

def get_vocab_bert(text):
    """Run the lighter preprocessing pipeline used for the transformer models.

    Pipeline per element: hashtag segmentation, denoising, stop-word removal.
    Unlike `get_vocab`, there is no tokenisation or lemmatisation step.

    Args:
        text: pandas Series of strings (``.apply`` is called on it).

    Returns:
        A list with one preprocessed string per input element.
    """
    # The original made an unused ``text.copy()``; ``apply`` already returns
    # a new Series, so the input is not modified.
    vocab = text.apply(get_hashtags).apply(denoise_text).apply(remove_stopwords)
    return vocab.tolist()


### F1-metric

In [None]:
import keras.backend as K
import tensorflow as tf
def f1_metric(y_true, y_pred):
    """F1 score computed on rounded predictions, usable as a Keras metric.

    Predictions are rounded with ``K.round`` first; true positives, false
    positives and false negatives are then summed over axis 0.

    Args:
        y_true: Ground-truth labels tensor.
        y_pred: Predicted scores tensor.

    Returns:
        Mean of the F1 values, with NaN entries replaced by 0.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)
    # True negatives are not part of F1, so they are no longer computed.

    # Named precision/recall rather than p/r: `p` is also the module-level
    # alias of the tweet preprocessor.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    f1 = 2*precision*recall / (precision+recall+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

def f1_loss(y_true, y_pred):
    """Loss equal to ``1 - F1`` computed on the raw (un-rounded) predictions.

    Unlike `f1_metric`, `y_pred` is not rounded, so the confusion counts are
    products of labels and predicted scores summed over axis 0.

    Args:
        y_true: Ground-truth labels tensor.
        y_pred: Predicted scores tensor.

    Returns:
        ``1 - mean(F1)``, with NaN F1 entries replaced by 0 beforehand.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)
    # True negatives are not part of F1, so they are no longer computed.

    # Named precision/recall rather than p/r: `p` is also the module-level
    # alias of the tweet preprocessor.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    f1 = 2*precision*recall / (precision+recall+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

### Threshold

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import roc_curve
import numpy as np
def roc_curve_threshold(predicted_prob, y_labels):
  """Choose a decision threshold from the ROC curve and plot the curve.

  The chosen threshold is the one that maximises Youden's J statistic
  (``tpr - fpr``). It is printed, marked on the plot, and returned.

  Args:
    predicted_prob: Predicted scores for the positive class.
    y_labels: True binary labels.

  Returns:
    The threshold at the point of maximum ``tpr - fpr``.
  """
  fpr, tpr, thresholds = roc_curve(y_labels, predicted_prob)
  best = np.argmax(tpr - fpr)
  best_threshold = thresholds[best]
  print('Best Threshold=%f' % (best_threshold))

  # Diagonal reference line, the ROC curve itself, then the selected point.
  plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
  plt.plot(fpr, tpr, marker='.', label='Logistic')
  plt.scatter(fpr[best], tpr[best], marker='o', color='black', label='Best')
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.show()

  return best_threshold

In [None]:
from sklearn.metrics import precision_recall_curve
def pr_curve_threshold(yhat, testy):
  """Choose the decision threshold that maximises F1 on a precision-recall curve.

  The threshold and its F-score are printed, the curve is plotted with the
  selected point marked, and the threshold is returned.

  Args:
    yhat: Predicted scores for the positive class.
    testy: True binary labels; must support ``testy[testy==1]`` (e.g. a
      pandas Series or NumPy array).

  Returns:
    The threshold with the highest F-score.
  """
  precision, recall, thresholds = precision_recall_curve(testy, yhat)
  # Points where precision + recall == 0 would give 0/0 = NaN, and
  # np.argmax treats NaN as the maximum. Score those points as 0 instead.
  denominator = precision + recall
  fscore = np.divide(2 * precision * recall, denominator,
                     out=np.zeros_like(denominator, dtype=float),
                     where=denominator > 0)
  # precision/recall have one more entry than thresholds (the final
  # precision=1, recall=0 point has no threshold), so restrict the search to
  # indices that are valid for `thresholds`.
  ix = int(np.argmax(fscore[:len(thresholds)]))
  print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
  # plot the precision-recall curve for the model; the "no skill" baseline is
  # the share of positive labels
  no_skill = len(testy[testy==1]) / len(testy)
  plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
  plt.plot(recall, precision, marker='.', label='Logistic')
  plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
  # axis labels
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.legend()
  # show the plot
  plt.show()
  return thresholds[ix]

In [None]:
def threshold(value, opt_threshold):
  """Return 1 if `value` is strictly greater than `opt_threshold`, else 0."""
  return 1 if value>opt_threshold else 0

### Evaluate

In [None]:
def evaluate_loaded_model(test_val, valid_val, name):
  """Load a pickled classifier and report test metrics at a tuned threshold.

  The threshold is chosen with `pr_curve_threshold` on the validation
  features, then applied to the positive-class probabilities of the test
  features. Labels come from the module-level `data_validation` and
  `data_test` frames.

  Args:
    test_val: Test feature matrix accepted by the model's ``predict_proba``.
    valid_val: Validation feature matrix, same feature space as `test_val`.
    name: File stem of the model under ``ROOT_DIR/models`` (no extension).
  """
  # NOTE: pickle.load executes arbitrary code from the file; only load models
  # written by this notebook. The context manager closes the handle, which
  # the original left open.
  with open(ROOT_DIR+'/models/'+name+'.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

  valid_scores = loaded_model.predict_proba(valid_val)[:, 1]
  opt_threshold = pr_curve_threshold(valid_scores, data_validation["label"])

  test_scores = loaded_model.predict_proba(test_val)[:, 1]
  predicted = [threshold(pred, opt_threshold) for pred in test_scores]
  print(metrics.classification_report(data_test["label"], predicted, digits=4))

In [None]:
def evaluate_model(test_val, valid_val, model):
  """Report test metrics for an in-memory model at a tuned threshold.

  The threshold is chosen with `pr_curve_threshold` from ``model.predict`` on
  the validation features and then applied to ``model.predict`` on the test
  features. Labels come from the module-level `data_validation` and
  `data_test` frames.
  """
  test_scores = model.predict(test_val)
  valid_scores = model.predict(valid_val)
  opt_threshold = pr_curve_threshold(valid_scores, data_validation["label"])
  predicted = []
  for score in test_scores:
    predicted.append(threshold(score, opt_threshold))
  print(metrics.classification_report(data_test["label"], predicted, digits=4))

### Sampling

##### Random

In [None]:
def random_over_sampling(data, random_state=None):
    """Balance `data` by re-sampling label-1 rows with replacement.

    All label-0 rows are kept; label-1 rows are drawn with replacement until
    there are as many of them as label-0 rows.

    Args:
        data: DataFrame with a ``label`` column holding 0/1.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the original
            unseeded behaviour.

    Returns:
        A new DataFrame: label-0 rows followed by the sampled label-1 rows.
    """
    majority = data[data['label'] == 0]
    minority = data[data['label'] == 1]
    resampled_minority = minority.sample(len(majority), replace=True,
                                         random_state=random_state)
    return pd.concat([majority, resampled_minority])

def random_under_sampling(data, random_state=None):
    """Balance `data` by sub-sampling label-0 rows without replacement.

    All label-1 rows are kept; as many label-0 rows as there are label-1 rows
    are drawn without replacement.

    Args:
        data: DataFrame with a ``label`` column holding 0/1.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the original
            unseeded behaviour.

    Returns:
        A new DataFrame: the sampled label-0 rows followed by all label-1 rows.
    """
    majority = data[data['label'] == 0]
    minority = data[data['label'] == 1]
    sampled_majority = majority.sample(len(minority), random_state=random_state)
    return pd.concat([sampled_majority, minority])

In [None]:
under_random_train = random_under_sampling(data_train)

In [None]:
under_random_train_text = get_vocab(under_random_train["text"])

In [None]:
under_random_train_text_bert = get_vocab_bert(under_random_train["text"])

In [None]:
over_random_train = random_over_sampling(data_train)

In [None]:
over_random_train_text = get_vocab(over_random_train["text"])

In [None]:
# Use the BERT pipeline (no tokenisation/lemmatisation), matching how
# under_random_train_text_bert is built above.
over_random_train_text_bert = get_vocab_bert(over_random_train["text"])

##### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
def smote_oversampling(data_text, data_label):
  """Over-sample with SMOTE using its default settings.

  Args:
    data_text: Feature matrix.
    data_label: Labels aligned with the rows of `data_text`.

  Returns:
    ``(X, y)`` as returned by ``SMOTE().fit_resample``.
  """
  resampled_features, resampled_labels = SMOTE().fit_resample(data_text, data_label)
  return resampled_features, resampled_labels

##### SMOTE + RandomUnderBalancing

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
def smote_over_undersampling(data_text, data_label):
  """Resample with SMOTE followed by random under-sampling in one pipeline.

  Args:
    data_text: Feature matrix.
    data_label: Labels aligned with the rows of `data_text`.

  Returns:
    ``(X, y)`` as returned by the pipeline's ``fit_resample``.
  """
  # NOTE(review): both steps use sampling_strategy=0.5. If SMOTE already
  # brings the class ratio to 0.5, the under-sampler may have nothing left to
  # remove - confirm this is the intended configuration.
  pipeline = Pipeline(steps=[
      ('o', SMOTE(sampling_strategy=0.5)),
      ('u', RandomUnderSampler(sampling_strategy=0.5)),
  ])
  resampled_features, resampled_labels = pipeline.fit_resample(data_text, data_label)
  return resampled_features, resampled_labels

### Tf.idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def tf_idf_vectorizer(corpus):
    """Fit a default ``TfidfVectorizer`` on `corpus`.

    Returns:
        ``(X, vectorizer)``: the transformed corpus and the fitted vectorizer,
        so the same vocabulary can be applied to other splits.
    """
    fitted = TfidfVectorizer()
    features = fitted.fit_transform(corpus)
    return features, fitted

### Count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def count_vectorizer(corpus):
    """Fit a ``CountVectorizer`` (English stop words) on `corpus`.

    Returns:
        ``(X, vectorizer)``: the transformed corpus and the fitted vectorizer,
        so the same vocabulary can be applied to other splits.
    """
    fitted = CountVectorizer(stop_words='english')
    features = fitted.fit_transform(corpus)
    return features, fitted

### Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

In [None]:
def naive_Bayes_Classif(x_train, x_test, y_train, y_test, name):
  """Train a Bernoulli naive Bayes classifier, print its test report, save it.

  Args:
    x_train: Training feature matrix.
    x_test: Test feature matrix.
    y_train: Training labels.
    y_test: Test labels.
    name: File stem passed to `save_model`.

  Returns:
    The fitted ``BernoulliNB`` instance.
  """
  classifier = BernoulliNB()
  classifier.fit(x_train, y_train)

  test_predictions = classifier.predict(x_test)
  report = metrics.classification_report(y_test, test_predictions, digits=4)
  print(report)

  save_model(classifier, name)
  return classifier


In [None]:
test_text = get_vocab(data_test["text"])
valid_text = get_vocab(data_validation["text"])
train_text = get_vocab(data_train["text"])

#### Count Vectorizer

##### No sampling

In [None]:
cv_train, cv = count_vectorizer(train_text)
cv_test =  cv.transform(test_text)
cv_valid =  cv.transform(valid_text)

In [None]:
naive_Bayes_Classif(cv_train, cv_test, data_train["label"], data_test["label"], "bayes_naiv")

In [None]:
evaluate_loaded_model(cv_test, cv_valid, 'bayes_naiv')

##### SMOTE

In [None]:
from collections import Counter

In [None]:
over_smote_train_text, over_smote_train_label = smote_oversampling(cv_train,data_train["label"])

In [None]:
naive_Bayes_Classif(over_smote_train_text, cv_test, over_smote_train_label, data_test["label"], "naive_bayes_smote")

In [None]:
over_smote_under_train_text, over_smote_under_train_label = smote_over_undersampling(cv_train,data_train["label"])
print(Counter(over_smote_under_train_label))

In [None]:
naive_Bayes_Classif(over_smote_under_train_text, cv_test, over_smote_under_train_label, data_test["label"], "naive_bayes_smote_under")

##### Random Sampling

In [None]:
cv_train, cv = count_vectorizer(under_random_train_text)
cv_test =  cv.transform(test_text)
naive_Bayes_Classif(cv_train, cv_test, under_random_train["label"], data_test["label"], "naive_bayes_random_under")

In [None]:
cv_train, cv = count_vectorizer(over_random_train_text)
cv_test =  cv.transform(test_text)
naive_Bayes_Classif(cv_train, cv_test, over_random_train["label"], data_test["label"], "naive_bayes_random_over" )

#### Tf.idf

##### No Sampling

In [None]:
tf_train, tf = tf_idf_vectorizer(train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)

In [None]:
naive_Bayes_Classif(tf_train, tf_test, data_train["label"], data_test["label"], "bayes_naiv_tf")

##### Random Sampling

In [None]:
tf_train, tf = tf_idf_vectorizer(under_random_train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
naive_Bayes_Classif(tf_train, tf_test, under_random_train["label"], data_test["label"], "naive_bayes_under_tf")

In [None]:
tf_train, tf = tf_idf_vectorizer(over_random_train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
naive_Bayes_Classif(tf_train, tf_test, over_random_train["label"], data_test["label"],"naive_bayes_over_tf")

##### SMOTE

In [None]:
# Re-fit TF-IDF on the full training text first: tf_train was last built from
# the randomly over-sampled corpus, so its rows no longer line up with
# data_train["label"].
tf_train, tf = tf_idf_vectorizer(train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
over_smote_train_text, over_smote_train_label = smote_oversampling(tf_train,data_train["label"])
naive_Bayes_Classif(over_smote_train_text, tf_test, over_smote_train_label, data_test["label"],"naive_bayes_smote_tf")

In [None]:
over_smote_under_train_text, over_smote_under_train_label = smote_over_undersampling(tf_train,data_train["label"])
naive_Bayes_Classif(over_smote_under_train_text, tf_test, over_smote_under_train_label, data_test["label"],"naive_bayes_smote_under_tf")

In [None]:
evaluate_loaded_model(tf_test, tf_valid, 'naive_bayes_smote_tf')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def logistic_regression_classif(x_train, x_test, y_train, y_test, name,value):
  """Train an L2 logistic regression, print its test report, and save it.

  The model uses the liblinear solver with C=0.05 and max_iter=1000.

  Args:
    x_train: Training feature matrix.
    x_test: Test feature matrix.
    y_train: Training labels.
    y_test: Test labels.
    name: File stem passed to `save_model`.
    value: Forwarded as ``class_weight`` (the notebook passes ``'balanced'``
      or ``None``).

  Returns:
    The fitted ``LogisticRegression`` instance.
  """
  classifier = LogisticRegression(
      verbose=1,
      solver='liblinear',
      C=0.05,
      penalty='l2',
      max_iter=1000,
      class_weight=value,
  )
  classifier.fit(x_train, y_train)

  test_predictions = classifier.predict(x_test)
  report = metrics.classification_report(y_test, test_predictions, digits=4)
  print(report)

  save_model(classifier, name)
  return classifier

In [None]:
test_text = get_vocab(data_test["text"])
valid_text = get_vocab(data_validation["text"])
train_text = get_vocab(data_train["text"])

#### Count Vectorizer

##### No Sampling

In [None]:
cv_train, cv = count_vectorizer(train_text)
cv_test =  cv.transform(test_text)
cv_valid =  cv.transform(valid_text)

In [None]:
logistic_regression_classif(cv_train, cv_test, data_train["label"], data_test["label"], "logistic_regression_balanced",'balanced')

In [None]:
evaluate_loaded_model(cv_test, cv_valid, 'logistic_regression_balanced')

##### Random Sampling

In [None]:
cv_train, cv = count_vectorizer(under_random_train_text)
cv_test =  cv.transform(test_text)
cv_valid =  cv.transform(valid_text)
logistic_regression_classif(cv_train, cv_test, under_random_train["label"], data_test["label"], "logistic_regression_under", None)

In [None]:
cv_train, cv = count_vectorizer(over_random_train_text)
cv_test =  cv.transform(test_text)
cv_valid =  cv.transform(valid_text)
logistic_regression_classif(cv_train, cv_test, over_random_train["label"], data_test["label"], "logistic_regression_over", None)

##### SMOTE

In [None]:
from collections import Counter

In [None]:
# Re-fit the count vectorizer on the full training text first: cv_train was
# last built from the randomly over-sampled corpus, so its rows no longer line
# up with data_train["label"].
cv_train, cv = count_vectorizer(train_text)
cv_test =  cv.transform(test_text)
cv_valid =  cv.transform(valid_text)
over_smote_train_text, over_smote_train_label = smote_oversampling(cv_train,data_train["label"])

In [None]:
logistic_regression_classif(over_smote_train_text, cv_test, over_smote_train_label, data_test["label"], "logistic_regression_smote", None)

In [None]:
over_smote_under_train_text, over_smote_under_train_label = smote_over_undersampling(cv_train,data_train["label"])
print(Counter(over_smote_under_train_label))

In [None]:
logistic_regression_classif(over_smote_under_train_text, cv_test, over_smote_under_train_label, data_test["label"], "logistic_regression_smote_under", None)

#### Tf.idf

##### No Sampling

In [None]:
tf_train, tf = tf_idf_vectorizer(train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)

In [None]:
logistic_regression_classif(tf_train, tf_test, data_train["label"], data_test["label"], "logistic_regression_balanced_tf",'balanced')

##### Random Sampling

In [None]:
tf_train, tf = tf_idf_vectorizer(under_random_train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
logistic_regression_classif(tf_train, tf_test, under_random_train["label"], data_test["label"], "logistic_regression_under_tf",None)

In [None]:
tf_train, tf = tf_idf_vectorizer(over_random_train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
logistic_regression_classif(tf_train, tf_test, over_random_train["label"], data_test["label"],"logistic_regression_over_tf", None)

##### SMOTE

In [None]:
# Re-fit TF-IDF on the full training text first: tf_train was last built from
# the randomly over-sampled corpus, so its rows no longer line up with
# data_train["label"].
tf_train, tf = tf_idf_vectorizer(train_text)
tf_test =  tf.transform(test_text)
tf_valid =  tf.transform(valid_text)
over_smote_train_text, over_smote_train_label = smote_oversampling(tf_train,data_train["label"])
logistic_regression_classif(over_smote_train_text, tf_test, over_smote_train_label, data_test["label"],"logistic_regression_smote_tf", None)

In [None]:
over_smote_under_train_text, over_smote_under_train_label = smote_over_undersampling(tf_train,data_train["label"])
logistic_regression_classif(over_smote_under_train_text, tf_test, over_smote_under_train_label, data_test["label"],"logistic_regression_smote_under_tf",'balanced')

In [None]:
# This model was trained on TF-IDF features, so evaluate it on the TF-IDF
# test/validation matrices rather than the count-vectorizer ones.
evaluate_loaded_model(tf_test, tf_valid, 'logistic_regression_smote_under_tf')

### Word2Vec

In [None]:
import gensim
from gensim.models import Word2Vec
from tensorflow.keras import models, layers, preprocessing as kprocessing
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
def w2v_embedding(train_text):
  """Train a skip-gram Word2Vec model on the training texts.

  Bigram and trigram phrase detectors are also fitted and returned; the
  Word2Vec model itself is trained on the plain unigram token lists.

  Args:
    train_text: Iterable of preprocessed strings.

  Returns:
    ``(nlp, lst_corpus, bigrams_detector, trigrams_detector)``: the Word2Vec
    model, the list of token lists, and the two fitted phrase detectors.
  """
  # One list of whitespace-separated tokens per document.
  lst_corpus = [document.split() for document in train_text]

  phrases = gensim.models.phrases
  bigrams_detector = phrases.Phraser(
      phrases.Phrases(lst_corpus, delimiter=" ".encode(), min_count=30))
  trigrams_detector = phrases.Phraser(
      phrases.Phrases(bigrams_detector[lst_corpus],
                      delimiter=" ".encode(), min_count=30))

  nlp = Word2Vec(lst_corpus, size=300, window=8, min_count=2,
                 sg=1, iter=30, negative=5)
  return nlp, lst_corpus, bigrams_detector, trigrams_detector
def feature_engineering(lst_corpus, train_text):
  """Fit a Keras tokenizer and turn the training texts into padded sequences.

  Args:
    lst_corpus: List of token lists used to fit the tokenizer vocabulary.
    train_text: Iterable of preprocessed strings to convert to sequences.

  Returns:
    ``(tokenizer, dict_vocabulary, X_train)``: the fitted tokenizer, its
    ``word_index`` mapping, and the sequences padded/truncated (post) to
    length 30.
  """
  tokenizer = kprocessing.text.Tokenizer(
      lower=True, split=' ', oov_token="NaN",
      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
  tokenizer.fit_on_texts(lst_corpus)
  dict_vocabulary = tokenizer.word_index

  # Re-tokenise the texts by whitespace and map them to integer ids.
  token_lists = [document.split() for document in train_text]
  sequences = tokenizer.texts_to_sequences(token_lists)

  X_train = kprocessing.sequence.pad_sequences(
      sequences, maxlen=30, padding="post", truncating="post")
  return tokenizer, dict_vocabulary, X_train
def test_handling(test_text,bigrams_detector, trigrams_detector, tokenizer):
  """Convert held-out texts to padded sequences with already-fitted objects.

  Args:
    test_text: Iterable of preprocessed strings.
    bigrams_detector: Fitted bigram phrase detector.
    trigrams_detector: Fitted trigram phrase detector.
    tokenizer: Fitted Keras tokenizer.

  Returns:
    Sequences padded/truncated (post) to length 30.
  """
  token_lists = [document.split() for document in test_text]

  # Apply the fitted phrase detectors, bigrams first and trigrams second.
  with_bigrams = list(bigrams_detector[token_lists])
  with_trigrams = list(trigrams_detector[with_bigrams])

  sequences = tokenizer.texts_to_sequences(with_trigrams)
  X_test = kprocessing.sequence.pad_sequences(
      sequences, maxlen=30, padding="post", truncating="post")
  return X_test
def make_embedding_matrix(dic_vocabulary, nlp):
  """Build the embedding weight matrix for the tokenizer vocabulary.

  Args:
    dic_vocabulary: Mapping of word -> integer index (row in the matrix).
    nlp: Trained Word2Vec model; vectors are read from ``nlp.wv``.

  Returns:
    Array of shape ``(len(dic_vocabulary) + 1, vector_size)``. Rows for words
    missing from the model (and row 0) are left as zeros.
  """
  # Look vectors up through `nlp.wv` instead of indexing the model directly
  # (deprecated). The width comes from the vectors when available and falls
  # back to the original 300.
  vectors = nlp.wv
  vector_size = getattr(vectors, "vector_size", 300)
  embeddings = np.zeros((len(dic_vocabulary)+1, vector_size))
  for word, idx in dic_vocabulary.items():
      try:
          embeddings[idx] = vectors[word]
      except KeyError:
          # Word not in the Word2Vec vocabulary: keep the all-zero row. Only
          # KeyError is caught so real failures are no longer hidden.
          continue
  return embeddings
def rn_w2v_model(embeddings):
  """Build and compile the bidirectional-LSTM classifier on Word2Vec weights.

  Architecture: input of length 30 -> Embedding initialised from `embeddings`
  -> two bidirectional LSTM layers (30 units, dropout 0.2) -> Dense(64, relu)
  -> Dense(1, sigmoid). Compiled with binary cross-entropy, Adam, and
  `f1_metric`. The model summary is printed.

  Args:
    embeddings: 2-D weight matrix; its shape sets the embedding layer's
      input and output dimensions.

  Returns:
    The compiled Keras model.
  """
  vocab_size, embedding_dim = embeddings.shape[0], embeddings.shape[1]

  inputs = layers.Input(shape=(30,))
  hidden = layers.Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embeddings],
                            input_length=30)(inputs)
  # First recurrent layer returns the full sequence for the second one.
  hidden = layers.Bidirectional(
      layers.LSTM(units=30, dropout=0.2, return_sequences=True))(hidden)
  hidden = layers.Bidirectional(layers.LSTM(units=30, dropout=0.2))(hidden)
  hidden = layers.Dense(64, activation='relu')(hidden)
  outputs = layers.Dense(1, activation='sigmoid')(hidden)

  model = models.Model(inputs, outputs)
  model.compile(loss='binary_crossentropy',
                optimizer='adam', metrics=[f1_metric])

  model.summary()
  return model
def w2v_train(model,X_train, y_train,  X_valid, y_valid):
  """Fit the Word2Vec/LSTM model with balanced class weights and plot history.

  Training runs for up to 20 epochs with batch size 32 and early stopping
  (patience 5, min_delta 0.01, best weights restored).

  Args:
    model: Compiled Keras model.
    X_train: Padded training sequences.
    y_train: Training labels (0/1).
    X_valid: Padded validation sequences.
    y_valid: Validation labels.

  Returns:
    The same `model`, after fitting.
  """
  y_train = np.array(y_train)
  # Derive the class weights from the labels actually used for fitting; the
  # original read the global data_train, which is only right when the caller
  # happens to pass that same frame.
  balanced = class_weight.compute_class_weight(
      class_weight="balanced", classes=np.unique(y_train), y=y_train)
  print(balanced)
  class_weights = {
      0: balanced[0],
      1: balanced[1]
  }
  early_stopping = EarlyStopping(
      min_delta=0.01, # minimum amount of change to count as an improvement
      patience=5, # how many epochs to wait before stopping
      restore_best_weights=True,
  )
  # train with weights
  training = model.fit(x=X_train, y=y_train, batch_size=32,
                      epochs=20, shuffle=True, verbose=1, callbacks=[early_stopping],
                      validation_data=(X_valid, y_valid), class_weight=class_weights)
  ## plot loss and the remaining training metrics
  # Named metric_names so the sklearn `metrics` module is not shadowed here.
  metric_names = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
  fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
  ax[0].set(title="Training")
  ax11 = ax[0].twinx()
  ax[0].plot(training.history['loss'], color='black')
  ax[0].set_xlabel('Epochs')
  ax[0].set_ylabel('Loss', color='black')
  for metric in metric_names:
      ax11.plot(training.history[metric], label=metric)
  ax11.set_ylabel("Score", color='steelblue')
  ax11.legend()
  ax[1].set(title="Validation")
  ax22 = ax[1].twinx()
  ax[1].plot(training.history['val_loss'], color='black')
  ax[1].set_xlabel('Epochs')
  ax[1].set_ylabel('Loss', color='black')
  for metric in metric_names:
      ax22.plot(training.history['val_'+metric], label=metric)
  ax22.set_ylabel("Score", color="steelblue")
  plt.show()
  return model
def w2v_test(model, X_test,y_test):
  """Print the classification report for `model` on the test set.

  Predictions from ``model.predict`` are binarised with a fixed 0.5
  threshold via `threshold`.
  """
  # Fixed cut-off; roc_curve_threshold / pr_curve_threshold could be used
  # here instead to tune it.
  opt_threshold = 0.5
  scores = model.predict(X_test)
  predicted = []
  for score in scores:
    predicted.append(threshold(score, opt_threshold))
  print(metrics.classification_report(y_test, predicted, digits=4))

In [None]:
train_text = get_vocab(data_train["text"])
test_text = get_vocab(data_test["text"])
valid_text = get_vocab(data_validation["text"])

In [None]:
nlp, lst_corpus, bigrams_detector, trigrams_detector = w2v_embedding(train_text)
tokenizer, dic_vocabulary, X_train = feature_engineering(lst_corpus, train_text)
X_test = test_handling(test_text,bigrams_detector, trigrams_detector, tokenizer)
x_valid = test_handling(valid_text,bigrams_detector, trigrams_detector, tokenizer)

In [None]:
embeddings=make_embedding_matrix(dic_vocabulary, nlp)
w2v_model = rn_w2v_model(embeddings)
w2v_model_trained = w2v_train(w2v_model, X_train, data_train["label"], x_valid, data_validation["label"])

In [None]:
w2v_test(w2v_model_trained, X_test,data_test["label"])

In [None]:
evaluate_model(X_test, x_valid, w2v_model_trained)

In [None]:
save_model(w2v_model_trained, "word2vec_balanced_embed")

### BERT (Sentence Transformers)

In [None]:
from sentence_transformers import SentenceTransformer
from keras.models import Sequential
from keras.initializers import GlorotNormal
from keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding, Input, BatchNormalization
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from keras import models
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.callbacks import LearningRateScheduler
import tensorflow as tf
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
import pickle

In [None]:
def sentence_transformers_embeddings(texts):
  """Encode `texts` with the ``all-mpnet-base-v2`` SentenceTransformer.

  A new model instance is created on every call.

  Returns:
    Whatever ``SentenceTransformer.encode`` returns for `texts`.
  """
  encoder = SentenceTransformer("all-mpnet-base-v2")
  return encoder.encode(texts)

#### RN

In [None]:
def rn_bert_sent_transf_model(embeddings, y_train):
  """Build and compile a small dense classifier over sentence embeddings.

  Architecture: Dense(96, relu) -> Dropout(0.5) -> Dense(1, sigmoid), with
  Glorot-normal initialisation. Compiled with binary cross-entropy, Adam
  (learning rate 2e-5) and `f1_metric`. The model summary is printed.

  Args:
    embeddings: Sequence of embedding vectors; the length of the first one
      sets the input size.
    y_train: Not used by this function.

  Returns:
    The compiled Keras model.
  """
  embedding_dim = len(embeddings[0])
  initializer = GlorotNormal()

  model = Sequential([
      Dense(96, input_shape=(embedding_dim, ), activation="relu",
            kernel_initializer=initializer),
      Dropout(0.5),
      Dense(1, activation='sigmoid', kernel_initializer=initializer),
  ])
  model.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                metrics=[f1_metric])
  model.summary()
  return model

In [None]:
def bert_sent_transf_train(model, y_train, X_train, X_valid, y_valid):
  """Fit the sentence-embedding classifier and plot its training history.

  Training runs for up to 30 epochs with batch size 32 and early stopping
  (patience 5, min_delta 0.001, best weights restored). No class weights are
  applied.

  Args:
    model: Compiled Keras model.
    y_train: Training labels.
    X_train: Training embeddings.
    X_valid: Validation embeddings.
    y_valid: Validation labels.

  Returns:
    The same `model`, after fitting.
  """
  stopper = EarlyStopping(
      min_delta=0.001,            # smallest change that counts as an improvement
      patience=5,                 # epochs to wait before stopping
      restore_best_weights=True,
  )
  training = model.fit(x=X_train, y=y_train, batch_size=32,
                       epochs=30, shuffle=True, verbose=1,
                       callbacks=[stopper],
                       validation_data=(X_valid, y_valid))
  history = training.history

  # Everything in the history that is neither a loss nor a validation entry.
  score_names = [key for key in history.keys()
                 if ("loss" not in key) and ("val" not in key)]

  fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)

  # Left panel: training loss, with the scores on a twin axis.
  ax[0].set(title="Training")
  train_scores_ax = ax[0].twinx()
  ax[0].plot(history['loss'], color='black')
  ax[0].set_xlabel('Epochs')
  ax[0].set_ylabel('Loss', color='black')
  for score_name in score_names:
      train_scores_ax.plot(history[score_name], label=score_name)
  train_scores_ax.set_ylabel("Score", color='steelblue')
  train_scores_ax.legend()

  # Right panel: validation loss, with the scores on a twin axis.
  ax[1].set(title="Validation")
  valid_scores_ax = ax[1].twinx()
  ax[1].plot(history['val_loss'], color='black')
  ax[1].set_xlabel('Epochs')
  ax[1].set_ylabel('Loss', color='black')
  for score_name in score_names:
      valid_scores_ax.plot(history['val_'+score_name], label=score_name)
  valid_scores_ax.set_ylabel("Score", color="steelblue")

  plt.show()
  return model

In [None]:
def bert_sent_transf_test(model, X_test, y_test, X_valid, y_valid):
  """Print accuracy and the classification report for `model` on the test set.

  Predictions from ``model.predict`` are binarised with a fixed 0.5
  threshold. `X_valid` and `y_valid` are accepted but not used.
  """
  # Fixed cut-off; pr_curve_threshold on the validation split could be used
  # here instead to tune it.
  opt_threshold = 0.5
  scores = model.predict(X_test)
  predicted = []
  for score in scores:
    predicted.append(threshold(score, opt_threshold))
  accuracy = metrics.accuracy_score(y_test, predicted)
  print("Accuracy:",  round(accuracy,2))
  print(metrics.classification_report(y_test, predicted, digits=4))

In [None]:
text_train = get_vocab_bert(data_train["text"])
text_test = get_vocab_bert(data_test["text"])
text_valid = get_vocab_bert(data_validation["text"])

In [None]:
embeddings = sentence_transformers_embeddings(text_train)

In [None]:
text_train = under_random_train_text_bert
embeddings_labels = under_random_train["label"]
embeddings = sentence_transformers_embeddings(text_train)

In [None]:
# NOTE(review): the previous cell reassigns `embeddings` from the randomly
# under-sampled text, while the labels passed here are the full
# data_train["label"]. Confirm both have the same number of rows before
# running this cell.
embeddings, embeddings_labels  = smote_over_undersampling(embeddings, data_train["label"])

In [None]:
bert_sentence_transformer_model = rn_bert_sent_transf_model(embeddings, embeddings_labels)

In [None]:
embeddings_valid = sentence_transformers_embeddings(text_valid)

In [None]:
trained_bert_sent_transf_model = bert_sent_transf_train(bert_sentence_transformer_model, embeddings_labels, embeddings, embeddings_valid, data_validation["label"])

In [None]:
embeddings_test = sentence_transformers_embeddings(text_test)

In [None]:
# `test_df` is never defined in this notebook; the test labels live in data_test.
bert_sent_transf_test(trained_bert_sent_transf_model, embeddings_test, data_test["label"], embeddings_valid, data_validation["label"])

In [None]:
save_model(trained_bert_sent_transf_model, "sentence_transf_smote_under")

In [None]:
trained_bert_sent_transf_model.save(ROOT_DIR+'/models/bert_rn_pr')

In [None]:
new_model = tf.keras.models.load_model(ROOT_DIR+'/models/bert_rn_pr', custom_objects={'f1_metric':f1_metric})
new_model.summary()

In [None]:
# `test_df` is never defined in this notebook; the test labels live in data_test.
bert_sent_transf_test(new_model, embeddings_test, data_test["label"], embeddings_valid, data_validation["label"])

### BERT Pretrained Models

In [None]:
import pickle
from transformers import AutoModel, AdamW, AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader, TensorDataset, Dataset, RandomSampler
import torch
from sklearn.utils import class_weight
import numpy as np
import torch.nn as nn
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sklearn import metrics
from sklearn.metrics import f1_score

In [3]:
#change model name for all the models you want to fine-tune
embedding_config = {
    "model_name": "roberta-large",
    "path":ROOT_DIR+"/models/roberta-large-new",
    "max_length":40,
    "batch_size":16,
    "source":"HuggingFace"
}

In [None]:
model_config = {
    "learning_rate" : 2e-5,
    "weight_decay":0.01,
    "epochs":20
}

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class_weights = class_weight.compute_class_weight(class_weight="balanced", classes = np.unique(data_train["label"]), y=np.array(data_train["label"]))

In [None]:
class BertClassfierPytorch(nn.Module):
  """Pretrained transformer encoder plus a dropout and linear classification head.

  The tokenizer and encoder are loaded from ``embedding_config["model_name"]``.
  """

  def __init__(self, input_size = None, output_size = 1):
    """Load the tokenizer and encoder and create the classification head.

    Args:
      input_size: Width of the encoder output fed to the linear head. When
        ``None`` (the default) it is read from the encoder's
        ``config.hidden_size``. The previous fixed default of 768 does not
        fit larger checkpoints such as the configured ``roberta-large``.
      output_size: Number of output logits.
    """
    super().__init__()
    self.name = embedding_config["model_name"]
    self.tokenizer = AutoTokenizer.from_pretrained(embedding_config["model_name"], use_fast=False)
    self.embedder=AutoModel.from_pretrained(embedding_config["model_name"])
    if input_size is None:
      # Size the head to match whichever checkpoint was loaded.
      input_size = self.embedder.config.hidden_size
    print(input_size)
    self.dropout=nn.Dropout(0.5)
    self.classifier=nn.Linear(input_size, output_size)
    nn.init.xavier_uniform_(self.classifier.weight)

In [None]:
def normalize(predicted):
  """Binarise an iterable of scores with `threshold` at a fixed 0.5 cut-off."""
  opt_threshold=0.5
  binarised = []
  for score in predicted:
    binarised.append(threshold(score, opt_threshold))
  return binarised

def evaluate_predictions(predictions, labels):
  """Print the classification report and return the F1 score.

  Args:
    predictions: Predicted binary labels.
    labels: True binary labels.

  Returns:
    ``f1_score(labels, predictions)``.
  """
  # The confusion matrix is computed but its value is not used or shown.
  metrics.confusion_matrix(labels, predictions)
  report = metrics.classification_report(labels, predictions, digits=4)
  print(report)
  return f1_score(labels, predictions)

In [None]:
def evaluate(model, x_test, y_test):
  """Score `model` on a labelled text set and return its F1.

  Texts are tokenised with ``model.tokenizer`` (padded/truncated to
  ``embedding_config["max_length"]``) and run in batches through the encoder
  and classification head with the model in eval mode and gradients disabled.

  Args:
    model: Object exposing ``tokenizer``, ``embedder``, ``dropout`` and
      ``classifier`` (see `BertClassfierPytorch`).
    x_test: Texts to classify.
    y_test: True labels; converted with ``torch.FloatTensor``.

  Returns:
    The value returned by `evaluate_predictions` (the F1 score).
  """
  encoded = model.tokenizer(x_test, padding  = "max_length",
                            max_length = embedding_config["max_length"],
                            truncation = True, return_tensors = "pt")
  dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"],
                          torch.FloatTensor(y_test))
  loader = DataLoader(dataset, batch_size=embedding_config["batch_size"])

  model.eval()
  preds = []
  for input_ids, attention_mask, _ in loader:
    with torch.no_grad():
      pooled = model.embedder(input_ids=input_ids.to(device),
                              attention_mask=attention_mask.to(device)).pooler_output
      logits = model.classifier(model.dropout(pooled))
    # Sigmoid then round gives 0./1. per example; flatten to a Python list.
    preds.extend(logits.sigmoid().round().reshape(-1).tolist())

  return evaluate_predictions(normalize(preds), y_test)

In [None]:
def train_model(model, x_train, y_train, x_valid, y_valid):
  """Fine-tune `model` with a weighted BCE loss and checkpoint the best validation F1.

  Reads the notebook-level `embedding_config` (max_length, batch_size, path),
  `model_config` (epochs, learning_rate, weight_decay) and `device`.

  Args:
    model: object exposing `tokenizer`, `embedder`, `dropout` and `classifier`.
    x_train: training texts, passed straight to `model.tokenizer`.
    y_train: training labels, converted with `torch.FloatTensor`.
    x_valid: validation texts, handed to `evaluate` after every epoch.
    y_valid: validation labels, handed to `evaluate` after every epoch.

  Returns:
    Tuple `(num_training_steps, losses)` where `losses` holds one mean
    training loss per epoch.
  """
  max_score = 0

  x_train = model.tokenizer(x_train, padding  = "max_length", max_length = embedding_config["max_length"], truncation = True, return_tensors = "pt")
  train_data = TensorDataset(x_train["input_ids"], x_train["attention_mask"], torch.FloatTensor(y_train))

  batch_size = embedding_config["batch_size"]
  # Reshuffle every epoch so mini-batches are not always drawn in file order
  # (the ensemble trainer in this notebook already shuffles its training data).
  train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

  num_epochs = model_config["epochs"]
  num_training_steps = num_epochs * len(train_dataloader)

  optimizer = AdamW(model.parameters(), lr = model_config["learning_rate"], weight_decay = model_config["weight_decay"])

  lr_scheduler = get_scheduler(
      "linear",
      optimizer = optimizer,
      # the scheduler expects a whole number of warm-up steps (first 20% of training)
      num_warmup_steps = int(0.2*num_training_steps),
      num_training_steps = num_training_steps
  )

  # Loop-invariant: build the weighted loss once instead of once per batch.
  criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([5.2]).to(device))

  print("Steps ",num_training_steps)

  progress_bar = tqdm(range(num_training_steps))

  losses = []

  for epoch in range(num_epochs):
    # evaluate() below switches the model to eval mode, so re-enable training mode each epoch
    model.train()
    print("Epoch: ", epoch+1)
    epoch_loss = 0.0
    for batch in train_dataloader:
      aux_batch = {
        "input_ids" : batch[0].to(device),
        "attention_mask" : batch[1].to(device)
      }

      outputs = model.embedder(**aux_batch).pooler_output
      outputs = model.classifier(model.dropout(outputs))

      loss = criterion(outputs, batch[2].to(device).reshape(-1,1))

      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      lr_scheduler.step()
      progress_bar.update(1)
      epoch_loss += loss.item()

    # Record the mean loss of the epoch rather than only the last mini-batch's loss.
    losses.append(epoch_loss / max(len(train_dataloader), 1))
    score =  evaluate(model, x_valid, y_valid)

    # Keep only the weights of the best-scoring epoch on disk.
    if score>max_score:
      max_score = score
      torch.save(model.state_dict(), embedding_config["path"]+".pt")

  progress_bar.close()
  return num_training_steps, losses


In [None]:
# Run get_vocab_bert over the text column of each split; the results are the
# inputs passed to train_model / evaluate in the cells below.
text_train = get_vocab_bert(data_train["text"])
text_test = get_vocab_bert(data_test["text"])
text_valid = get_vocab_bert(data_validation["text"])

In [None]:
# Build the classifier with a 1024-feature head input and move it to `device`.
# NOTE(review): 1024 presumably matches the hidden size of the encoder named in
# embedding_config["model_name"] — confirm.
model = BertClassfierPytorch(input_size=1024)
model.to(device)

In [None]:
# Fine-tune on the training split; train_model scores the validation split after every epoch.
num_training_steps, losses = train_model(model, text_train, data_train["label"], text_valid, data_validation["label"])

In [None]:
# `losses` holds one value per epoch, so index by its own length and label the
# x axis as epochs (it was labelled "step").
plt.plot(range(len(losses)), losses)
plt.ylabel("loss")
plt.xlabel("epoch")

In [None]:
# Fresh instance built with the same constructor arguments as the trained model,
# to receive the saved weights in the next cell.
load_model = BertClassfierPytorch(input_size=1024)

In [None]:
# Restore the fine-tuned weights stored on Drive.
path = join(ROOT_DIR, 'models', "roberta-large-new.pt")
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one,
# matching how the ensemble section loads its checkpoints.
load_model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Move the restored model to `device`; evaluate() prints the report and returns
# the F1 score that is printed here.
load_model.to(device)
print(evaluate(load_model, text_test, data_test["label"]))

### Ensemble Bert

In [None]:
import pickle
from transformers import AutoModel, AdamW, AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader, TensorDataset, Dataset, RandomSampler
import torch
from sklearn.utils import class_weight
import numpy as np
import torch.nn as nn
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sklearn import metrics
from sklearn.metrics import f1_score

In [None]:
# Encoder/tokenizer settings for the ensemble section.
# NOTE(review): this rebinds the notebook-level `embedding_config` that evaluate(),
# train_model() and BertClassfierPytorch read at call time — re-running those
# after this cell will pick up these values.
embedding_config = {
    "model_name": "bert-base-uncased",
    "path":ROOT_DIR+"/models/bert-base-uncased",
    "max_length":40,
    "batch_size":16,
    "source":"HuggingFace"
}

In [None]:
# Prefer the GPU when CUDA is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Recompute the per-split model inputs (same calls as in the earlier fine-tuning
# section) so this section can be run on its own.
text_train = get_vocab_bert(data_train["text"])
text_test = get_vocab_bert(data_test["text"])
text_valid = get_vocab_bert(data_validation["text"])

In [None]:
class BertPytorch(nn.Module):
  """Inference wrapper around one fine-tuned encoder + linear head.

  Unlike a conventional ``nn.Module``, ``forward`` takes raw texts, tokenizes
  and batches them itself, and returns a plain Python list of logits.
  """

  def __init__(self,name="vinai/bertweet-large", input_size = 1024, output_size = 1):
    """
    Args:
      name: HuggingFace model id used for both tokenizer and encoder.
      input_size: number of features in the encoder's pooled output.
      output_size: number of logits produced by the linear head.
    """
    # Initialise nn.Module first so every attribute below is registered normally.
    super().__init__()

    # Per-instance tokenization/batching settings, read by forward().
    self.embedding_config = {
      "model_name": name,
      "max_length":40,
      "batch_size":16,
      "source":"HuggingFace"
    }

    # Training hyper-parameters stored on the instance; not read by this class.
    self.model_config = {
        "learning_rate" : 0.01,
        "weight_decay":0.01,
        "epochs":10
    }

    self.name = name
    self.tokenizer = AutoTokenizer.from_pretrained(name, use_fast=False)
    self.embedder = AutoModel.from_pretrained(name)
    self.dropout = nn.Dropout(0.5)
    self.classifier = nn.Linear(input_size, output_size)

  def forward(self, x):
    """Return one raw logit per input text.

    Args:
      x: texts handed to ``self.tokenizer``.

    Returns:
      Flat list of floats (no sigmoid applied), in input order.
    """
    # Use this instance's settings rather than whatever the notebook-level
    # `embedding_config` happens to hold when the method runs.
    cfg = self.embedding_config
    encoded = self.tokenizer(x, padding  = "max_length", max_length = cfg["max_length"],
                             truncation = True, return_tensors = "pt")
    dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"])
    loader = DataLoader(dataset, batch_size=cfg["batch_size"])

    # Send inputs to wherever this module's weights live, so the call does not
    # depend on a global `device` matching the module's placement.
    module_device = self.classifier.weight.device

    preds=[]
    for batch in loader:
      aux_batch = {
          "input_ids" : batch[0].to(module_device),
          "attention_mask" : batch[1].to(module_device)
      }

      with torch.no_grad():
        outputs = self.embedder(**aux_batch).pooler_output
        # dropout is a no-op once the caller has put the module in eval mode
        outputs = self.classifier(self.dropout(outputs))

      preds+=outputs.reshape(-1).tolist()

    return preds


In [None]:
class EnsembleModel(nn.Module):
  """Stacking head: maps the base models' outputs to a single logit.

  Layout is Linear(models_count -> 128) -> Dropout(0.2) -> Linear(128 -> 1),
  with no activation between the two linear layers.
  """

  def __init__(self, models_count):
      """
      Args:
        models_count: number of input features (one per base model).
      """
      super().__init__()
      self.models_count = models_count
      self.linear = nn.Linear(models_count, 128)
      self.drop = nn.Dropout(0.2)
      self.classifier = nn.Linear(128, 1)

  def forward(self, x):
      """Return the raw logit for each row of `x`."""
      hidden = self.drop(self.linear(x))
      return self.classifier(hidden)


In [None]:
# Hyper-parameters handed to TrainEnsembleModel.
# NOTE(review): as written in this notebook, TrainEnsembleModel only reads
# batch_size, learning_rate, weight_decay and epochs; model_name, max_length
# and epsilon are not referenced by it — confirm whether they are needed.
config = {
    "model_name":"ensemble-model-new",
    "max_length":40,
    "batch_size":8,
    "learning_rate":0.0001,
    "epsilon": 1e-8,
    "weight_decay": 1e-02,
    "epochs":30
}

In [None]:
def normalize(predicted):
  """Apply `threshold` with a fixed cutoff of 0 to every prediction.

  Re-defines the `normalize` from the earlier section, which used a cutoff of 0.5.
  NOTE(review): evaluate() looks up `normalize` at call time, so once this cell
  has run it will use this cutoff too — confirm that is intended.

  Args:
    predicted: iterable of prediction scores.

  Returns:
    List holding `threshold(score, 0)` for each score, in input order.
  """
  cutoff = 0
  binarized = []
  for score in predicted:
    binarized.append(threshold(score, cutoff))
  return binarized

def evaluate_predictions(predictions, labels):
  """Print the confusion matrix and classification report; return the F1 score.

  Re-defines the function of the same name from the earlier section.

  Args:
    predictions: predicted class labels.
    labels: ground-truth class labels.

  Returns:
    The value of `f1_score(labels, predictions)`.
  """
  # The confusion matrix used to be computed and silently thrown away; print it
  # so the call actually contributes to the evaluation output.
  print(metrics.confusion_matrix(labels, predictions))
  print(metrics.classification_report(labels, predictions, digits=4))
  return f1_score(labels, predictions)

def normalize_threshold(predicted, y_test):
  """Binarize predictions with a cutoff chosen by `pr_curve_threshold`.

  Args:
    predicted: prediction scores.
    y_test: labels passed to `pr_curve_threshold` together with the scores.

  Returns:
    List holding `threshold(score, cutoff)` for each score, in input order.
  """
  cutoff = pr_curve_threshold(predicted, y_test)
  binarized = []
  for score in predicted:
    binarized.append(threshold(score, cutoff))
  return binarized

In [None]:
class TrainEnsembleModel():
  """Builds stacked features from fine-tuned base models and trains an EnsembleModel on them.

  `self.models` lists (HuggingFace model id, pooled-output width) pairs; each
  one is expected to have a checkpoint under ROOT_DIR/models named after the
  part of the id that follows the last "/".
  """

  def __init__(self, config):
    """
    Args:
      config: dict providing at least batch_size, learning_rate, weight_decay and epochs.
    """
    self.config = config
    self.models = [
          ("cardiffnlp/twitter-roberta-base-sentiment", 768),
          ("cardiffnlp/twitter-roberta-base-hate", 768),
          ("cardiffnlp/twitter-roberta-base-emotion", 768),
          ("cardiffnlp/twitter-roberta-base-offensive", 768),
          ("cardiffnlp/twitter-roberta-base-irony", 768),
          ("cardiffnlp/twitter-roberta-base", 768),
          ("vinai/bertweet-large", 1024),
          ("ningkko/drug-stance-bert", 768),
          ("ml4pubmed/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_pub_section", 768),
          ("dmis-lab/biobert-large-cased-v1.1-mnli",  1024),
          ("roberta-large", 1024),
          ("bert-base-uncased",768)
        ]

  @staticmethod
  def _checkpoint_stem(model_name):
      """Return the checkpoint file stem for a model id: the text after the last "/"."""
      # Works for ids with and without an organisation prefix, so the list above
      # can be reordered or extended without touching any position-based logic.
      return model_name.rsplit("/", 1)[-1]

  def get_models_out(self, x,x_label=None, x_test=None, y_test=None):
      """Run every base model on `x` and stack their logits column-wise.

      Args:
        x: texts passed to each base model's forward().
        x_label, x_test, y_test: accepted for backward compatibility; unused.

      Returns:
        numpy array with one row per text and one column per base model.
      """
      outs=[]
      for model_name, model_dim in self.models:
        model_path = ROOT_DIR+"/models/"+self._checkpoint_stem(model_name)+".pt"
        new_model = BertPytorch(name=model_name, input_size = model_dim)
        new_model.load_state_dict(torch.load(model_path, map_location = device))
        new_model.to(device)
        new_model.eval()
        outs.append(new_model.forward(x))
      # list of per-model prediction lists -> (n_texts, n_models)
      return np.array(outs).transpose()

  def train(self, y_train, x_valid, y_valid, x_train_features=None, x_valid_features=None):
      """Train an EnsembleModel on stacked features and return the best epoch.

      Args:
        y_train: training labels.
        x_valid: unused; kept so existing calls keep working.
        y_valid: validation labels used to score every epoch.
        x_train_features: stacked training features; defaults to the
          notebook-level `x_train_ensemble`.
        x_valid_features: stacked validation features; defaults to the
          notebook-level `x_valid_ensemble`.

      Returns:
        Tuple `(max_score, best_model)`: best validation F1 and a deep copy of
        the model at that epoch.
      """
      import copy

      # Fall back to the notebook globals the original implementation relied on.
      if x_train_features is None:
        x_train_features = x_train_ensemble
      if x_valid_features is None:
        x_valid_features = x_valid_ensemble

      model = EnsembleModel(len(self.models))
      model.to(device)
      train_data = TensorDataset(torch.FloatTensor(x_train_features), torch.FloatTensor(y_train))

      batch_size = self.config["batch_size"]
      train_dataloader = DataLoader(train_data,shuffle=True, batch_size=batch_size)

      num_epochs = self.config["epochs"]
      num_training_steps = num_epochs * len(train_dataloader)

      optimizer = AdamW(model.parameters(), lr = self.config["learning_rate"],
                        weight_decay = self.config["weight_decay"])

      lr_scheduler = get_scheduler(
          "linear",
          optimizer = optimizer,
          # the scheduler expects a whole number of warm-up steps (first 20% of training)
          num_warmup_steps = int(0.2*num_training_steps),
          num_training_steps = num_training_steps
      )

      # Loop-invariant: build the weighted loss once instead of once per batch.
      criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([7.1]).to(device))

      progress_bar = tqdm(range(num_training_steps))
      max_score = 0
      # Stays None until the first epoch is scored, so a run whose F1 never rises
      # above 0 still returns a model instead of hitting an unbound variable.
      best_model = None

      for epoch in range(num_epochs):
        # self.evaluate() switches to eval mode, so re-enable training mode each epoch
        model.train()
        print("Epoch: ", epoch+1)
        for batch in train_dataloader:
          features = batch[0].to(device)
          labels = batch[1].to(device)

          outputs = model(features)
          loss = criterion(outputs, labels.reshape(-1,1))

          loss.backward()
          optimizer.step()
          optimizer.zero_grad()
          lr_scheduler.step()
          progress_bar.update(1)

        score =  self.evaluate(model, x_valid_features, y_valid)

        if best_model is None or score>max_score:
          max_score = score
          best_model = copy.deepcopy(model)

      if best_model is None:
        # zero epochs configured: hand back the freshly initialised model
        best_model = model
      return max_score,best_model

  def evaluate(self, model, x_test, y_test):
      """Print the classification report for `model` on (x_test, y_test) and return its F1."""
      preds = self.get_predictions(x_test, model)
      print(metrics.classification_report(y_test, preds, digits=4))
      return f1_score(y_test, preds)

  def get_predictions(self, x_test, model):
      """Return hard 0.0/1.0 predictions of `model` for each row of `x_test`.

      Args:
        x_test: stacked features, convertible with `torch.FloatTensor`.
        model: module mapping a feature batch to one logit per row.

      Returns:
        Flat list of floats obtained by rounding the sigmoid of each logit.
      """
      test_data = TensorDataset(torch.FloatTensor(x_test))

      batch_size = self.config["batch_size"]
      test_dataloader = DataLoader(test_data, batch_size=batch_size)

      # Feed inputs to wherever the model's weights live.
      model_device = next(model.parameters()).device

      preds=[]
      model.eval()
      for batch in test_dataloader:
        with torch.no_grad():
          outputs = model(batch[0].to(model_device))

        preds+=outputs.sigmoid().round().reshape(-1).tolist()

      return preds


In [None]:
# Helper object that holds the base-model list and the ensemble hyper-parameters.
trainer = TrainEnsembleModel(config)

In [None]:
# Stacked training features: one row per text, one column per base model.
# The label argument is accepted by get_models_out but not used by it.
x_train_ensemble = trainer.get_models_out(text_train, data_train["label"])

In [None]:
# Stacked test features: one row per text, one column per base model.
x_test_ensemble = trainer.get_models_out(text_test, data_test["label"])

In [None]:
# Stacked validation features: one row per text, one column per base model.
x_valid_ensemble = trainer.get_models_out(text_valid, data_validation["label"])

In [None]:
# Sanity check: the first test row, i.e. one raw base-model output per column.
print(x_test_ensemble[0])

In [None]:
# Dump the stacked training features to a text file: one sample per line, every
# value followed by a single space. The context manager closes the file even if
# a write fails part-way through.
with open("x_train_ensemble.txt", "w") as f:
  for el in x_train_ensemble:
    for i in el:
      f.write(str(i)+" ")
    f.write("\n")

In [None]:
# Train the stacking head and keep the best epoch.
# NOTE(review): `text_valid` is passed as x_valid, but train() does not use that
# argument; by default it works on the precomputed x_train_ensemble /
# x_valid_ensemble globals, so those cells must have been run first.
trainer = TrainEnsembleModel(config)
max_score, best_model = trainer.train(data_train["label"], text_valid, data_validation["label"])

In [None]:
# Persist the best epoch's weights.
# NOTE(review): the file is named "ensemble-model" while config["model_name"] is
# "ensemble-model-new" — confirm which name is intended.
torch.save(best_model.state_dict(), ROOT_DIR+"/models/ensemble-model.pt")

In [None]:
# Held-out evaluation: evaluate() prints the classification report and returns the F1 printed here.
print(trainer.evaluate(best_model, x_test_ensemble, data_test["label"]))

#### AdaBoost

In [None]:
# Alternative meta-learner: AdaBoost with 100 estimators fitted on the stacked base-model outputs.
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(x_train_ensemble, data_train["label"])

In [None]:
# Choose the decision threshold from the validation probabilities (column 1 of
# predict_proba), then apply it to the test probabilities and report.
# NOTE(review): pr_curve_threshold and threshold are defined outside this
# section; their exact semantics are not visible here.
preds = clf.predict_proba(x_test_ensemble)
probs = clf.predict_proba(x_valid_ensemble)
opt_threshold = pr_curve_threshold(probs[:, 1], data_validation["label"])
predicted = [threshold(pred, opt_threshold) for pred in 
              preds[:, 1]]
print(metrics.classification_report(data_test["label"], predicted, digits=4))

In [None]:
# For comparison: AdaBoost's own predict() on the test features, without the tuned threshold.
preds = clf.predict(x_test_ensemble)
print(metrics.classification_report(data_test["label"], preds, digits=4))

In [None]:
# NOTE(review): the start of the notebook defines `save_model(model, name)`;
# confirm that a `save` helper with this signature is defined elsewhere.
save(clf, "ada_boost_ensemble")

#### Logistic Regression

In [None]:
# Logistic-regression meta-learner on the stacked features.
# NOTE(review): logistic_regression_classif is defined outside this section; it
# presumably fits and returns the estimator whose predict_proba is used below — confirm.
lr=logistic_regression_classif(x_train_ensemble, x_valid_ensemble, data_train["label"], data_validation["label"], "logistic_regression_ensemble",'balanced')

In [None]:
# Same protocol as for AdaBoost: pick the threshold from validation probabilities
# (column 1 of predict_proba), apply it to the test probabilities and report.
preds = lr.predict_proba(x_test_ensemble)
probs = lr.predict_proba(x_valid_ensemble)
opt_threshold = pr_curve_threshold(probs[:, 1], data_validation["label"])
predicted = [threshold(pred, opt_threshold) for pred in 
              preds[:, 1]]
print(metrics.classification_report(data_test["label"], predicted, digits=4))

#### KNN

In [None]:
# Standardise the stacked features. The scaler is fitted on the training split
# only and the same transform is then applied to all three splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train_ensemble)

X_train = scaler.transform(x_train_ensemble)
X_test = scaler.transform(x_test_ensemble)
X_valid = scaler.transform(x_valid_ensemble)

In [None]:
# 5-nearest-neighbour meta-learner fitted on the standardised training features.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, data_train["label"])

In [None]:
# Evaluate the KNN meta-learner on the standardised test features.
preds = classifier.predict(X_test)
print(metrics.classification_report(data_test["label"], preds, digits=4))

In [None]:
# NOTE(review): the start of the notebook defines `save_model(model, name)`;
# confirm that a `save` helper with this signature is defined elsewhere.
save(classifier, "knn_ensemble")

### Plotting

#### Data Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
def tweet_distribution(count_adr, count_noadr, index_names, fig_name):
  """Draw and save a grouped bar chart of ADR vs. no-ADR tweet counts.

  Args:
    count_adr: counts for the "ADR" bars, one per entry of `index_names`.
    count_noadr: counts for the "no ADR" bars, one per entry of `index_names`.
    index_names: labels of the groups on the x axis.
    fig_name: path the figure is saved to.
  """
  # Echo the raw counts before plotting.
  for counts in (count_adr, count_noadr):
    print(counts)

  columns = {"ADR": count_adr, "no ADR": count_noadr}
  plotdata = pd.DataFrame(columns, index=index_names)

  axes = plotdata.plot(kind="bar", figsize=(15, 8))
  axes.set_title("Tweets distribution")
  axes.set_xlabel("Set")
  axes.set_ylabel("Tweets")
  axes.figure.savefig(fig_name)

In [None]:
# Count label 1 and label 0 tweets per split and plot them side by side;
# tweet_distribution labels the label-1 counts "ADR" and the label-0 counts "no ADR".
train_labels = data_train["label"].tolist()
test_labels = data_test["label"].tolist()
valid_labels = data_validation["label"].tolist()

tweet_distribution([train_labels.count(1),test_labels.count(1),valid_labels.count(1)], [train_labels.count(0),test_labels.count(0),valid_labels.count(0)],["Train", "Test", "Validation"],"dataset1_distribution.png")


#### WordCloud

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
def make_word_cloud(data, fig_name, typ):
  """Render and save a word cloud built from the tweets whose label equals `typ`.

  Args:
    data: DataFrame with "text" and "label" columns.
    fig_name: path the figure is saved to.
    typ: label value selecting which tweets feed the word cloud.
  """
  examples = data[data['label']==typ]
  # Build the cloud from the filtered rows only; the filter result used to be
  # ignored, so every tweet ended up in the cloud regardless of `typ`.
  # The index is reset so get_vocab receives a 0..n-1 indexed Series.
  text = get_vocab(examples["text"].reset_index(drop=True))
  text = " ".join(text)
  word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
  plt.imshow(word_cloud, interpolation='bilinear')
  plt.axis("off")
  # Save before show(): once show() has displayed the figure, a later savefig
  # typically writes an empty canvas.
  plt.savefig(fig_name)
  plt.show()

In [None]:
# Word cloud for label 1.
# NOTE(review): the first line overwrites data_train["text"] in place with
# get_vocab's output, and make_word_cloud calls get_vocab on the "text" column
# again — confirm that applying it twice (and mutating data_train) is intended.
data_train["text"] = get_vocab(data_train["text"])
make_word_cloud(data_train, "positive_wordcloud.png",1)