<a href="https://colab.research.google.com/github/vsoos/CloudComputing/blob/main/ex3/3_notebook3_CPU/3_3_cpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Combining models
**Feedback tone + Sarcasm detection**

In [None]:
!pip install tensorflow==2.9.1 transformers==4.20.1

## Sentiment Analysis

In [None]:
import os
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm # Progress Bar
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import transformers
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import warnings
from transformers import logging as hf_logging
hf_logging.set_verbosity_error() # Hidding Huggingface Warnings
warnings.filterwarnings("ignore")

In [None]:
# Human-readable class names. `sentiment_labels` is indexed with the argmax of
# the sentiment model's 3-way output further down in the notebook.
sentiment_labels = [
    "Negative",
    "Neutral",
    "Positive",
]
# NOTE(review): presumably index 0/1 of the sarcasm label; not referenced in
# the visible cells.
sarcasm_labels = [
    "Normal",
    "Sarcastic",
]

In [None]:
# Locations of the saved weight files (.h5) for the two models. Both live under
# the same exercise folder on the mounted Drive, in the notebook-1 and
# notebook-2 sub-folders respectively.
_EX3_DIR = "/content/drive/MyDrive/cloudcomputing2023_VincenzinaSoos/ex3"
sentiment_model_file = _EX3_DIR + "/3_notebook1_GPU/data/sentiment_model.h5"
sarcasm_model_file = _EX3_DIR + "/3_notebook2_GPU/sarcasm_model.h5"

In [None]:
# Fixed number of token positions per input; used both for the Keras input
# shapes and as the pad/truncate length when encoding text.
MAX_LENGTH = 235
# Hugging Face checkpoint name shared by the config, the encoder and the tokenizer.
MODEL_NAME = "distilbert-base-cased"

In [None]:
# Pretrained DistilBERT encoder. The config flags make it also return the
# per-layer hidden states and attention maps (visible in the model summary).
config = DistilBertConfig.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True,
    output_attentions=True,
    return_dict=True,
)
DistilBERT = TFDistilBertModel.from_pretrained(MODEL_NAME, config=config)

keras_layers = tf.keras.layers

# Two fixed-length int32 inputs: the token ids and the matching attention mask.
input_ids_in = keras_layers.Input(shape=(MAX_LENGTH,), name='input_token', dtype='int32')
attention_mask_in = keras_layers.Input(shape=(MAX_LENGTH,), name='masked_token', dtype='int32')

# Element 0 of the encoder output is the last hidden state.
embedding_layer = DistilBERT(input_ids=input_ids_in, attention_mask=attention_mask_in)[0]

# Classification head: BiLSTM over the token sequence -> max over time ->
# dense -> dropout -> 3-way softmax (one unit per entry of sentiment_labels).
sequence_features = keras_layers.Bidirectional(
    keras_layers.LSTM(128, return_sequences=True)
)(embedding_layer)
pooled_features = keras_layers.GlobalMaxPool1D()(sequence_features)
hidden_features = keras_layers.Dense(64, activation='relu')(pooled_features)
hidden_features = keras_layers.Dropout(0.2)(hidden_features)
X = keras_layers.Dense(3, activation='softmax')(hidden_features)

sentiment_model = tf.keras.Model(inputs=[input_ids_in, attention_mask_in], outputs=X)

# Mark the first three layers (the two inputs and the DistilBERT encoder, per
# the summary) as non-trainable; only the head stays trainable.
for frozen_layer in sentiment_model.layers[:3]:
    frozen_layer.trainable = False

sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 235)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 235)]        0           []                               
                                                                                                  
 tf_distil_bert_model_1 (TFDist  TFBaseModelOutput(l  65190912   ['input_token[0][0]',            
 ilBertModel)                   ast_hidden_state=(N               'masked_token[0][0]']           
                                one, 235, 768),                                                   
                                 hidden_states=((No                                           

In [None]:
# Tokenizer belonging to the same checkpoint as the encoder.
# NOTE(review): these keyword options look like encode-time settings; the
# tokenize() helper passes them again on every call, so confirm whether
# from_pretrained actually honours them here.
tokenizer_options = dict(
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    pad_to_max_length=True,
)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME, **tokenizer_options)

def tokenize(sentences, tokenizer):
    """Encode sentences into the two fixed-length arrays the sentiment model takes.

    Each sentence is encoded on its own with special tokens added, then
    truncated or padded to exactly ``MAX_LENGTH`` positions.

    Args:
        sentences: iterable of text strings to encode.
        tokenizer: tokenizer object exposing ``encode_plus`` (the DistilBERT
            tokenizer created above in this notebook).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of int32 NumPy arrays with one
        row of ``MAX_LENGTH`` values per sentence. Empty input yields arrays of
        shape ``(0, MAX_LENGTH)``.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       # `padding='max_length'` is the non-deprecated
                                       # spelling of `pad_to_max_length=True`.
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    # reshape is a no-op for non-empty input; it only keeps the result
    # two-dimensional when `sentences` is empty.
    return (np.asarray(input_ids, dtype='int32').reshape(-1, MAX_LENGTH),
            np.asarray(input_masks, dtype='int32').reshape(-1, MAX_LENGTH))

In [None]:
# Restore the previously trained parameters from the .h5 file into the
# architecture built above (the layer structure must match the saved weights).
sentiment_model.load_weights(sentiment_model_file)

## Sarcasm detection model

In [None]:
# Token-level replacement table used by preprocessing_text: English
# contractions, British spellings and a list of frequent misspellings, each
# mapped to the text that should replace it.
# NOTE(review): lookups are done on whitespace-split tokens, so a key that
# itself contains a space (e.g. 'youtu ') can never match.
mispell_dict = {"ain't": "is not", "cannot": "can not", "aren't": "are not", "can't": "can not", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not",
                "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
                "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would",
                "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "wont": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not",
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color',
                'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
                'theBest': 'the best', 'howdoes': 'how does', 'Etherium': 'Ethereum',
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

# Lower-case both keys and replacements, since preprocessing_text lower-cases
# the text before looking tokens up. Keys that differ only by case
# (e.g. "I'd" / "i'd") collapse into a single entry here.
mispell_dict = {k.lower(): v.lower() for k, v in mispell_dict.items()}

In [None]:
def preprocessing_text(s):
    """Normalise one piece of text for the sarcasm model's tokenizer.

    Steps, in order: convert to ``str``, lower-case and trim; replace every
    whitespace-separated token found in ``mispell_dict`` with its expansion;
    drop newline characters; surround the listed punctuation characters with
    spaces so they become separate tokens; collapse runs of spaces.

    Args:
        s: value to clean; anything is accepted because it is passed through ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(s).lower().strip()

    # Expand contractions / known misspellings one token at a time.
    text = " ".join(mispell_dict.get(token, token) for token in text.split())

    # Strip any newline characters.
    text = text.replace('\n', '')

    # Pad punctuation with spaces on both sides, e.g. "king?" -> "king ? ".
    text = re.sub(r"([?!,+=—&%\'\";:¿।।।|\(\){}\[\]//])", r" \1 ", text)

    # Squeeze two or more consecutive spaces into one and trim the ends.
    return re.sub('[ ]{2,}', ' ', text).strip()

In [None]:
# Load train-balanced-sarcasm.csv, discard the metadata columns that are not
# needed here, and drop rows with missing values. Built as one chain so that
# re-running the cell always starts again from the file on disk.
sarcasm_data = (
    pd.read_csv("/content/drive/MyDrive/cloudcomputing2023_VincenzinaSoos/ex3/3_notebook2_GPU/data/train-balanced-sarcasm.csv")
    .drop(columns=['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment'])
    # remove rows with missing values
    .dropna()
    # Keep the first 950,000 remaining rows; the Keras tokenizer vocabulary is
    # fitted on exactly this slice further down.
    # NOTE(review): presumably this must match the slice used when the saved
    # sarcasm weights were trained - confirm before changing the number.
    .iloc[0:950000]
    # Explicit copy so the column assignment below writes to an independent
    # frame instead of a slice of the loaded data (avoids SettingWithCopy).
    .copy()
)

# Apply the same text cleaning to every comment.
sarcasm_data['comment'] = sarcasm_data['comment'].apply(preprocessing_text)

In [None]:
# Vocabulary cap: passed as `num_words` to the Keras tokenizer and used as the
# row count of the embedding layer.
TOTAL_WORDS = 40_000

# Length every comment is padded/truncated to, in tokens.
MAX_LEN = 50

# Dimensionality of each word vector (the embedding file holds 300-d vectors).
EMBEDDING_SIZE = 300

In [None]:
# keras for deep learning model creation
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.utils import plot_model

In [None]:
import tensorflow as tf

# Report the framework versions this notebook is running against.
for version_label, version_value in (
    ("TensorFlow version:", tf.__version__),
    ("Keras version in TensorFlow:", tf.keras.__version__),
):
    print(version_label, version_value)

TensorFlow version: 2.9.1
Keras version in TensorFlow: 2.9.0


In [None]:
%%time
# Word-level Keras tokenizer for the sarcasm model, limited to the
# TOTAL_WORDS most frequent words of the cleaned comments.
comments = sarcasm_data['comment']
tokenizer2 = Tokenizer(num_words=TOTAL_WORDS)
tokenizer2.fit_on_texts(comments.tolist())

# Integer-encode every comment and pad/truncate to MAX_LEN positions.
train_data = pad_sequences(tokenizer2.texts_to_sequences(comments), maxlen=MAX_LEN)
target = sarcasm_data['label']

CPU times: user 36.6 s, sys: 228 ms, total: 36.8 s
Wall time: 37.2 s


In [None]:
%%time
EMBEDDING_FILE = "/content/drive/MyDrive/cloudcomputing2023_VincenzinaSoos/ex3/3_notebook2_GPU/crawl-300d-2M.vec"

def get_coefs(word, *arr):
    """Return ``(word, vector)`` where the remaining fields become a float32 array."""
    return word, np.asarray(arr, dtype='float32')

# Read the word vectors line by line: "<word> <v1> ... <vN>" separated by
# single spaces. The file is opened in a `with` block so the handle is closed,
# and decoded explicitly as UTF-8 instead of the platform default.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in tqdm(embedding_file):
        fields = line.rstrip().split(' ')
        # Skip anything that is not "word + EMBEDDING_SIZE numbers" - in
        # particular the two-field header line at the top of a .vec file,
        # which would otherwise be stored as a bogus one-element vector.
        if len(fields) != EMBEDDING_SIZE + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# remember to use tokenizer2 (the sarcasm-model tokenizer)
word_index = tokenizer2.word_index
nb_words = min(TOTAL_WORDS, len(word_index))
# Always allocate TOTAL_WORDS rows: the fill loop indexes rows up to
# TOTAL_WORDS - 1 and the Embedding layer is declared with TOTAL_WORDS rows,
# so a smaller matrix would fail whenever the vocabulary is below the cap.
embedding_matrix = np.zeros((TOTAL_WORDS, EMBEDDING_SIZE))

1999996it [04:13, 7875.02it/s] 

CPU times: user 2min 55s, sys: 9.12 s, total: 3min 4s
Wall time: 4min 13s





In [None]:
# Copy the pretrained vector of every in-vocabulary word into its row of the
# embedding matrix; words without a pretrained vector keep their zero row.
for token, token_id in tqdm(word_index.items()):
    if token_id < TOTAL_WORDS:
        pretrained_vector = embeddings_index.get(token)
        if pretrained_vector is not None:
            embedding_matrix[token_id] = pretrained_vector

100%|██████████| 161878/161878 [00:00<00:00, 306490.04it/s]


In [None]:
# Sarcasm classifier: padded word ids -> embedding (initialised from
# embedding_matrix) -> BiLSTM -> max over time -> two dense/dropout stages ->
# single sigmoid unit.
input_layer = Input(shape=(MAX_LEN,))

embedding_layer = Embedding(
    TOTAL_WORDS,
    EMBEDDING_SIZE,
    weights=[embedding_matrix],
)(input_layer)

LSTM_layer = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
maxpool_layer = GlobalMaxPool1D()(LSTM_layer)

# Dense(64)+Dropout(0.5) followed by Dense(32)+Dropout(0.5).
hidden = maxpool_layer
for units in (64, 32):
    hidden = Dense(units, activation="relu")(hidden)
    hidden = Dropout(0.5)(hidden)

output_layer = Dense(1, activation="sigmoid")(hidden)

# Inputs and outputs are passed positionally (no `input=` / `output=`
# keywords, which belonged to the older Keras API).
sarcasm_model = Model(input_layer, output_layer)

sarcasm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
sarcasm_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 300)           12000000  
                                                                 
 bidirectional_1 (Bidirectio  (None, 50, 256)          439296    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dropout_39 (Dropout)        (None, 64)                0   

In [None]:
# Replace the freshly initialised parameters (including the embedding matrix
# set above) with the trained weights stored in the .h5 file.
sarcasm_model.load_weights(sarcasm_model_file)

## Testing the combined model

In [None]:
# Show float arrays in plain fixed-point notation (six decimals) instead of
# scientific notation, so the probabilities printed below are easy to read.
fixed_point_format = '{:f}'.format
np.set_printoptions(formatter={'float_kind': fixed_point_format})

In [None]:
# Hand-written hotel-review sentences used to exercise both models.
test_list = [
    "Excellent breakfast and room service. Highly recommended!",
    "Rude staff and the rooms were not very clean. Also very expensive. Never again!",
    "Okay hotel for a good price, nothing special.",
    "I just totally love it when the rooms are dirty and the AC is not working.",
    "Totally worth the price! Said no one ever. UGH",
]

# Encode the sentences for the sentiment model and get one row of class
# probabilities per sentence.
test_values_sentiment_model = tokenize(test_list, tokenizer)
test_probs = sentiment_model.predict(test_values_sentiment_model)

# Position counter, initialised to the first entry of test_list.
counter = 0

100%|██████████| 5/5 [00:00<00:00, 1057.67it/s]




In [None]:
# Print, for every test sentence, the sentiment probabilities and label from
# the first model together with the sarcasm probability from the second model.
# Sentences and probability rows are paired with zip() rather than an external
# counter initialised in another cell, so this cell can be re-run on its own
# without running past the end of test_list.
for review_text, test_prob in zip(test_list, test_probs):
    print(review_text)
    print(test_prob)
    # Index of the most probable sentiment class.
    tone_index = np.argmax(test_prob)
    print(sentiment_labels[tone_index])

    # Clean and encode the original sentence for the sarcasm model.
    sentence = preprocessing_text(review_text)
    sentence = tokenizer2.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen = MAX_LEN)

    # The model outputs one sigmoid value per input row; take the single value.
    sarcasm_prediction = sarcasm_model.predict(sentence)
    sarcasm_prediction_value = sarcasm_prediction[0][0]

    print(f"Sarcasm%: {round(sarcasm_prediction_value * 100, 1)} %")

    print("---------------------------------")

Excellent breakfast and room service. Highly recommended!
[0.000013 0.000489 0.999498]
Positive
Sarcasm%: 38.9 %
---------------------------------
Rude staff and the rooms were not very clean. Also very expensive. Never again!
[0.836917 0.117928 0.045154]
Negative
Sarcasm%: 53.6 %
---------------------------------
Okay hotel for a good price, nothing special.
[0.490344 0.359617 0.150039]
Negative
Sarcasm%: 25.0 %
---------------------------------
I just totally love it when the rooms are dirty and the AC is not working.
[0.873319 0.092828 0.033853]
Negative
Sarcasm%: 77.7 %
---------------------------------
Totally worth the price! Said no one ever. UGH
[0.053020 0.021234 0.925746]
Positive
Sarcasm%: 99.3 %
---------------------------------


In [None]:
# Predict and evaluate each feedback: combine the sentiment class from the
# first model with the sarcasm decision from the second model.
results = []
for feedback in test_list:
    # Sentiment model: encode the single sentence and take the most probable class.
    processed_feedback = tokenize([feedback], tokenizer)
    sentiment_probs = sentiment_model.predict(processed_feedback)
    sentiment_index = np.argmax(sentiment_probs[0])
    sentiment = sentiment_labels[sentiment_index]

    # Sarcasm model: clean, integer-encode and pad the sentence, then threshold
    # the sigmoid output at 0.5.
    preprocessed_feedback = preprocessing_text(feedback)
    preprocessed_feedback = tokenizer2.texts_to_sequences([preprocessed_feedback])
    preprocessed_feedback = pad_sequences(preprocessed_feedback, maxlen=MAX_LEN)
    sarcasm_prob = sarcasm_model.predict(preprocessed_feedback)[0][0]
    sarcasm = "Yes" if sarcasm_prob > 0.5 else "No"

    # Combination rule: sarcasm flips a positive tone to negative and makes a
    # negative tone a tentative positive; neutral is left as is.
    outcome = "Undefined outcome"
    if sentiment == "Positive":
        outcome = "Negative feedback" if sarcasm == "Yes" else "Positive feedback"
    elif sentiment == "Neutral":
        outcome = "Neutral feedback"
    elif sentiment == "Negative":
        outcome = "Positive feedback?" if sarcasm == "Yes" else "Negative feedback"

    results.append((feedback, sentiment, sarcasm, round(sarcasm_prob * 100, 1), outcome))

# Print results. The per-row sentiment is unpacked as `tone` and must be the
# value that is printed; formatting `sentiment` here would show the label left
# over from the last iteration of the loop above for every row.
for feedback, tone, sarcasm, sarcasm_percent, outcome in results:
    print(f"Feedback: {feedback}\nSentiment: {tone}\nSarcasm: {sarcasm} ({sarcasm_percent}%)\nOutcome: {outcome}\n---------------------------------")

100%|██████████| 1/1 [00:00<00:00, 680.78it/s]




100%|██████████| 1/1 [00:00<00:00, 986.66it/s]




100%|██████████| 1/1 [00:00<00:00, 1205.61it/s]




100%|██████████| 1/1 [00:00<00:00, 1582.76it/s]




100%|██████████| 1/1 [00:00<00:00, 1153.23it/s]


Feedback: Excellent breakfast and room service. Highly recommended!
Sentiment: Positive
Sarcasm: No (38.9%)
Outcome: Positive feedback
---------------------------------
Feedback: Rude staff and the rooms were not very clean. Also very expensive. Never again!
Sentiment: Positive
Sarcasm: Yes (53.6%)
Outcome: Positive feedback?
---------------------------------
Feedback: Okay hotel for a good price, nothing special.
Sentiment: Positive
Sarcasm: No (25.0%)
Outcome: Negative feedback
---------------------------------
Feedback: I just totally love it when the rooms are dirty and the AC is not working.
Sentiment: Positive
Sarcasm: Yes (77.7%)
Outcome: Positive feedback?
---------------------------------
Feedback: Totally worth the price! Said no one ever. UGH
Sentiment: Positive
Sarcasm: Yes (99.3%)
Outcome: Negative feedback
---------------------------------
