In [14]:
import re
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Ordered (pattern, replacement) rules used to expand English contractions.
# Order matters: the specific forms ("won't", "can't") must run before the
# generic "n't" rule, and "n't" must run before the bare "'t" rule.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


# Function to pre-process text
def preprocess(phrase):
    '''Normalise a raw comment before it is tokenized.

    Steps, in order:
      1. lower-case the text;
      2. expand contractions using _CONTRACTION_RULES
         (e.g. "can't" -> "can not", "they're" -> "they are");
      3. delete every character that is neither a word character nor
         whitespace, then strip leading/trailing whitespace.

    Note that "'s" is always expanded to " is", so possessives are affected
    too ("john's car" -> "john is car"), and that underscores survive step 3
    because the regex class \\w includes "_".

    Parameters
    ----------
    phrase : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned, lower-cased comment.
    '''
    phrase = phrase.lower()

    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)

    # Raw string: a plain '[^\w\s]' literal contains invalid escape sequences
    # and triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'[^\w\s]', '', phrase).strip()

In [16]:
# Load the pickled tokenizer object into the notebook-global `t`; func1 and func2
# call t.texts_to_sequences on it to turn comments into integer sequences.
# Presumably this is the tokenizer fitted during training — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load this file when it comes from a trusted source.
# NOTE(review): the path is a hard-coded absolute location (looks like a mounted
# Google Drive); the cell fails on a fresh kernel if that path is not available.
with open('/content/drive/MyDrive/Case_Study_2/tokenizer.pkl', 'rb') as f:
    t = pickle.load(f)

In [17]:
# Load the saved Keras model from its HDF5 file into the notebook-global `model`;
# func1 and func2 call model.predict on it.
# Judging by the file name this is a bidirectional LSTM — TODO confirm.
# NOTE(review): hard-coded absolute path; the cell fails on a fresh kernel if
# that location is not available.
model = load_model('/content/drive/MyDrive/Case_Study_2/bi_lstm_model1.hdf5')

In [18]:
def func1(s, threshold=0.5):
    '''Classify a single comment as sarcastic or not sarcastic.

    The comment is cleaned with preprocess(), converted to an integer
    sequence with the global tokenizer `t`, padded/truncated at the end to
    30 tokens and scored with the global `model`. The first value of the
    flattened model output is treated as the probability that the comment
    is sarcastic (assumes the model emits a single probability per sample
    — TODO confirm).

    Parameters
    ----------
    s : str
        The raw comment text.
    threshold : float, optional
        Decision boundary on the predicted probability. Scores greater
        than or equal to it are reported as sarcastic. Defaults to 0.5,
        the value that was previously hard-coded.

    Returns
    -------
    str
        A sentence stating the predicted class and its confidence in
        percent: the predicted probability for "sarcastic", or one minus
        it for "not sarcastic", rounded to two decimals.
    '''

    # The tokenizer works on a list of texts, so wrap the cleaned comment
    inp_str = [preprocess(s)]

    # Tokenize input string
    encoded_str = t.texts_to_sequences(inp_str)

    # Padding input sequence to have length of 30
    # (shorter inputs are zero-padded at the end, longer ones are cut at the end)
    padded_str = pad_sequences(encoded_str, maxlen=30, dtype='int32',
                               padding='post', truncating='post', value=0.0)

    # Prediction on the padded input sequence; verbose=0 suppresses the
    # progress bar, which is just noise for a single sample
    pred = model.predict(padded_str, verbose=0).flatten()[0]

    # `pred` is already a scalar, so compare it directly with the threshold
    if pred >= threshold:
        prob = round(pred * 100, 2)
        return 'The above comment is sarcastic with {} % confidence'.format(prob)

    prob = round((1 - pred) * 100, 2)
    return 'The above comment is not sarcastic with {} % confidence'.format(prob)

In [19]:
# Prediction on a sample input string
func1('hahaha so funny')

'The above comment is sarcastic with 97.46 % confidence'

In [20]:
# Prediction on a second sample input string
func1('I liked the film')

'The above comment is not sarcastic with 78.49 % confidence'

In [21]:
def func2(X, Y, threshold=0.5):
    '''Evaluate the global model on a set of comments and print the metrics.

    The comments are converted to integer sequences with the global
    tokenizer `t`, padded/truncated at the end to 30 tokens and scored with
    the global `model`. Scores are turned into 0/1 labels with `threshold`
    and compared with the true labels; the resulting classification report
    (precision, recall, F1 score and accuracy) is printed.

    Unlike func1, this function does NOT run preprocess() on the comments;
    they are assumed to be cleaned already (the notebook feeds it data read
    from pre_processed2.csv) — TODO confirm.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or sequence of str
        The comments. For a DataFrame the first column is used (the
        original behaviour); any other iterable of strings is used as is.
    Y : array-like
        True labels, one per comment, in the same order as X.
    threshold : float, optional
        Decision boundary on the predicted score. Scores greater than or
        equal to it are labelled 1. Defaults to 0.5, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The report is printed, not returned.
    '''

    # A DataFrame keeps its comments in the first column; a Series, list or
    # array of strings can be handed to the tokenizer directly.
    comments = X.iloc[:, 0] if isinstance(X, pd.DataFrame) else X

    X_encoded = t.texts_to_sequences(comments)
    X_padded = pad_sequences(X_encoded, maxlen=30, dtype='int32',
                             padding='post', truncating='post', value=0.0)

    # Score every comment, then threshold the scores into 0/1 labels
    pred = model.predict(X_padded)
    pred = np.where(pred >= threshold, 1, 0)

    print("Classification Report")
    print()
    print(classification_report(Y, pred))

In [22]:
# Read only the two columns this notebook needs from the pre-processed data set.
df = pd.read_csv(
    '/content/drive/MyDrive/Case_Study_2/pre_processed2.csv',
    usecols=['comment', 'label'],
)

# Target vector, and a feature frame holding everything except the target.
Y = df['label']
X = df.drop(columns=['label'])

# Stratified 80/20 split with a fixed seed.
# NOTE(review): presumably this reproduces the split used at training time;
# confirm, otherwise the "test" rows may have been seen by the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, stratify=Y, random_state=42
)

In [23]:
# Evaluation on train data: print the classification report for the 80 %
# training portion produced by train_test_split.
func2(X_train, y_train)

Classification Report

              precision    recall  f1-score   support

           0       0.76      0.80      0.78    403621
           1       0.79      0.74      0.76    404163

    accuracy                           0.77    807784
   macro avg       0.77      0.77      0.77    807784
weighted avg       0.77      0.77      0.77    807784



In [24]:
# Evaluation on test data: print the classification report for the 20 %
# test portion produced by train_test_split.
func2(X_test, y_test)

Classification Report

              precision    recall  f1-score   support

           0       0.72      0.76      0.74    100906
           1       0.75      0.71      0.73    101041

    accuracy                           0.74    201947
   macro avg       0.74      0.74      0.74    201947
weighted avg       0.74      0.74      0.74    201947

