In [1]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
# Download the NLTK resources one after another (same order as before).
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

lem = WordNetLemmatizer()

# Seed passed to sklearn's shuffle when the train/validation split is built.
RANDOM_STATE = 50
# Reserved token ids; real vocabulary ids start after these.
PAD_ID = 0
UNK_ID = 1
# Maximum number of tokens kept per utterance.
MAX_LEN = 20

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kusha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kusha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kusha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def clean_string(s):
    """Normalise a raw utterance into lower-cased, space-separated tokens.

    A fixed sequence of regex substitutions is applied in order: punctuation
    is spaced out, parenthesised numbers are dropped, characters outside a
    small allowed set become spaces, common English contractions are split
    off (e.g. "don't" -> "do n't"), and runs of whitespace are collapsed.
    The order matters because later patterns operate on the output of
    earlier ones.

    Args:
        s: The raw text.

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    # (pattern, replacement) pairs, applied strictly top to bottom.
    substitutions = (
        (r'(?<=[^\s0-9])(?=[.,;?])', r' '),
        (r'\((\d+)\)', r''),
        (r'\s\s', ' '),
        (r"[^A-Za-z0-9(),!?\'`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\"", " \" "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
        (r"\.", " . "),
        (r"., ", " , "),
        (r"\\n", " "),
    )

    text = s
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip().lower()

def create_train_valid(features,labels,video_features,train_fraction = 0.7,max_valid=500):
    """Shuffle three parallel sequences together and split them into train/validation parts.

    The sequences are shuffled jointly with ``RANDOM_STATE`` as the seed. The
    split index is the larger of ``int(train_fraction * n)`` and
    ``n - max_valid``, so the validation part never exceeds ``max_valid`` items.

    Args:
        features: Text features, one entry per sample.
        labels: Labels, one entry per sample.
        video_features: Video features, one entry per sample.
        train_fraction: Minimum fraction of samples placed in the training part.
        max_valid: Upper bound on the number of validation samples.

    Returns:
        Tuple ``(train_features, valid_features, train_labels, valid_labels,
        video_train_features, video_valid_features)``, each a numpy array.
    """
    shuffled_text, shuffled_labels, shuffled_video = shuffle(
        features, labels, video_features, random_state=RANDOM_STATE
    )

    n_samples = len(shuffled_labels)
    split_at = max(int(train_fraction * n_samples), n_samples - max_valid)

    def _split(sequence):
        # Head goes to training, tail to validation.
        return np.asarray(sequence[:split_at]), np.asarray(sequence[split_at:])

    train_features, valid_features = _split(shuffled_text)
    video_train_features, video_valid_features = _split(shuffled_video)
    train_labels, valid_labels = _split(shuffled_labels)

    return (
        train_features,
        valid_features,
        train_labels,
        valid_labels,
        video_train_features,
        video_valid_features,
    )

In [3]:
import pickle
import numpy as np

# NOTE: pickle.load can execute arbitrary code; only load feature files from a trusted source.
with open('../../video_features/resnet_features.pkl', 'rb') as f:
    video_features_dict = pickle.load(f)

# Collapse each entry to a single vector by averaging along its first axis
# (presumably one row per frame; verify against the feature extractor).
video_features_dict = {
    key: np.mean(value, axis=0) for key, value in video_features_dict.items()
}

# Shape of the first averaged entry, as a quick sanity check.
next(iter(video_features_dict.values())).shape

(2048,)

In [4]:
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')

ids = list(data['id'])

# Each id minus its last two characters is the key into video_features_dict
# (presumably a per-utterance suffix on the clip id; confirm against the CSV).
video_features = np.array([video_features_dict[sample_id[:-2]] for sample_id in ids])
video_features.shape

(1322, 2048)

In [8]:
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')

# Raw utterance text and its sarcasm label, as parallel Python lists.
features, labels = (data[column].to_list() for column in ('utterance', 'sarcasm'))

# Jointly shuffle and split the text, labels and video features.
(
    X_train,
    X_valid,
    y_train,
    y_valid,
    video_train_features,
    video_valid_features,
) = create_train_valid(features, labels, video_features)

training_dict = dict(X_train=X_train, X_valid=X_valid, y_train=y_train, y_valid=y_valid)

In [9]:
# Sanity check: number of utterances read from the CSV.
len(features)

1322

In [11]:
# Load the GloVe table as strings: column 0 is the token, the remaining
# columns are the components of its embedding vector.
glove_loc = '../../glove.6B.100d.txt'
glove = np.loadtxt(glove_loc,dtype='str',comments=None,encoding='utf-8')

# astype() already allocates a fresh float array for the vectors.
vectors = glove[:,1:].astype('float')
# Basic slicing returns a view that keeps the whole string table alive, so
# take an explicit copy; otherwise `del glove` below frees nothing.
words = glove[:,0].copy()

del glove

In [12]:
# Every distinct token seen in the cleaned, tokenised utterances, kept in
# first-seen order. The value 1 is only a placeholder; the keys are what matter.
vocab = {
    token: 1
    for utterance in features
    for token in nltk.word_tokenize(clean_string(utterance))
}

In [13]:
# Token -> GloVe vector lookup.
word_lookup = dict(zip(words, vectors))

word_index = {}   # token -> integer id
embeds = {}       # integer id -> embedding vector (only for tokens found in GloVe)
ind = 2           # ids 0 and 1 are reserved for PAD_ID / UNK_ID
not_found = 0

for word in vocab:
    vector = word_lookup.get(word)

    if vector is None:
        # Tokens missing from GloVe all share the UNK id and get no embedding.
        not_found += 1
        word_index[word] = UNK_ID
        continue

    word_index[word] = ind
    embeds[ind] = np.copy(vector)
    ind += 1

print(f'{not_found} words not found.')

47 words not found.


In [14]:
def word_to_index(utt1):
    """Embed an utterance as the mean embedding of its known tokens.

    The utterance is cleaned, tokenised and mapped to ids via ``word_index``.
    Only the first ``MAX_LEN`` tokens are considered, and only ids that have
    an entry in ``embeds`` contribute to the mean (unknown tokens are skipped).

    Args:
        utt1: Raw utterance text.

    Returns:
        A 1-D float array: the mean of the contributing embeddings, or a zero
        vector of the embedding size when no token has an embedding. The zero
        vector replaces the previous unseeded random fallback, so the features
        are reproducible across runs.
    """
    token_ids = [word_index.get(word,UNK_ID) for word in nltk.word_tokenize(clean_string(utt1))]

    # Truncate before filtering so the MAX_LEN window counts unknown tokens too,
    # exactly as before. Padding is unnecessary because pad ids never had an
    # embedding and were always filtered out.
    known_vectors = [embeds[i] for i in token_ids[:MAX_LEN] if i in embeds]

    if not known_vectors:
        # Check emptiness explicitly instead of letting np.mean([]) emit a
        # RuntimeWarning and return NaN. Take the size from the stored
        # embeddings rather than hard-coding it (100 if none are stored).
        embed_dim = len(next(iter(embeds.values()))) if embeds else 100
        return np.zeros(embed_dim)

    return np.mean(known_vectors, axis=0)


In [15]:
# Embed from the untouched raw splits (X_train / X_valid) rather than from
# training_dict itself, so re-running this cell never feeds already-embedded
# vectors back into word_to_index.
training_dict['X_train'] = [word_to_index(utterance) for utterance in X_train]
training_dict['X_valid'] = [word_to_index(utterance) for utterance in X_valid]

In [16]:
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import KFold

In [17]:
def svm_train(features,labels):
    """Fit a standardise-then-RBF-SVM pipeline and return it.

    Args:
        features: Feature matrix, one row per sample.
        labels: Target labels aligned with ``features``.

    Returns:
        The fitted scikit-learn pipeline (StandardScaler followed by SVC).
    """
    classifier = svm.SVC(kernel="rbf", C=10.0, gamma="scale")
    pipeline = make_pipeline(StandardScaler(), classifier)
    pipeline.fit(features, labels)
    return pipeline

In [18]:
def svm_test(clf,features,labels):
    """Print the confusion matrix and classification report for a fitted classifier.

    Args:
        clf: Fitted estimator exposing ``predict``.
        features: Feature matrix to predict on.
        labels: Ground-truth labels aligned with ``features``.
    """
    predictions = clf.predict(features)

    # Confusion matrix first, then the per-class report with 3 decimals.
    print(confusion_matrix(labels, predictions))
    print(classification_report(labels, predictions, digits=3))

In [19]:
# Fused features: each row is the video vector followed by the text vector.
training_dict2 = {
    'X_train': np.concatenate(
        [video_train_features, np.array(training_dict['X_train'])], axis=1
    ),
    'X_valid': np.concatenate(
        [video_valid_features, np.array(training_dict['X_valid'])], axis=1
    ),
    # Labels are shared with the text-only split.
    'y_train': training_dict['y_train'],
    'y_valid': training_dict['y_valid'],
}

In [20]:
# Fit the SVM on the concatenated video + text training features ...
clf = svm_train(training_dict2['X_train'],training_dict2['y_train'])

# ... and print its confusion matrix and classification report on the validation split.
svm_test(clf,training_dict2['X_valid'],training_dict2['y_valid'])

[[192  14]
 [ 18 173]]
              precision    recall  f1-score   support

       False      0.914     0.932     0.923       206
        True      0.925     0.906     0.915       191

    accuracy                          0.919       397
   macro avg      0.920     0.919     0.919       397
weighted avg      0.920     0.919     0.919       397



In [18]:
# 10-fold cross-validation of the text-only SVM over the full dataset
# (unshuffled, contiguous folds, as before).
kf = KFold(n_splits=10)

# Embed every utterance once up front. Previously each utterance was
# re-tokenised and re-embedded in every fold, which is 10x redundant work.
all_x = np.asarray([word_to_index(utterance) for utterance in features])
all_y = np.asarray(labels)

for train_index, test_index in kf.split(all_x):
    train_x, train_y = all_x[train_index], all_y[train_index]
    test_x, test_y = all_x[test_index], all_y[test_index]
    clf = svm_train(train_x,train_y)
    svm_test(clf,test_x,test_y)


[[53 11]
 [ 7 67]]
              precision    recall  f1-score   support

           0      0.883     0.828     0.855        64
           1      0.859     0.905     0.882        74

    accuracy                          0.870       138
   macro avg      0.871     0.867     0.868       138
weighted avg      0.870     0.870     0.869       138

[[56 17]
 [ 7 58]]
              precision    recall  f1-score   support

           0      0.889     0.767     0.824        73
           1      0.773     0.892     0.829        65

    accuracy                          0.826       138
   macro avg      0.831     0.830     0.826       138
weighted avg      0.834     0.826     0.826       138

[[33  7]
 [15 83]]
              precision    recall  f1-score   support

           0      0.688     0.825     0.750        40
           1      0.922     0.847     0.883        98

    accuracy                          0.841       138
   macro avg      0.805     0.836     0.816       138
weighted avg     

In [None]:
# clf = svm_train(training_dict['X_train'],training_dict['y_train'])

# svm_test(clf,training_dict['X_valid'],training_dict['y_valid'])

[[151  44]
 [ 53 167]]
              precision    recall  f1-score   support

           0      0.740     0.774     0.757       195
           1      0.791     0.759     0.775       220

    accuracy                          0.766       415
   macro avg      0.766     0.767     0.766       415
weighted avg      0.767     0.766     0.766       415

