In [1]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lem = WordNetLemmatizer()
RANDOM_STATE = 50
UNK_ID = 1
PAD_ID = 0
MAX_LEN = 20

2022-07-21 16:02:24.611163: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-21 16:02:24.611183: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vaibhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vaibhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vaibhav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vaibhav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def clean_string(s):    
    s =  re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', s)
    s = re.sub(r'\((\d+)\)', r'', s)
    s = re.sub(r'\s\s', ' ', s)
    s = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", s)
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\"", " \" ", s)
    s = re.sub(r"\(", " ( ", s)
    s = re.sub(r"\)", " ) ", s)
    s = re.sub(r"\?", " ? ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\.", " . ", s)
    s = re.sub(r"., ", " , ", s)
    s = re.sub(r"\\n", " ", s)
    return s.strip().lower()

def create_train_valid(features,labels,train_fraction = 0.7,max_valid=500):

    features,labels = shuffle(features,labels,random_state = RANDOM_STATE)

    train_end = max(int(train_fraction*len(labels)),len(labels)-max_valid)

    train_features = np.asarray(features[:train_end])
    valid_features = np.asarray(features[train_end:])

    train_labels = np.asarray(labels[:train_end])
    valid_labels = np.asarray(labels[train_end:])
    
    return train_features,valid_features,train_labels,valid_labels

In [3]:
data = pd.read_csv('../datasets/temp.csv')

features = data['text'].to_list()
features = [str(word) for word in features]
labels = data['sarcasm'].to_list()

X_train, X_valid, y_train, y_valid = create_train_valid(features,labels)
training_dict = {'X_train': X_train, 'X_valid': X_valid,'y_train': y_train, 'y_valid': y_valid}



In [4]:
len(features)

990

In [5]:
glove_loc = '../glove.6B.100d.txt'
glove = np.loadtxt(glove_loc,dtype='str',comments=None)

vectors = glove[:,1:].astype('float')
words = glove[:,0]

del glove

In [6]:
vocab = dict()

for sentences in features:
    sentences = clean_string(sentences)
    sentences = nltk.word_tokenize(sentences)
    for word in sentences:
        vocab[word] = 1

In [7]:
word_lookup = {word:vector for word,vector in zip(words,vectors)}

word_index = dict()
ind = 2
not_found = 0

embeds = dict()

for i,word in enumerate(vocab.keys()):
    vector = word_lookup.get(word,None)

    if vector is not None:
        word_index[word] = ind
        embeds[ind] = np.copy(vector)
        ind+=1
    else:
        not_found +=1
        word_index[word] = UNK_ID

print(f'{not_found} words not found.')

153 words not found.


In [8]:
def word_to_index(utt1):
    utt2 = [word_index.get(word,UNK_ID) for word in nltk.word_tokenize(clean_string(utt1))]
    utt3 = utt2[:MAX_LEN]
    utt4 = utt3 + [PAD_ID]*(MAX_LEN - len(utt3))
    utt5 = np.mean([embeds[i] for i in utt4 if i>1],axis=0)
    if type(utt5) == np.float64:
        utt5 = np.random.rand(100)
    if type(utt5) == np.float64:
        print('here')
    return utt5


In [9]:
training_dict['X_train'] = [word_to_index(word) for word in training_dict['X_train']]
training_dict['X_valid'] = [word_to_index(word) for word in training_dict['X_valid']]

In [10]:
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import KFold

In [11]:
def svm_train(features,labels):
    clf = make_pipeline(
        StandardScaler(),
        svm.SVC(C=10.0, gamma="scale", kernel="rbf")
    )
    return clf.fit(features, labels)

In [12]:
def svm_test(clf,features,labels):
    pred = clf.predict(features)
    true = labels

    result_string = classification_report(true, pred, digits=3)
    print(confusion_matrix(true, pred))
    print(result_string)

In [13]:
kf = KFold(n_splits=10)

for train_index, test_index in kf.split(features):
    train_x = [features[index] for index in train_index]
    train_x = [word_to_index(word) for word in train_x]
    train_y = [labels[index] for index in train_index]
    test_x = [features[index] for index in test_index]
    test_x = [word_to_index(word) for word in test_x]
    test_y = [labels[index] for index in test_index]
    clf = svm_train(train_x,train_y)
    svm_test(clf,test_x,test_y)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0  0]
 [ 1 98]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         0
           1      1.000     0.990     0.995        99

    accuracy                          0.990        99
   macro avg      0.500     0.495     0.497        99
weighted avg      1.000     0.990     0.995        99



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0  0]
 [ 2 97]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         0
           1      1.000     0.980     0.990        99

    accuracy                          0.980        99
   macro avg      0.500     0.490     0.495        99
weighted avg      1.000     0.980     0.990        99



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0  0]
 [ 3 96]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         0
           1      1.000     0.970     0.985        99

    accuracy                          0.970        99
   macro avg      0.500     0.485     0.492        99
weighted avg      1.000     0.970     0.985        99

[[16 28]
 [11 44]]
              precision    recall  f1-score   support

           0      0.593     0.364     0.451        44
           1      0.611     0.800     0.693        55

    accuracy                          0.606        99
   macro avg      0.602     0.582     0.572        99
weighted avg      0.603     0.606     0.585        99

[[17 33]
 [ 8 41]]
              precision    recall  f1-score   support

           0      0.680     0.340     0.453        50
           1      0.554     0.837     0.667        49

    accuracy                          0.586        99
   macro avg      0.617     0.588     0.560        99
weighted avg     

In [15]:
clf = svm_train(training_dict['X_train'],training_dict['y_train'])

svm_test(clf,training_dict['X_valid'],training_dict['y_valid'])

[[ 49  43]
 [ 41 164]]
              precision    recall  f1-score   support

           0      0.544     0.533     0.538        92
           1      0.792     0.800     0.796       205

    accuracy                          0.717       297
   macro avg      0.668     0.666     0.667       297
weighted avg      0.716     0.717     0.716       297

