In [1]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle
import numpy as np

# Seed handed to StratifiedKFold in kFoldResults so the shuffled fold
# assignment is the same on every run.
RANDOM_STATE = 50

In [16]:
# Selects which precomputed text-embedding file the loading cell reads;
# any value other than the two listed makes that cell raise.
TEXT_EMBEDDINGS = 'GLOVE' # one of {'BERT', 'GLOVE'}

In [17]:
# Load the precomputed text embeddings selected by TEXT_EMBEDDINGS into
# `text_features`. Unknown selections are rejected before any file is read.
if TEXT_EMBEDDINGS not in ('BERT', 'GLOVE'):
    raise Exception("Invalid TEXT_EMBEDDINGS")

if TEXT_EMBEDDINGS == 'GLOVE':
    # GloVe features are stored directly as a .npy array.
    textEmbeddingFile = '../glove embeddings/glove_mustard_aug.npy'
    text_features = np.load(textEmbeddingFile)
else:
    # BERT features are stored as a pickled mapping; the rows are stacked in
    # the mapping's own iteration order.
    # NOTE(review): row order is assumed to match the CSV row order used for
    # the labels - confirm against how the pickle was produced.
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open('../bert embeddings/sarcasm_aug_embeddings.pkl', 'rb') as fh:
        text_features_dict = pickle.load(fh)

    text_features = np.array(
        [text_features_dict[key] for key in text_features_dict]
    )

In [18]:
# Read the dataset CSV and convert its 'sarcasm' column to a list of plain
# Python ints; the trailing expression displays how many labels were read.
data = pd.read_csv('../datasets/mustard_dataset/sarcasm_with_id.csv')

labels = list(map(int, data['sarcasm']))
len(labels)

1322

In [19]:
def svm_train(features, labels, C=15.0, gamma="scale", kernel="rbf"):
    """Fit a standardise-then-SVC pipeline on the given training data.

    Parameters
    ----------
    features : array-like
        Training feature rows, passed unchanged to the pipeline's ``fit``.
    labels : array-like
        Target value for each row of ``features``.
    C, gamma, kernel :
        Forwarded to ``svm.SVC``. The defaults reproduce the values that were
        previously hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    Whatever the pipeline's ``fit`` returns (the fitted pipeline).
    """
    clf = make_pipeline(
        # Scaling lives inside the pipeline, so it is fitted only on the data
        # passed to this call and re-applied automatically at predict time.
        StandardScaler(),
        svm.SVC(C=C, gamma=gamma, kernel=kernel),
    )
    return clf.fit(features, labels)


def svm_test(clf, features, labels):
    """Evaluate a fitted classifier and return its classification report.

    ``clf.predict`` is run on ``features`` and the predictions are compared
    with ``labels`` (the ground truth) via ``classification_report``. The
    report is requested with ``output_dict=True``, so the caller receives a
    dictionary of metrics rather than formatted text.
    """
    return classification_report(
        labels,
        clf.predict(features),
        digits=3,
        output_dict=True,
    )

In [20]:
from sklearn.model_selection import StratifiedKFold

def kFoldResults(features, n_splits=5, targets=None):
    """Cross-validate the SVM pipeline with stratified k-fold and print mean scores.

    For every fold a fresh pipeline is trained with ``svm_train`` and scored
    with ``svm_test``; the macro-averaged precision, recall and f1-score plus
    the accuracy are averaged over all folds and printed.

    Parameters
    ----------
    features : array-like
        One feature row per sample.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    targets : array-like, optional
        Class label for each row of ``features``. When omitted, the
        notebook-level ``labels`` variable is used, which is what this
        function always read before the parameter existed.

    Returns
    -------
    None. The four averaged metrics are printed, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``features`` and the targets do not have the same length.
    """
    # Fall back to the notebook global only when the caller gives no targets,
    # so existing calls keep working while new calls can be explicit.
    if targets is None:
        targets = labels

    # Arrays let each fold be selected with the index arrays directly instead
    # of rebuilding Python lists element by element.
    features = np.asarray(features)
    targets = np.asarray(targets)
    if len(features) != len(targets):
        raise ValueError(
            "features and targets must have the same length, got "
            f"{len(features)} and {len(targets)}"
        )

    kf = StratifiedKFold(n_splits, shuffle=True, random_state=RANDOM_STATE)

    results = []

    for train_index, test_index in kf.split(features, targets):
        clf = svm_train(features[train_index], targets[train_index])
        report = svm_test(clf, features[test_index], targets[test_index])
        macro = report['macro avg']
        results.append([
            macro['precision'],
            macro['recall'],
            macro['f1-score'],
            report['accuracy']
        ])

    # Column order in `res`: precision, recall, f1-score, accuracy.
    res = np.mean(np.array(results), axis=0)
    print("Precision:",round(res[0], 4))
    print("recall:",round(res[1], 4))
    print("accuracy:",round(res[3], 4))
    print("f1-score:",round(res[2], 4))

In [21]:
# Cross-validate using only the text embeddings in `text_features`;
# kFoldResults prints the fold-averaged metrics itself.
print("Text Only")
kFoldResults(text_features)

Text Only
Precision: 0.7764
recall: 0.7746
accuracy: 0.7746
f1-score: 0.7742
