In [2]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lem = WordNetLemmatizer()
RANDOM_STATE = 50
UNK_ID = 1
PAD_ID = 0
MAX_LEN = 20

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def clean_string(s):    
    s =  re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', s)
    s = re.sub(r'\((\d+)\)', r'', s)
    s = re.sub(r'\s\s', ' ', s)
    s = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", s)
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\"", " \" ", s)
    s = re.sub(r"\(", " ( ", s)
    s = re.sub(r"\)", " ) ", s)
    s = re.sub(r"\?", " ? ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\.", " . ", s)
    s = re.sub(r"., ", " , ", s)
    s = re.sub(r"\\n", " ", s)
    return s.strip().lower()

def create_train_valid(features,labels,train_fraction = 0.7,max_valid=1000):

    features,labels = shuffle(features,labels,random_state = RANDOM_STATE)

    train_end = max(int(train_fraction*len(labels)),len(labels)-max_valid)

    train_features = np.asarray(features[:train_end])
    valid_features = np.asarray(features[train_end:])

    train_labels = np.asarray(labels[:train_end])
    valid_labels = np.asarray(labels[train_end:])
    
    return train_features,valid_features,train_labels,valid_labels

In [7]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 54.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [8]:
import time
import torch
import numpy as np
from tqdm import  trange
from transformers import BertModel, BertTokenizer, BertConfig

class BertSentenceEncoder():
    def __init__(self, model_name='bert-base-cased'):
        '''
        Parameters
        ----------
        model_name : string, optional
            DESCRIPTION. The default is 'bert-base-cased'.
            
            Find a list of usable pre-trained bert models from:
                https://huggingface.co/transformers/pretrained_models.html
        '''

        self.model_name =   model_name
        self.config =       BertConfig.from_pretrained(self.model_name, output_hidden_states=True, training=True)
        self.model =        BertModel.from_pretrained(self.model_name, config=self.config)
        self.tokenizer =    BertTokenizer.from_pretrained(self.model_name, do_lower_case=False)
        self.pooling_methods = ['max', 'mean', 'max-mean']
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        # freeze parameters
        self.model.requires_grad_(False)        
        # move model to gpu , if one available:
        if torch.cuda.is_available():
            self.model.cuda()
            
    def __repr__(self):
        return 'BertSentenceEncoder model:{}'.format(self.model_name)
    
    def _mean_pooler(self, encoding):
        return encoding.mean(dim=1)
    
    def _max_pooler(self, encoding):
        return encoding.max(dim=1).values
    
    def _max_mean_pooler(self, encoding):
        return torch.cat((self._max_pooler(encoding), self._mean_pooler(encoding)), dim=1)
    
    def _pooler(self, encodings, pooling_method):
        '''
        Pools the encodings along the time/sequence axis according
        to one of the pooling method:
            - 'max'      :  max value along the sequence/time dimension
                            returns a (batch_size x hidden_size) shaped tensor
            - 'mean'     :  mean of the values along the sequence/time dimension
                            returns a (batch_size x hidden_size) shaped tensor
            - 'max-mean' :  max and mean values along the sequence/time dimension appended
                            returns a (batch_size x 2*hidden_size) shaped tensor
                            [ max : mean ]
        Parameters
        ----------
        encoding : list of tensor to pool along the sequence/time dimension.
        
        pooling_method : one of 'max', 'mean' or 'max-mean'
        
        Returns
        -------
        tensor of shape (batch_size x hidden_size).
        '''
        
        assert (pooling_method in self.pooling_methods), \
            "pooling methods needs to be one of 'max', 'mean' or 'max-mean'"
            
        if pooling_method   == 'max':       pool_fn = self._max_pooler
        elif pooling_method == 'mean':      pool_fn = self._mean_pooler
        elif pooling_method == 'max-mean':  pool_fn = self._max_mean_pooler
        
        pooled = pool_fn(encodings)
        
        return pooled
    

    
    def encoder(self, sentences, layer=-2, pooling_method = None, max_length=40 ):
     
        assert isinstance(sentences, list), \
            "parameter 'sentences' is supposed to be a list of string/s"
        assert all(isinstance(x, str) for x in sentences), \
            "parameter 'sentences' must contain strings only"
        
        '''
        model(input_tokens) returns a tuple of 3 elements.
        out[0] : last_hidden_state  of shape [ B x T x D ]
        out[1] : pooler_output      of shape [ B x D ]
        out[2] : hidden_states      13 tuples, one for each hidden layer
                                    each tuple of shape [ B x T x D ]        
        '''
        with torch.no_grad():
            input_ids = self.tokenizer.batch_encode_plus(sentences, return_tensors='pt', max_length=max_length, pad_to_max_length=True)['input_ids']
            input_ids = input_ids.to(self.device)
            encoded = self.model(input_ids)
                    
        if pooling_method in self.pooling_methods:
            pooled = self._pooler(encoded[2][layer], pooling_method)
            return pooled
        
        return encoded


def get_BE_batched(sentences, batch_size, BE=None):
    assert(BE), "Provide a BertSentenceEncoder object."
    l = len(sentences)
    embeddings = np.empty((0,768))    
    num_batches = int(l/batch_size) if l%batch_size==0 else int(l/batch_size)+1
    
    t = trange(num_batches, desc='Batch', leave=True)

    for i in t:
        # get start and end index for this batch
        if( i != int(l/batch_size) ):
            start   = (i*batch_size)
            end     = (i*batch_size)+batch_size   
        else:
            start   = int(l/batch_size)*batch_size
            end     = l
        t.set_description('Embedding batch => {} : {}'.format(start, end))
    
        s = time.time()
        batch_embeddings = BE.encoder(sentences[start:end], layer = -2, pooling_method='mean')
        e = time.time()    
        print("Time elapsed: {} seconds.".format(e-s))
        batch_embeddings = batch_embeddings.cpu().numpy()
        embeddings = np.append(embeddings, batch_embeddings, axis=0)
        
    return embeddings

In [9]:
data = pd.read_csv('/content/drive/MyDrive/btp/datasets/extended_mustard_aug.csv').dropna(axis=0,how='any')

features = data['text'].to_list()
labels = data['sarcasm'].to_list()


BE = BertSentenceEncoder(model_name='bert-base-uncased')

embeddings = []

for l in range(1,6):
    word_encodings = BE.encoder(features, layer = -l, pooling_method = 'mean')
    embeddings.append(word_encodings)

embeddings2 = np.zeros((len(embeddings),len(embeddings[0]),len(embeddings[0][0])))
for i in range(len(embeddings)):
  for j in range(len(embeddings[i])):
    embeddings2[i][j] = embeddings[i][j].numpy()

print(embeddings2.shape)

meaned = np.mean(embeddings2, axis=0)

x_train, x_valid, y_train, y_valid = create_train_valid(meaned,labels)


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(5, 2278, 768)


In [10]:
import pickle

with open('./extended_aug_embeddings','wb') as f: pickle.dump(embeddings2, f)

In [14]:
def svm_train(features,labels):
    clf = make_pipeline(
        StandardScaler(),
        svm.SVC(C=15.0, gamma="scale", kernel="rbf")
    )
    return clf.fit(features, labels)

In [12]:
def svm_test(clf,features,labels):
    pred = clf.predict(features)
    true = labels

    result_string = classification_report(true, pred, digits=3)
    print(confusion_matrix(true, pred))
    print(result_string)

In [15]:
clf = svm_train(x_train,y_train)

svm_test(clf,x_valid,y_valid);

[[284  53]
 [ 95 252]]
              precision    recall  f1-score   support

         0.0      0.749     0.843     0.793       337
         1.0      0.826     0.726     0.773       347

    accuracy                          0.784       684
   macro avg      0.788     0.784     0.783       684
weighted avg      0.788     0.784     0.783       684

