1. Few shot learning/one shot learning and semi supervised learning, pair wise learning
- One-shot learning/Few-shot learning: These techniques enable machine learning models to make accurate predictions or classifications with minimal training data, either from a single example (one-shot learning) or a small number of examples (few-shot learning).
- Pairwise learning: Pairwise learning involves training models based on comparisons between pairs of data points, often used in tasks like ranking or preference learning.
- Semi-supervised learning: Semi-supervised learning combines labeled and unlabeled data during training to improve model performance, particularly beneficial when labeled data is limited or expensive to obtain.

2. Siamese triplet loss
- $ L=max(0,d(A,P)−d(A,N)+α) $

Contrastive Loss
$  L = \frac{1}{2N} \sum_{i=1}^{N} \left( y_i \cdot d_i^2 + (1 - y_i) \cdot \max(margin - d_i, 0)^2 \right)  $

In [1]:
import torch
import torch.nn as nn


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

df = pd.read_csv('TalkFile_ner_2.csv.csv').iloc[:300,:]
df['Tag'] = df['Tag'].apply(lambda x: eval(x))

In [3]:
list_all_tag = df.Tag.to_list()

In [4]:
from itertools import chain
list_labels = ['O'] + [i for i in list(set(chain.from_iterable(list_all_tag))) if i !='O']
label2ind = {}
ind2label = {}
for ind,i in enumerate(list_labels):
    label2ind[i]=ind
    ind2label[ind]=i
# df['Sentence'].to_list()
labels_ind_list = df['Tag'].apply(lambda x: 
                [label2ind[i] for i in x]
               ).to_list()

text_list = df['Sentence'].apply(lambda x:x.split(' ')).to_list()

data_dict = {'id':list(range(len(text_list))),'tokens':text_list,'ner_tags':labels_ind_list}


In [5]:
new_df = pd.DataFrame(data_dict)
new_df.head()

Unnamed: 0,id,tokens,ner_tags
0,0,"[Thousands, of, demonstrators, have, marched, ...","[0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 11, 0, 0..."
1,1,"[Families, of, soldiers, killed, in, the, conf...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[They, marched, from, the, Houses, of, Parliam...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 9, 0]"
3,3,"[Police, put, the, number, of, marchers, at, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[The, protest, comes, on, the, eve, of, the, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 3,..."


In [6]:
new_df['tokens'].apply(lambda x: len(x)).max()

41

### T5 Similarity Finder

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [8]:
# Tokenize and encode the input texts
inputs = tokenizer(df['Sentence'].to_list()[:260], return_tensors='pt', padding=True, truncation=True)

# Generate embeddings using T5 model (encoder outputs)
with torch.no_grad():
    outputs = model.encoder(**inputs, output_hidden_states=True)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Use embeddings from the last layer of the encoder

# Calculate cosine similarity using PyTorch's cosine similarity function
similarity = torch.nn.functional.cosine_similarity(embeddings[1,:], embeddings[5,:], dim=0)
similarity

tensor(0.9267)

In [9]:
df_triplet = pd.DataFrame({'anchor':[],'pos':[],'neg':[]})
for i in range(embeddings.shape[0]):
    similarity = torch.nn.functional.cosine_similarity(embeddings[i,:], embeddings[:,:], dim=1)
    pos_q1 = torch.quantile((similarity),0.7)
    pos_q2 = torch.quantile((similarity),0.9)
    neg_q1 = torch.quantile((similarity),0.1)
    neg_q2 = torch.quantile((similarity),0.3)
    pos_ts = ((similarity>=pos_q1)&(similarity<=pos_q2)).nonzero().view(-1)
    neg_ts = ((similarity>=neg_q1)&(similarity<=neg_q2)).nonzero().view(-1)
    df_triplet = df_triplet.append({'anchor':int(i),'pos':pos_ts.tolist(),'neg':neg_ts.tolist()},ignore_index=True)

import random
df_triplet['pos'] = df_triplet['pos'].apply(lambda x: random.sample(x,k=3))
df_triplet['neg'] = df_triplet['neg'].apply(lambda x: random.sample(x,k=3))

neg_list = df_triplet.explode('neg')['neg']
df_triplet = df_triplet.explode('pos')
df_triplet['neg']=neg_list
df_triplet

Unnamed: 0,anchor,pos,neg
0,0.0,79,208
0,0.0,225,17
0,0.0,49,13
1,1.0,31,240
1,1.0,210,225
...,...,...,...
258,258.0,188,87
258,258.0,235,117
259,259.0,63,174
259,259.0,132,253


In [10]:
df_triplet_s = pd.DataFrame({'anchor':[],'pos':[],'neg':[]})
for ind,i in df_triplet.iterrows():
    df_triplet_s = df_triplet_s.append({
        'anchor':df['Sentence'][i[0]],
        'pos':df['Sentence'][i[1]],
        'neg':df['Sentence'][i[2]]
    },ignore_index=True)
df_triplet_s.head(2)

Unnamed: 0,anchor,pos,neg
0,Thousands of demonstrators have marched throug...,Sudan 's government says it will order troops ...,Thai Deputy Interior Minister Sutham Saengprat...
1,Thousands of demonstrators have marched throug...,"There are only about 1,600 pandas still living...",Poor residents often complain they have been c...


In [11]:
from torch.utils.data import Dataset, DataLoader
class TokenizedSentencesDataset(Dataset):
  def __init__(self, sentences, tokenizer, max_length):
      self.tokenizer = tokenizer
      self.sentences = sentences
      self.max_length = max_length


  def __getitem__(self, item):
    anchor_tok= self.tokenizer(self.sentences['anchor'].to_list()[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True, return_tensors='pt',padding='max_length')
    pos_tok= self.tokenizer(self.sentences['pos'].to_list()[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True, return_tensors='pt',padding='max_length')
    neg_tok= self.tokenizer(self.sentences['neg'].to_list()[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True, return_tensors='pt',padding='max_length')
    return anchor_tok,pos_tok,neg_tok

  def __len__(self):
      return len(self.sentences['anchor'].to_list())

In [12]:
max_length = 50
train_dataset = TokenizedSentencesDataset(df_triplet_s, tokenizer, max_length)

dataloader_train = DataLoader(train_dataset,batch_size=50,shuffle=True)


### Train using distilbert and biLSTM

In [13]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')


In [14]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True,bidirectional = True)
    def forward(self, x):
        x = self.embedding(x).squeeze(dim=1)
        _, (h, c) = self.lstm(x)
        return h[-1]

class SiameseNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SiameseNetwork, self).__init__()
        self.encoder = Encoder(input_size, hidden_size, num_layers)
    def forward(self, x1, x2,x3):
        h1 = self.encoder(x1)
        h2 = self.encoder(x2)
        h3 = self.encoder(x3)
        return h1, h2, h3
    def get_embeddings(self, x1):
        anchor = self.encoder(x1)
        return anchor


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim

# Step 1: Define Loss Function
loss_function = nn.TripletMarginLoss(margin=1.0)

# Step 2: Prepare Data (Assuming triplets are prepared)

# Step 3: Instantiate Model and Loss Function
input_size = len(tokenizer)
hidden_size = 768  # Example hidden size
num_layers = 2    # Example number of layers
siamese_model = SiameseNetwork(input_size, hidden_size, num_layers)
optimizer = optim.Adam(siamese_model.parameters(), lr=0.001)

# Step 4: Training Loop
num_epochs = 30
for epoch in range(num_epochs):
    siamese_model.train()
    total_loss = 0.0
    
    
    for batch in dataloader_train:
        anchor, positive, negative = batch
        
        optimizer.zero_grad()
        
        h1, h2, h3 = siamese_model(anchor['input_ids'], positive['input_ids'], negative['input_ids'])
        
        loss = loss_function(h1, h2, h3)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(dataloader_train)}")

IndexError: index out of range in self

In [None]:
for i in siamese_model.named_parameters():
    print(i[0],i[1].shape)

# Reassign the pre-trained weights to new model with linear layer.

In [None]:
class Encoder_lin(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers,num_classes):
        super(Encoder_lin, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True,bidirectional = True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size*2,num_classes)
    def forward(self, x):
        x = self.embedding(x).squeeze(dim=1)
        h,_ = self.lstm(x)
        dropout_seq = self.dropout(h)

        logits = self.linear(dropout_seq)
        
        return logits
num_classes = 17
elin = Encoder_lin(input_size, hidden_size, num_layers,num_classes)


In [None]:
encoder_state_dict = elin.state_dict()
siamese_state_dict = siamese_model.encoder.state_dict()

In [None]:
# Update weights in encoder_lin_model with weights from siamese_state_dict
for name, param in siamese_state_dict.items():
    if name.startswith('lstm') or name.startswith('embedding'):  # Exclude linear layer weights
        print(name)
        encoder_state_dict[name].copy_(param)

In [None]:
elin.load_state_dict(encoder_state_dict)

# NER Downstream Task

In [None]:
# df['Sentence'].to_list()
labels_ind_list = df['Tag'].apply(lambda x: 
                [label2ind[i] for i in x]
               ).to_list()

text_list = df['Sentence'].apply(lambda x:x.split(' ')).to_list()

data_dict = {'id':list(range(len(text_list))),'tokens':text_list,'ner_tags':labels_ind_list}


In [None]:
new_df = pd.DataFrame(data_dict)
new_df.head()

In [None]:
new_df_padding = new_df.copy()

In [None]:
max_length = 50
new_df_padding['ner_tags'] = new_df_padding.ner_tags.apply(lambda x: x + [0 for i in range(max_length-len(x))])



In [None]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(new_df,test_size=0.2,random_state=42)

In [None]:
# Define a custom PyTorch Dataset
class token_label_dataset(Dataset):
    def __init__(self, dataframe,tokenizer,max_length=50,num_class = 17):
        self.dataframe = dataframe
        self.max_length = max_length
        self.tokenizer= tokenizer
        self.num_class= num_class

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.tokenizer(' '.join(self.dataframe.iloc[idx]['tokens']), add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True, return_tensors='pt',padding='max_length')
        labels = torch.tensor(self.dataframe.iloc[idx]['ner_tags'], dtype=torch.long)
        padded_labels = torch.nn.functional.pad(labels, (0, self.max_length - len(labels)))
        label_f = torch.nn.functional.one_hot(padded_labels, num_classes=self.num_class).permute(0, 1).float()
        len_tok = len(self.dataframe.iloc[idx]['tokens'])
        return text['input_ids'].squeeze(), label_f,len_tok

In [None]:
max_length=50
train_dataset2 = token_label_dataset(train_df,tokenizer,max_length,num_class=17)
test_dataset2 = token_label_dataset(test_df,tokenizer,max_length,num_class=17)

train_DL2 = DataLoader(train_dataset2,batch_size=50,shuffle=True)
test_DL2 = DataLoader(test_dataset2,batch_size=50,shuffle=True)

In [None]:
def evaluate_model(model, eval_loader):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in eval_loader:
            input_ids, labels,len_tok = batch
            input_ids = input_ids
            labels = labels
            
            logits = model(input_ids)
            _, predicted = torch.max(logits, dim=2)  # Get predicted class indices
            _, labels = torch.max(labels, dim=2)  # Get predicted class indices

            max_len_tok = len_tok.max().tolist()
            # Compute accuracy for this batch
            correct_predictions += torch.sum(predicted[:,:max_len_tok] == labels[:,:max_len_tok]).item()
            total_predictions += labels.numel()  # Total number of elements
            
    accuracy = correct_predictions / total_predictions
    return accuracy

In [None]:
# Define a custom training function
def train_custom(model, train_loader, optimizer, loss_fn, eval_loader, num_epochs):

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in train_loader:
            input_ids, labels,len_tok = batch
            optimizer.zero_grad()
            logits = model(input_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        eval_accuracy = evaluate_model(model, eval_loader)
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, Eval Accuracy: {eval_accuracy}')



In [None]:
# Define optimizer and loss function
optimizer = torch.optim.Adam(elin.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()
num_epochs = 30
train_custom(elin, train_DL2, optimizer, loss_fn,test_DL2, num_epochs)

In [None]:
#accuracy = 31