<a href="https://colab.research.google.com/github/tlokeshkumar1/nlp/blob/master/second.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install -U sentence-transformers

In [None]:
#Python code for S-BERT-KG model for tweet classification
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LinearRegression

#Define function to obtain S-BERT embeddings for sentences
def obtain_sbert_embeddings(sentences, model, tokenizer):
    """Return one mean-pooled embedding per input sentence.

    The model's first output is averaged over the token axis, counting only
    positions where the attention mask is 1, so the result is a 2-D matrix
    that the regression and cosine-similarity code in this cell can consume.

    Args:
        sentences: List of strings to encode.
        model: Callable accepting ``input_ids`` and ``attention_mask`` whose
            first output is the per-token embedding tensor (assumed to be
            ``(batch, tokens, hidden)`` -- confirm for non-BERT models).
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``torch.Tensor`` of shape ``(len(sentences), hidden)``.
    """
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0]
    # Broadcast the mask over the hidden axis so padding contributes nothing.
    mask = tokens['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against a row whose mask is all zeros.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

#Define function to learn a least-squares linear projection matrix
def learn_projection_matrix(sbert_embeddings, kg_embeddings, words=None):
    """Fit a least-squares linear map from S-BERT space to knowledge-graph space.

    Args:
        sbert_embeddings: ``(n_samples, d_sbert)`` tensor or array.
        kg_embeddings: Either an ``(n_samples, d_kg)`` array whose rows line
            up with ``sbert_embeddings``, or a dict mapping a word to its
            vector (in which case ``words`` is required).
        words: Optional sequence naming the word behind each row of
            ``sbert_embeddings``. Rows whose word is missing from the dict
            are dropped before fitting.

    Returns:
        Array ``P`` of shape ``(d_sbert, d_kg)`` such that ``embeddings @ P``
        lies in knowledge-graph space. The regression intercept is not
        included in ``P``.

    Raises:
        ValueError: If a dict is passed without ``words``, or no row can be
            paired with a knowledge-graph vector.
    """
    if hasattr(sbert_embeddings, "detach"):
        sbert_embeddings = sbert_embeddings.detach().cpu().numpy()
    features = np.asarray(sbert_embeddings)

    if isinstance(kg_embeddings, dict):
        # Embedding rows cannot serve as dict keys; the row -> word pairing
        # has to be supplied explicitly.
        if words is None:
            raise ValueError(
                "kg_embeddings is a dict: pass `words` (one per embedding row) "
                "or a row-aligned array of knowledge-graph vectors"
            )
        keep = [i for i, word in enumerate(words) if word in kg_embeddings]
        if not keep:
            raise ValueError("none of `words` has a knowledge-graph embedding")
        features = features[keep]
        targets = np.array([kg_embeddings[words[i]] for i in keep])
    else:
        targets = np.asarray(kg_embeddings)

    reg = LinearRegression().fit(features, targets)
    # coef_ is (n_targets, n_features); transpose so callers can right-multiply.
    return reg.coef_.T

#Define function for zero-shot text classification
def zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """Pick, for every tweet, the label with the highest cosine similarity.

    Both embedding sets are right-multiplied by ``projection_matrix`` before
    the comparison.

    Args:
        tweet_embeddings: ``(n_tweets, d)`` array-like.
        label_embeddings: ``(n_labels, d)`` array-like.
        projection_matrix: ``(d, d_kg)`` array-like.

    Returns:
        Integer array of length ``n_tweets`` holding the winning label index.
    """
    projection = np.asarray(projection_matrix)
    projected_tweets = np.dot(np.asarray(tweet_embeddings), projection)
    projected_labels = np.dot(np.asarray(label_embeddings), projection)

    tweet_norms = np.linalg.norm(projected_tweets, axis=1)[:, np.newaxis]
    label_norms = np.linalg.norm(projected_labels, axis=1)
    denominators = tweet_norms * label_norms
    # A zero vector would give 0/0 = NaN and poison argmax; score it 0 instead.
    denominators = np.where(denominators == 0, 1.0, denominators)

    cosine_similarities = np.dot(projected_tweets, projected_labels.T) / denominators
    return np.argmax(cosine_similarities, axis=1)

#Define main function
def main():
    """Run the S-BERT-KG pipeline on the hashtag data and print one label per tweet.

    Reads ``tweets1.csv`` and ``tweets2.csv`` (column ``hashtags``) and
    ``conceptnet_embedding.txt`` from the working directory. The projection is
    fitted on the individual tweet words that have a knowledge-graph vector,
    so that every S-BERT row is paired with the KG vector of the same word.

    Raises:
        ValueError: If no tweet word has a knowledge-graph embedding.
    """
    import ast  # only needed to parse the stringified hashtag lists

    # Keyword lists per topic. Nothing below consumes them; kept as reference data.
    Advice = ["['Stay at home']","['wash hands']","['wear mask']","['social distancing']"]
    China = ["['Wuhan']","['China Coronavirus Updates']","['China news']","['other tweets related to China']"]
    Mask = ["['Mask shortage']","['wear mask']","['mask types']","['N50']","['N95']","['3M8210']","['3M9001']","['3M9322']","['3M9501']"]
    News = ["['Coronavirus updates']","['news']","['rules']"]
    Transportation = ["['Flights']","['traffic']","['traveling']"]
    USA = ["['U.S. Coronavirus Updates']","['COVID19']","['U.S. news']","['United States']","['US']","['USA']"]
    Vaccine = ["['Vaccine news']","['vaccine progress']","['vaccine injection']"]
    label_names = ["Vaccine","USA","Transportation","News","Mask","China","Advice"]

    tweets_df = pd.concat([pd.read_csv('tweets1.csv'), pd.read_csv('tweets2.csv')], ignore_index=True)
    tweets = []
    for hashtag in tweets_df['hashtags'].tolist():
        # The column holds list literals as text. literal_eval replaces the
        # earlier eval(), which would execute arbitrary code found in the CSV.
        # Non-string cells (e.g. NaN) become an empty tweet.
        hashtag_list = ast.literal_eval(hashtag) if isinstance(hashtag, str) else []
        tweets.append(' '.join(hashtag_list))

    # Load pre-trained S-BERT model and tokenizer
    model_name = 'sentence-transformers/bert-base-nli-mean-tokens'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    tweet_embeddings = obtain_sbert_embeddings(tweets, model, tokenizer)
    label_embeddings = obtain_sbert_embeddings(label_names, model, tokenizer)

    # Load ConceptNet knowledge graph embeddings
    kg_embedding = {}
    with open('conceptnet_embedding.txt', 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and a possible "count dim" header line
            # (presumably word2vec text format -- confirm against the file).
            if len(values) < 3:
                continue
            kg_embedding[values[0]] = np.asarray(values[1:], dtype='float32')

    # Pair each word's S-BERT vector with its KG vector, row by row.
    # NOTE(review): this passes a row-aligned matrix rather than the raw dict;
    # confirm learn_projection_matrix accepts that form.
    vocabulary = sorted({word for tweet in tweets for word in tweet.split() if word in kg_embedding})
    if not vocabulary:
        raise ValueError("no tweet word has a knowledge-graph embedding")
    word_embeddings = obtain_sbert_embeddings(vocabulary, model, tokenizer)
    kg_matrix = np.stack([kg_embedding[word] for word in vocabulary])
    projection_matrix = learn_projection_matrix(word_embeddings, kg_matrix)

    label_predictions = zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix)

    for tweet, label_index in zip(tweets, label_predictions):
        print("Tweet: ", tweet)
        print("Predicted label: ", label_names[label_index])

#Call main function
# Standard entry-point guard: run the pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

In [None]:
#Python code for using pre-trained word embeddings and S-BERT for zero-shot classification
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

#Define function to obtain pre-trained word embeddings for sentences
def obtain_word_embeddings(sentences, model, tokenizer):
    """Return the model's first output for a tokenized batch of sentences.

    Args:
        sentences: List of strings.
        model: Callable accepting ``input_ids`` and ``attention_mask``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        Whatever ``model(...)[0]`` yields; presumably per-token embeddings of
        shape ``(batch, tokens, hidden)`` -- confirm for the model in use.
    """
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
    return outputs[0]

#Define function for zero-shot classification using S-BERT
def zero_shot_classification(tweet_embeddings, label_embeddings, label_names):
    """Return the name of the most cosine-similar label for every tweet.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.
        label_names: Label names, in the same order as ``label_embeddings`` rows.

    Returns:
        List of label names, one per tweet.
    """
    similarities = cosine_similarity(tweet_embeddings, label_embeddings)
    best_indices = np.argmax(similarities, axis=1)
    return [label_names[index] for index in best_indices]

#Define main function
def main():
    """Classify the hashtag tweets by cosine similarity of BERT embeddings and print the result.

    Reads ``tweets1.csv`` and ``tweets2.csv`` (column ``hashtags``) from the
    working directory.
    """
    import ast  # only needed to parse the stringified hashtag lists

    label_names = ["Vaccine","USA","Transportation","News","Mask","China","Advice"]
    tweets_df = pd.concat([pd.read_csv('tweets1.csv'), pd.read_csv('tweets2.csv')], ignore_index=True)
    tweets = []
    for hashtag in tweets_df['hashtags'].tolist():
        # literal_eval replaces the earlier eval(), which would execute
        # arbitrary code found in the CSV. Non-string cells become empty tweets.
        hashtag_list = ast.literal_eval(hashtag) if isinstance(hashtag, str) else []
        tweets.append(' '.join(hashtag_list))

    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    tweet_embeddings = obtain_word_embeddings(tweets, model, tokenizer)
    label_embeddings = obtain_word_embeddings(label_names, model, tokenizer)

    # cosine_similarity needs one vector per text, so a 3-D per-token output is
    # averaged over the token axis first.
    # NOTE(review): the plain mean also counts padding positions; switch to a
    # masked mean or the first-token vector if that matters.
    if tweet_embeddings.dim() == 3:
        tweet_embeddings = tweet_embeddings.mean(dim=1)
    if label_embeddings.dim() == 3:
        label_embeddings = label_embeddings.mean(dim=1)

    label_predictions = zero_shot_classification(
        tweet_embeddings.numpy(), label_embeddings.numpy(), label_names
    )

    for tweet, label in zip(tweets, label_predictions):
        print("Tweet: ", tweet)
        print("Predicted label: ", label)

#Call main function
# Standard entry-point guard: run the pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

In [None]:
#Python code for using ConceptNet knowledge graph embeddings for zero-shot classification
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

#Define function to obtain pre-trained word embeddings for sentences
def obtain_word_embeddings(sentences, model, tokenizer):
    """Tokenize ``sentences`` and return the model's first output.

    Args:
        sentences: List of strings.
        model: Callable accepting ``input_ids`` and ``attention_mask``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``model(...)[0]``; presumably per-token embeddings shaped
        ``(batch, tokens, hidden)`` -- confirm for the model in use.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only
        first_output = model(batch['input_ids'], attention_mask=batch['attention_mask'])[0]
    return first_output

#Define function to learn projection matrix using retrofitting
def learn_projection_matrix(tweet_embeddings, kg_embedding):
    """Retrofit knowledge-graph word vectors with tweet word vectors and invert them.

    For every word present in both mappings, the vector is repeatedly replaced
    by ``beta * kg_vector + (1 - beta) * mean(current_vector, alpha * tweet_vector)``.
    Words without a tweet vector keep their knowledge-graph vector.

    Args:
        tweet_embeddings: Dict mapping a word to its tweet-derived vector,
            with the same dimensionality as the knowledge-graph vectors.
        kg_embedding: Dict mapping a word to its 1-D knowledge-graph vector.
            It is not modified.

    Returns:
        Pseudo-inverse of the ``(n_words, dim)`` matrix of retrofitted
        vectors, i.e. an array of shape ``(dim, n_words)``, rows ordered like
        ``kg_embedding``.

    Raises:
        TypeError: If ``tweet_embeddings`` is not a dict; only a mapping can
            be looked up by word.
    """
    if not isinstance(tweet_embeddings, dict):
        raise TypeError("tweet_embeddings must be a dict mapping words to vectors")

    # Shallow copy: entries are rebound below, never mutated in place.
    word_embeddings = kg_embedding.copy()
    alpha = 0.5
    beta = 0.5
    num_iterations = 5

    shared_words = [word for word in word_embeddings if word in tweet_embeddings]
    for _ in range(num_iterations):
        for word in shared_words:
            # Stack as rows so the mean over axis 0 stays a vector.
            neighbors = np.vstack((
                np.atleast_2d(word_embeddings[word]),
                alpha * np.atleast_2d(tweet_embeddings[word]),
            ))
            new_embedding = np.mean(neighbors, axis=0)
            word_embeddings[word] = beta * np.asarray(kg_embedding[word]) + (1 - beta) * new_embedding

    # pinv needs a matrix, not a dict.
    embedding_matrix = np.stack([np.asarray(vector) for vector in word_embeddings.values()])
    return np.linalg.pinv(embedding_matrix)

#Define function for zero-shot classification using retrofitting and S-BERT
def zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """Project the tweets and return the index of the closest label for each.

    Only the tweet embeddings are multiplied by ``projection_matrix``; the
    label embeddings are compared as passed in.

    Args:
        tweet_embeddings: ``(n_tweets, d)`` matrix.
        label_embeddings: ``(n_labels, k)`` matrix already in the projected space.
        projection_matrix: ``(d, k)`` matrix.

    Returns:
        Integer array with one label index per tweet.
    """
    projected_tweets = np.matmul(tweet_embeddings, projection_matrix)
    similarities = cosine_similarity(projected_tweets, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define main function
def main():
    """Run the retrofitting-based zero-shot pipeline and print one label per tweet.

    Reads ``tweets1.csv`` and ``tweets2.csv`` (column ``hashtags``) and
    ``conceptnet_embedding.txt`` from the working directory.
    """
    import ast  # only needed to parse the stringified hashtag lists

    label_names = ["Vaccine","USA","Transportation","News","Mask","China","Advice"]
    tweets_df = pd.concat([pd.read_csv('tweets1.csv'), pd.read_csv('tweets2.csv')], ignore_index=True)
    tweets = []
    for hashtag in tweets_df['hashtags'].tolist():
        # literal_eval replaces the earlier eval(), which would execute
        # arbitrary code found in the CSV. Non-string cells become empty tweets.
        hashtag_list = ast.literal_eval(hashtag) if isinstance(hashtag, str) else []
        tweets.append(' '.join(hashtag_list))

    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    tweet_embeddings = obtain_word_embeddings(tweets, model, tokenizer)

    # Load ConceptNet knowledge graph embeddings
    kg_embedding = {}
    with open('conceptnet_embedding.txt', 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and a possible "count dim" header line
            # (presumably word2vec text format -- confirm against the file).
            if len(values) < 3:
                continue
            kg_embedding[values[0]] = np.asarray(values[1:], dtype='float32')

    # NOTE(review): tweet_embeddings are BERT outputs for whole tweets, while
    # the retrofitting step and the projection work on knowledge-graph word
    # vectors. Shapes and key types still need reconciling before this runs
    # end to end.
    projection_matrix = learn_projection_matrix(tweet_embeddings, kg_embedding)

    # A dict cannot be indexed with a list, so look the labels up one by one.
    # Falls back to the lower-cased label; a KeyError names any missing label.
    label_vectors = []
    for label in label_names:
        key = label if label in kg_embedding else label.lower()
        label_vectors.append(kg_embedding[key])
    label_embeddings = np.matmul(np.stack(label_vectors), projection_matrix)

    label_predictions = zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix)

    for tweet, label_index in zip(tweets, label_predictions):
        print("Tweet: ", tweet)
        print("Predicted label: ", label_names[label_index])

#Call main function
# Standard entry-point guard: run the pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LinearRegression

#Define function to calculate knowledge graph embedding
def calculate_kg_embedding(omega, glove_embedding):
    """Build a word -> vector mapping for the knowledge-graph words found in GloVe.

    Args:
        omega: Iterable of knowledge-graph words.
        glove_embedding: Dict mapping a word to a 1-D vector, or to a 2-D
            array holding several vectors for that word.

    Returns:
        Dict with an entry for every word of ``omega`` present in
        ``glove_embedding``. A 1-D vector is kept as is; a 2-D array is
        averaged over its rows.
    """
    kg_embedding = {}
    for word in omega:
        if word not in glove_embedding:
            continue
        vectors = np.asarray(glove_embedding[word])
        # Averaging a single 1-D vector over axis 0 would reduce it to a
        # scalar, so only multi-row entries are averaged.
        kg_embedding[word] = vectors if vectors.ndim < 2 else vectors.mean(axis=0)
    return kg_embedding

#Define function to obtain S-BERT embeddings for words
def obtain_sbert_embeddings(words, model, tokenizer):
    """Return one mean-pooled embedding per input text.

    The model's first output is averaged over the token axis, counting only
    positions where the attention mask is 1, so the result is a 2-D matrix
    suitable for the regression and cosine-similarity code in this cell.

    Args:
        words: List of strings (single words or sentences).
        model: Callable accepting ``input_ids`` and ``attention_mask`` whose
            first output is the per-token embedding tensor (assumed to be
            ``(batch, tokens, hidden)`` -- confirm for non-BERT models).
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``torch.Tensor`` of shape ``(len(words), hidden)``.
    """
    import torch  # this cell's import list does not include torch

    tokens = tokenizer(words, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0]
    # Broadcast the mask over the hidden axis so padding contributes nothing.
    mask = tokens['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    pooled_sum = (token_embeddings * mask).sum(dim=1)
    # clamp guards against a row whose mask is all zeros.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return pooled_sum / token_counts

#Define function to learn a least-squares linear projection matrix
def learn_projection_matrix(sbert_embeddings, kg_embeddings, words=None):
    """Fit a least-squares linear map from S-BERT space to knowledge-graph space.

    Args:
        sbert_embeddings: ``(n_samples, d_sbert)`` tensor or array.
        kg_embeddings: Either an ``(n_samples, d_kg)`` array whose rows line
            up with ``sbert_embeddings``, or a dict mapping a word to its
            vector (in which case ``words`` is required).
        words: Optional sequence naming the word behind each row of
            ``sbert_embeddings``. Rows whose word is missing from the dict
            are dropped before fitting.

    Returns:
        Array ``P`` of shape ``(d_sbert, d_kg)`` such that ``embeddings @ P``
        lies in knowledge-graph space. The regression intercept is not
        included in ``P``.

    Raises:
        ValueError: If a dict is passed without ``words``, or no row can be
            paired with a knowledge-graph vector.
    """
    if hasattr(sbert_embeddings, "detach"):
        sbert_embeddings = sbert_embeddings.detach().cpu().numpy()
    inputs = np.asarray(sbert_embeddings)

    if isinstance(kg_embeddings, dict):
        # Embedding rows cannot serve as dict keys; the row -> word pairing
        # has to be supplied explicitly.
        if words is None:
            raise ValueError(
                "kg_embeddings is a dict: pass `words` (one per embedding row) "
                "or a row-aligned array of knowledge-graph vectors"
            )
        matched_rows = [i for i, word in enumerate(words) if word in kg_embeddings]
        if not matched_rows:
            raise ValueError("none of `words` has a knowledge-graph embedding")
        inputs = inputs[matched_rows]
        outputs = np.array([kg_embeddings[words[i]] for i in matched_rows])
    else:
        outputs = np.asarray(kg_embeddings)

    reg = LinearRegression().fit(inputs, outputs)
    # coef_ is (n_targets, n_features); transpose so callers can right-multiply.
    return reg.coef_.T

#Define function for zero-shot text classification
def zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """Pick, for every tweet, the label with the highest cosine similarity.

    Both embedding sets are right-multiplied by ``projection_matrix`` before
    the comparison.

    Args:
        tweet_embeddings: ``(n_tweets, d)`` array-like.
        label_embeddings: ``(n_labels, d)`` array-like.
        projection_matrix: ``(d, d_kg)`` array-like.

    Returns:
        Integer array of length ``n_tweets`` holding the winning label index.
    """
    projection = np.asarray(projection_matrix)
    tweets_kg = np.dot(np.asarray(tweet_embeddings), projection)
    labels_kg = np.dot(np.asarray(label_embeddings), projection)

    norm_products = (
        np.linalg.norm(tweets_kg, axis=1)[:, np.newaxis]
        * np.linalg.norm(labels_kg, axis=1)
    )
    # A zero vector would give 0/0 = NaN and poison argmax; score it 0 instead.
    norm_products = np.where(norm_products == 0, 1.0, norm_products)

    cosine_similarities = np.dot(tweets_kg, labels_kg.T) / norm_products
    return np.argmax(cosine_similarities, axis=1)

#Define main function
def main():
    """Toy end-to-end run: fit an S-BERT -> GloVe projection and classify three sample tweets.

    Reads ``glove.6B.300d.txt`` from the working directory and prints the
    predicted label indices.

    Raises:
        ValueError: If no tokenizer vocabulary entry has a knowledge-graph vector.
    """
    # Load pre-trained S-BERT model and tokenizer
    model_name = 'sentence-transformers/bert-base-nli-mean-tokens'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load GloVe word embeddings
    glove_embedding = {}
    with open('glove.6B.300d.txt', 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 3:  # blank or malformed line
                continue
            glove_embedding[values[0]] = np.asarray(values[1:], dtype='float32')

    # Define knowledge graph
    omega = ['knowledge', 'graph', 'embedding']
    kg_embedding = calculate_kg_embedding(omega, glove_embedding)

    # Train on at most k vocabulary words that have a knowledge-graph vector,
    # so each S-BERT row is paired with the target vector of the same word.
    # The vocabulary comes from the tokenizer (the original asked the model).
    k = 100
    vocabulary_words = [word for word in tokenizer.get_vocab() if word in kg_embedding][:k]
    if not vocabulary_words:
        raise ValueError("no vocabulary word has a knowledge-graph embedding")
    sbert_embeddings = obtain_sbert_embeddings(vocabulary_words, model, tokenizer)
    # NOTE(review): this passes a row-aligned matrix rather than the raw dict;
    # confirm learn_projection_matrix accepts that form.
    kg_matrix = np.stack([np.asarray(kg_embedding[word]) for word in vocabulary_words])
    projection_matrix = learn_projection_matrix(sbert_embeddings, kg_matrix)

    # Define tweet dataset and label names
    tweet_dataset = ['This is a tweet about knowledge graphs', 'I love learning about embeddings', 'S-BERT is a great tool for natural language processing']
    label_names = ['knowledge', 'graph', 'embedding', 'natural', 'language', 'processing']

    tweet_embeddings = obtain_sbert_embeddings(tweet_dataset, model, tokenizer)
    label_embeddings = obtain_sbert_embeddings(label_names, model, tokenizer)

    label_predictions = zero_shot_classification(tweet_embeddings, label_embeddings, projection_matrix)
    print(label_predictions)

#Call main function
# Standard entry-point guard: run the pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

In [None]:
#Python code for implementing baseline models and S-BERT-KG for zero-shot
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

#Define function to obtain pre-trained word embeddings for sentences
def obtain_word_embeddings(sentences, model, tokenizer):
    """Return the model's first output for a tokenized batch of sentences.

    Args:
        sentences: List of strings.
        model: Callable accepting ``input_ids`` and ``attention_mask``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``model(...)[0]``. The baselines in this cell index it as
        ``[:, 0, :]`` and average it over axis 1, so it is expected to be 3-D.
    """
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only
        model_output = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])
    return model_output[0]

#Define function for GloVe-AVG baseline model
def glove_avg_classification(tweet_embeddings, label_embeddings):
    """GloVe-AVG baseline: average each tweet over axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; axis 1 is averaged away.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    averaged_tweets = tweet_embeddings.mean(axis=1)
    similarities = cosine_similarity(averaged_tweets, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for BERT-CLS baseline model
def bert_cls_classification(tweet_embeddings, label_embeddings):
    """BERT-CLS baseline: represent each tweet by position 0 of axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; only ``[:, 0, :]`` is used.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    first_token_vectors = tweet_embeddings[:, 0, :]
    similarities = cosine_similarity(first_token_vectors, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for BERT-AVG baseline model
def bert_avg_classification(tweet_embeddings, label_embeddings):
    """BERT-AVG baseline: average each tweet over axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; axis 1 is averaged away.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    mean_vectors = tweet_embeddings.mean(axis=1)
    similarities = cosine_similarity(mean_vectors, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for S-BERT baseline model
def sbert_classification(tweet_embeddings, label_embeddings):
    """S-BERT baseline: pick the label whose embedding is most cosine-similar to each tweet.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    similarities = cosine_similarity(tweet_embeddings, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for S-BERT-GloVe baseline model
def sbert_glove_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """S-BERT-GloVe baseline: project tweets and labels, then pick the closest label.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.
        projection_matrix: Matrix both embedding sets are right-multiplied by.

    Returns:
        Integer array with one label index per tweet.
    """
    projected_tweets = np.matmul(tweet_embeddings, projection_matrix)
    projected_labels = np.matmul(label_embeddings, projection_matrix)
    similarities = cosine_similarity(projected_tweets, projected_labels)
    return np.argmax(similarities, axis=1)

#Define function for BART-NLI baseline model
def bart_nli_classification(tweets, label_names, model, tokenizer, entailment_index=-1):
    """NLI baseline: choose the label whose hypothesis is most strongly entailed by each tweet.

    Every tweet is paired with every hypothesis ``"The text is about <label>."``
    and the label with the highest entailment logit wins.

    Args:
        tweets: List of tweet strings.
        label_names: List of candidate label names.
        model: Callable taking ``input_ids``, ``attention_mask`` and ``labels``
            and returning an object with a ``logits`` attribute of shape
            ``(batch, n_nli_classes)``.
            NOTE(review): this presumably requires a sequence-classification
            checkpoint; a bare ``AutoModel`` may not expose ``logits``.
        tokenizer: Callable accepting two parallel lists (premises, hypotheses).
        entailment_index: Column of ``logits`` holding the entailment score.
            Defaults to the last column -- assumed to match
            ``facebook/bart-large-mnli``; TODO confirm for other checkpoints.

    Returns:
        Integer array with one label index per tweet.

    Raises:
        ValueError: If ``label_names`` is empty.
    """
    if not label_names:
        raise ValueError("label_names must not be empty")
    hypotheses = [f"The text is about {label}." for label in label_names]

    label_predictions = []
    for tweet in tweets:
        # One batch per tweet: the tweet against every label hypothesis.
        tokens = tokenizer([tweet] * len(hypotheses), hypotheses, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(input_ids=tokens['input_ids'], attention_mask=tokens['attention_mask'], labels=None)
        entailment_scores = outputs.logits[:, entailment_index]
        label_predictions.append(int(torch.argmax(entailment_scores)))
    return np.array(label_predictions, dtype=int)

#Define function for S-BERT-KG model
def sbert_kg_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """S-BERT-KG model: project tweets and labels, then pick the closest label.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.
        projection_matrix: Matrix both embedding sets are right-multiplied by.

    Returns:
        Integer array with one label index per tweet.
    """
    tweets_in_kg_space = np.matmul(tweet_embeddings, projection_matrix)
    labels_in_kg_space = np.matmul(label_embeddings, projection_matrix)
    similarities = cosine_similarity(tweets_in_kg_space, labels_in_kg_space)
    return np.argmax(similarities, axis=1)

#Define main function
def main():
    """Run every baseline and the S-BERT-KG model on the hashtag data and print accuracies.

    Reads ``tweets1.csv`` and ``tweets2.csv`` (columns ``hashtags`` and
    ``label``), ``glove.6B.300d.txt`` and ``projection_matrix.npy`` from the
    working directory.
    """
    import ast  # only needed to parse the stringified hashtag lists

    label_names = ["Vaccine","USA","Transportation","News","Mask","China","Advice"]
    tweets_df = pd.concat([pd.read_csv('tweets1.csv'), pd.read_csv('tweets2.csv')], ignore_index=True)
    tweets = []
    for hashtag in tweets_df['hashtags'].tolist():
        # literal_eval replaces the earlier eval(), which would execute
        # arbitrary code found in the CSV. Non-string cells become empty
        # tweets so rows stay aligned with tweets_df['label'].
        hashtag_list = ast.literal_eval(hashtag) if isinstance(hashtag, str) else []
        tweets.append(' '.join(hashtag_list))

    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load pre-trained GloVe embeddings
    glove_embeddings = pd.read_csv('glove.6B.300d.txt', sep=" ", quoting=3, header=None, index_col=0)
    glove_embeddings = {key: val.values for key, val in glove_embeddings.T.items()}
    # The capitalised labels may be absent from the GloVe vocabulary
    # (presumably lower-cased -- confirm), so fall back to the lower-cased form.
    label_embeddings = np.array([
        glove_embeddings[label if label in glove_embeddings else label.lower()]
        for label in label_names
    ])

    tweet_embeddings = obtain_word_embeddings(tweets, model, tokenizer)
    # Obtain projection matrix for S-BERT-GloVe and S-BERT-KG models
    projection_matrix = np.load('projection_matrix.npy')

    # NOTE(review): the tweet embeddings come from BERT while the label
    # embeddings come from GloVe; confirm the two share a dimensionality (or
    # are projected) before comparing them. Also, `model` is loaded with
    # AutoModel, while the NLI baseline reads `.logits` from it.
    glove_avg_predictions = glove_avg_classification(tweet_embeddings, label_embeddings)
    bert_cls_predictions = bert_cls_classification(tweet_embeddings, label_embeddings)
    bert_avg_predictions = bert_avg_classification(tweet_embeddings, label_embeddings)
    sbert_predictions = sbert_classification(tweet_embeddings, label_embeddings)
    sbert_glove_predictions = sbert_glove_classification(tweet_embeddings, label_embeddings, projection_matrix)
    bart_nli_predictions = bart_nli_classification(tweets, label_names, model, tokenizer)
    sbert_kg_predictions = sbert_kg_classification(tweet_embeddings, label_embeddings, projection_matrix)

    # NOTE(review): predictions are integer label indices; confirm that
    # tweets_df['label'] uses the same encoding.
    print("Accuracy scores for baseline models:")
    baseline_results = [
        ("GloVe-AVG:", glove_avg_predictions),
        ("BERT-CLS:", bert_cls_predictions),
        ("BERT-AVG:", bert_avg_predictions),
        ("S-BERT:", sbert_predictions),
        ("S-BERT-GloVe:", sbert_glove_predictions),
        ("BART-NLI:", bart_nli_predictions),
    ]
    for title, predictions in baseline_results:
        print(title, np.mean(predictions == tweets_df['label']))
    print("Accuracy score for S-BERT-KG model:", np.mean(sbert_kg_predictions == tweets_df['label']))


In [None]:
#Python code for comparing performance of different models for zero-shot multiclass and multilabel classification
import pandas as pd
import numpy as np
import time
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

#Define function to obtain pre-trained word embeddings for sentences
def obtain_word_embeddings(sentences, model, tokenizer):
    """Tokenize ``sentences`` and return the model's first output.

    Args:
        sentences: List of strings.
        model: Callable accepting ``input_ids`` and ``attention_mask``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``model(...)[0]``. The baselines in this cell index it as
        ``[:, 0, :]`` and average it over axis 1, so it is expected to be 3-D.
    """
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only
        result = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    return result[0]

#Define function for GloVe-AVG baseline model
def glove_avg_classification(tweet_embeddings, label_embeddings):
    """GloVe-AVG baseline: average each tweet over axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; axis 1 is averaged away.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    averaged_tweets = tweet_embeddings.mean(axis=1)
    similarities = cosine_similarity(averaged_tweets, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for BERT-CLS baseline model
def bert_cls_classification(tweet_embeddings, label_embeddings):
    """BERT-CLS baseline: represent each tweet by position 0 of axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; only ``[:, 0, :]`` is used.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    first_token_vectors = tweet_embeddings[:, 0, :]
    similarities = cosine_similarity(first_token_vectors, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for BERT-AVG baseline model
def bert_avg_classification(tweet_embeddings, label_embeddings):
    """BERT-AVG baseline: average each tweet over axis 1 and pick the closest label.

    Args:
        tweet_embeddings: 3-D embeddings; axis 1 is averaged away.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    mean_vectors = tweet_embeddings.mean(axis=1)
    similarities = cosine_similarity(mean_vectors, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for S-BERT baseline model
def sbert_classification(tweet_embeddings, label_embeddings):
    """S-BERT baseline: pick the label whose embedding is most cosine-similar to each tweet.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    similarities = cosine_similarity(tweet_embeddings, label_embeddings)
    return np.argmax(similarities, axis=1)

#Define function for S-BERT-GloVe baseline model
def sbert_glove_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """S-BERT-GloVe baseline: project tweets and labels, then pick the closest label.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.
        projection_matrix: Matrix both embedding sets are right-multiplied by.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    projected_tweets = np.matmul(tweet_embeddings, projection_matrix)
    projected_labels = np.matmul(label_embeddings, projection_matrix)
    similarities = cosine_similarity(projected_tweets, projected_labels)
    return np.argmax(similarities, axis=1)

#Define function for BART-NLI baseline model
def bart_nli_classification(tweets, label_names, model, tokenizer, entailment_index=-1):
    """NLI baseline: choose the label whose hypothesis is most strongly entailed by each tweet.

    Every tweet is paired with every hypothesis ``"The text is about <label>."``
    and the label with the highest entailment logit wins.

    Args:
        tweets: List of tweet strings.
        label_names: List of candidate label names.
        model: Callable taking ``input_ids``, ``attention_mask`` and ``labels``
            and returning an object with a ``logits`` attribute of shape
            ``(batch, n_nli_classes)``.
            NOTE(review): this presumably requires a sequence-classification
            checkpoint; a bare ``AutoModel`` may not expose ``logits``.
        tokenizer: Callable accepting two parallel lists (premises, hypotheses).
        entailment_index: Column of ``logits`` holding the entailment score.
            Defaults to the last column -- assumed to match
            ``facebook/bart-large-mnli``; TODO confirm for other checkpoints.

    Returns:
        Integer array with one label index per tweet.

    Raises:
        ValueError: If ``label_names`` is empty.
    """
    if not label_names:
        raise ValueError("label_names must not be empty")
    hypotheses = [f"The text is about {label}." for label in label_names]
    n_hypotheses = len(hypotheses)

    winners = []
    for tweet in tweets:
        # One batch per tweet: the tweet against every label hypothesis.
        tokens = tokenizer([tweet] * n_hypotheses, hypotheses, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(input_ids=tokens['input_ids'], attention_mask=tokens['attention_mask'], labels=None)
        scores = outputs.logits[:, entailment_index]
        winners.append(int(torch.argmax(scores)))
    return np.array(winners, dtype=int)

#Define function for S-BERT-KG model
def sbert_kg_classification(tweet_embeddings, label_embeddings, projection_matrix):
    """S-BERT-KG model: project tweets and labels, then pick the closest label.

    Args:
        tweet_embeddings: Matrix with one row per tweet.
        label_embeddings: Matrix with one row per label.
        projection_matrix: Matrix both embedding sets are right-multiplied by.

    Returns:
        Integer array with one label index per tweet.
    """
    # Imported here so the function does not rely on an earlier cell having run.
    from sklearn.metrics.pairwise import cosine_similarity

    tweets_in_kg_space = np.matmul(tweet_embeddings, projection_matrix)
    labels_in_kg_space = np.matmul(label_embeddings, projection_matrix)
    similarities = cosine_similarity(tweets_in_kg_space, labels_in_kg_space)
    return np.argmax(similarities, axis=1)

#Define function for evaluating model performance
def evaluate_model_performance(model, tweets, labels, label_names, mlb):
    """Embed, classify and score a set of tweets, timing the whole run.

    Args:
        model: Object used both as encoder (``model.encode(texts)``) and as
            classifier (``model(tweet_embeddings, label_embeddings)``,
            returning one label index per tweet).
            NOTE(review): the callers visible in this cell pass plain
            classification functions, which have no ``encode`` -- confirm the
            intended argument.
        tweets: Texts to classify.
        labels: Ground truth in the binary indicator format produced by ``mlb``.
        label_names: Label names; predicted indices are mapped through this list.
        mlb: Fitted binarizer used to convert predictions to indicator format
            (assumed to have been fitted on label-name sets -- TODO confirm).

    Returns:
        Tuple ``(precision, recall, f1, accuracy, hamming_loss, running_time)``;
        precision, recall and F1 are weighted averages, the running time is in
        seconds.
    """
    start_time = time.time()

    tweet_embeddings = model.encode(tweets)
    label_embeddings = np.array([model.encode([label])[0] for label in label_names])
    label_predictions = model(tweet_embeddings, label_embeddings)

    # The binarizer expects an iterable of label collections, so each
    # predicted index is wrapped as a one-element list holding its label name.
    predicted_label_sets = [[label_names[index]] for index in label_predictions]
    label_predictions_binary = mlb.transform(predicted_label_sets)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, label_predictions_binary, average='weighted')
    # Compare like with like: both sides in indicator format.
    accuracy = accuracy_score(labels, label_predictions_binary)
    hamming_loss_value = hamming_loss(labels, label_predictions_binary)
    running_time = time.time() - start_time

    return precision, recall, f1, accuracy, hamming_loss_value, running_time
#Load pre-trained models and tokenizers
# GloVe vectors as a DataFrame indexed by word (column 0 of the text file).
glove_model = pd.read_csv('glove.6B.300d.txt', sep=" ", index_col=0, header=None, quoting=3)
# One BERT tokenizer is shared by every obtain_word_embeddings call below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_cls_model = AutoModel.from_pretrained('bert-base-uncased')
bert_avg_model = AutoModel.from_pretrained('bert-base-uncased')
# The three S-BERT variants load the same checkpoint.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
sbert_glove_model = SentenceTransformer('bert-base-nli-mean-tokens')
sbert_kg_model = SentenceTransformer('bert-base-nli-mean-tokens')
# NOTE(review): loaded with AutoModel, while bart_nli_classification reads
# `.logits` from the model output -- confirm a classification head is not needed.
bart_nli_model = AutoModel.from_pretrained('facebook/bart-large-mnli')

#Load data
tweets = pd.read_csv('tweets.csv')
labels = pd.read_csv('labels.csv')

#Define label names
label_names = ['politics', 'sports', 'entertainment', 'technology', 'business', 'health', 'education']

#Convert labels to binary format for multilabel classification
mlb = MultiLabelBinarizer()
# NOTE(review): `labels` is a DataFrame here; confirm that iterating it yields
# the per-tweet label collections fit_transform expects.
labels_binary = mlb.fit_transform(labels)

#Obtain word embeddings for GloVe model
glove_embeddings = glove_model.loc[label_names].values

#Obtain word embeddings for BERT-CLS and BERT-AVG models
bert_cls_embeddings = obtain_word_embeddings(label_names, bert_cls_model, tokenizer)
bert_avg_embeddings = obtain_word_embeddings(label_names, bert_avg_model, tokenizer)

#Obtain word embeddings for S-BERT, S-BERT-GloVe, and S-BERT-KG models
sbert_embeddings = sbert_model.encode(label_names)
# NOTE(review): both projection matrices are random placeholders and are not
# referenced again in the code visible here.
sbert_glove_projection_matrix = np.random.rand(768, 20000)
sbert_glove_embeddings = sbert_glove_model.encode(label_names)
sbert_kg_projection_matrix = np.random.rand(768, 20000)
sbert_kg_embeddings = sbert_kg_model.encode(label_names)

#Obtain word embeddings for BART-NLI model
# NOTE(review): this pairs the BART model with the BERT tokenizer created above -- confirm.
bart_nli_embeddings = obtain_word_embeddings(label_names, bart_nli_model, tokenizer)

# Multiclass round: each call returns (precision, recall, f1, accuracy,
# hamming loss, running time) for one model.
# NOTE(review): the first argument of every call is a classification function,
# while evaluate_model_performance also calls `.encode` on it; several calls
# pass an embedding matrix in the `label_names` position. Confirm the intended
# arguments.
#Evaluate performance of GloVe model
glove_precision, glove_recall, glove_f1, glove_accuracy, glove_hamming_loss, glove_running_time = evaluate_model_performance(glove_avg_classification, tweets, labels_binary, label_names, mlb)

#Evaluate performance of BERT-CLS model
bert_cls_precision, bert_cls_recall, bert_cls_f1, bert_cls_accuracy, bert_cls_hamming_loss, bert_cls_running_time = evaluate_model_performance(bert_cls_classification, tweets, labels_binary, bert_cls_embeddings, mlb)

#Evaluate performance of BERT-AVG model
bert_avg_precision, bert_avg_recall, bert_avg_f1, bert_avg_accuracy, bert_avg_hamming_loss, bert_avg_running_time = evaluate_model_performance(bert_avg_classification, tweets, labels_binary, bert_avg_embeddings, mlb)

#Evaluate performance of S-BERT model
sbert_precision, sbert_recall, sbert_f1, sbert_accuracy, sbert_hamming_loss, sbert_running_time = evaluate_model_performance(sbert_classification, tweets, labels_binary, sbert_embeddings, mlb)

#Evaluate performance of S-BERT-GloVe model
sbert_glove_precision, sbert_glove_recall, sbert_glove_f1, sbert_glove_accuracy, sbert_glove_hamming_loss, sbert_glove_running_time = evaluate_model_performance(sbert_glove_classification, tweets, labels_binary, sbert_glove_embeddings, mlb)

#Evaluate performance of BART-NLI model
bart_nli_precision, bart_nli_recall, bart_nli_f1, bart_nli_accuracy, bart_nli_hamming_loss, bart_nli_running_time = evaluate_model_performance(bart_nli_classification, tweets, labels_binary, label_names, mlb)

#Evaluate performance of S-BERT-KG model
sbert_kg_precision, sbert_kg_recall, sbert_kg_f1, sbert_kg_accuracy, sbert_kg_hamming_loss, sbert_kg_running_time = evaluate_model_performance(sbert_kg_classification, tweets, labels_binary, sbert_kg_embeddings, mlb)

#Create dataframe to store evaluation metrics
# One row per model, one column per metric.
evaluation_metrics = pd.DataFrame({'Model': ['GloVe', 'BERT-CLS', 'BERT-AVG', 'S-BERT', 'S-BERT-GloVe', 'BART-NLI', 'S-BERT-KG'],
'Precision': [glove_precision, bert_cls_precision, bert_avg_precision, sbert_precision, sbert_glove_precision, bart_nli_precision, sbert_kg_precision],
'Recall': [glove_recall, bert_cls_recall, bert_avg_recall, sbert_recall, sbert_glove_recall, bart_nli_recall, sbert_kg_recall],
'F1 Score': [glove_f1, bert_cls_f1, bert_avg_f1, sbert_f1, sbert_glove_f1, bart_nli_f1, sbert_kg_f1],
'Accuracy': [glove_accuracy, bert_cls_accuracy, bert_avg_accuracy, sbert_accuracy, sbert_glove_accuracy, bart_nli_accuracy, sbert_kg_accuracy],
'Hamming Loss': [glove_hamming_loss, bert_cls_hamming_loss, bert_avg_hamming_loss, sbert_hamming_loss, sbert_glove_hamming_loss, bart_nli_hamming_loss, sbert_kg_hamming_loss],
'Running Time (s)': [glove_running_time, bert_cls_running_time, bert_avg_running_time, sbert_running_time, sbert_glove_running_time, bart_nli_running_time, sbert_kg_running_time]})

#Print evaluation metrics for multiclass classification
print("Evaluation Metrics for Zero-Shot Multiclass Classification:")
print(evaluation_metrics)

#Evaluate performance of models for multilabel classification
# NOTE(review): these calls repeat the multiclass round with identical
# arguments and overwrite its variables; only the running time can differ.
# Confirm whether a different multilabel decision rule was intended.
glove_precision, glove_recall, glove_f1, glove_accuracy, glove_hamming_loss, glove_running_time = evaluate_model_performance(glove_avg_classification, tweets, labels_binary, label_names, mlb)
bert_cls_precision, bert_cls_recall, bert_cls_f1, bert_cls_accuracy, bert_cls_hamming_loss, bert_cls_running_time = evaluate_model_performance(bert_cls_classification, tweets, labels_binary, bert_cls_embeddings, mlb)
bert_avg_precision, bert_avg_recall, bert_avg_f1, bert_avg_accuracy, bert_avg_hamming_loss, bert_avg_running_time = evaluate_model_performance(bert_avg_classification, tweets, labels_binary, bert_avg_embeddings, mlb)
sbert_precision, sbert_recall, sbert_f1, sbert_accuracy, sbert_hamming_loss, sbert_running_time = evaluate_model_performance(sbert_classification, tweets, labels_binary, sbert_embeddings, mlb)
sbert_glove_precision, sbert_glove_recall, sbert_glove_f1, sbert_glove_accuracy, sbert_glove_hamming_loss, sbert_glove_running_time = evaluate_model_performance(sbert_glove_classification, tweets, labels_binary, sbert_glove_embeddings, mlb)
bart_nli_precision, bart_nli_recall, bart_nli_f1, bart_nli_accuracy, bart_nli_hamming_loss, bart_nli_running_time = evaluate_model_performance(bart_nli_classification, tweets, labels_binary, label_names, mlb)
sbert_kg_precision, sbert_kg_recall, sbert_kg_f1, sbert_kg_accuracy, sbert_kg_hamming_loss, sbert_kg_running_time = evaluate_model_performance(sbert_kg_classification, tweets, labels_binary, sbert_kg_embeddings, mlb)

#Create dataframe to store evaluation metrics
# Rebuilt from the variables overwritten just above.
evaluation_metrics = pd.DataFrame({'Model': ['GloVe', 'BERT-CLS', 'BERT-AVG', 'S-BERT', 'S-BERT-GloVe', 'BART-NLI', 'S-BERT-KG'],
'Precision': [glove_precision, bert_cls_precision, bert_avg_precision, sbert_precision, sbert_glove_precision, bart_nli_precision, sbert_kg_precision],
'Recall': [glove_recall, bert_cls_recall, bert_avg_recall, sbert_recall, sbert_glove_recall, bart_nli_recall, sbert_kg_recall],
'F1 Score': [glove_f1, bert_cls_f1, bert_avg_f1, sbert_f1, sbert_glove_f1, bart_nli_f1, sbert_kg_f1],
'Accuracy': [glove_accuracy, bert_cls_accuracy, bert_avg_accuracy, sbert_accuracy, sbert_glove_accuracy, bart_nli_accuracy, sbert_kg_accuracy],
'Hamming Loss': [glove_hamming_loss, bert_cls_hamming_loss, bert_avg_hamming_loss, sbert_hamming_loss, sbert_glove_hamming_loss, bart_nli_hamming_loss, sbert_kg_hamming_loss],
'Running Time (s)': [glove_running_time, bert_cls_running_time, bert_avg_running_time, sbert_running_time, sbert_glove_running_time, bart_nli_running_time, sbert_kg_running_time]})

#Print evaluation metrics for multilabel classification
print("Evaluation Metrics for Zero-Shot Multilabel Classification:")
print(evaluation_metrics)

In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load data: both CSV files are read from the current working directory.
tweets = pd.read_csv('tweets.csv')
labels = pd.read_csv('labels.csv')

# Candidate class names used by every zero-shot model below.
label_names = ['politics', 'sports', 'entertainment', 'technology', 'business', 'health', 'education']

# Convert labels to a binary indicator matrix for multilabel classification.
# NOTE(review): `labels` is the DataFrame returned by read_csv; iterating a
# DataFrame yields its column labels, so this likely binarizes the header
# rather than the per-row label sets -- confirm and pass the row-wise label
# lists instead.
mlb = MultiLabelBinarizer()
labels_binary = mlb.fit_transform(labels)

# GloVe vectors for the label names.
# assumes glove_model (not defined in this cell) is a DataFrame indexed by
# word -- TODO confirm
glove_embeddings = glove_model.loc[label_names].values

# BERT-CLS and BERT-AVG both use the 'bert-base-uncased' checkpoint, so the
# model and tokenizer are loaded once and shared instead of being loaded and
# held in memory twice.
bert_cls_model = AutoModel.from_pretrained('bert-base-uncased')
bert_cls_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_cls_embeddings = obtain_word_embeddings(label_names, bert_cls_model, bert_cls_tokenizer)
bert_avg_model = bert_cls_model
bert_avg_tokenizer = bert_cls_tokenizer
# NOTE(review): same helper, same checkpoint and same inputs as the CLS call
# above, so these embeddings presumably equal bert_cls_embeddings -- confirm
# whether a distinct pooling step was intended here.
bert_avg_embeddings = obtain_word_embeddings(label_names, bert_avg_model, bert_avg_tokenizer)

# Label embeddings for the S-BERT, S-BERT-GloVe and S-BERT-KG models.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
sbert_embeddings = sbert_model.encode(label_names)
# NOTE(review): both projection matrices are filled with random values and are
# not referenced again in the visible part of this cell; sbert_glove_model and
# sbert_kg_model are not defined in this cell -- confirm where they come from.
sbert_glove_projection_matrix = np.random.rand(768, 20000)
sbert_glove_embeddings = sbert_glove_model.encode(label_names)
sbert_kg_projection_matrix = np.random.rand(768, 20000)
sbert_kg_embeddings = sbert_kg_model.encode(label_names)

# Label embeddings for the BART-NLI model.
# NOTE(review): verify that this checkpoint id exists on the model hub.
bart_nli_model = AutoModel.from_pretrained('facebook/bart-large-nli-stsb-mean-tokens')
bart_nli_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-nli-stsb-mean-tokens')
bart_nli_embeddings = obtain_word_embeddings(label_names, bart_nli_model, bart_nli_tokenizer)

# Multiclass pass: call evaluate_model_performance once per zero-shot model.
# Each call is unpacked into six values (named precision, recall, F1, accuracy,
# Hamming loss and running time). The GloVe and BART-NLI calls receive
# label_names; every other model receives its own *_embeddings variable.
(glove_precision, glove_recall, glove_f1,
 glove_accuracy, glove_hamming_loss, glove_running_time) = evaluate_model_performance(
    glove_avg_classification, tweets, labels_binary, label_names, mlb)
(bert_cls_precision, bert_cls_recall, bert_cls_f1,
 bert_cls_accuracy, bert_cls_hamming_loss, bert_cls_running_time) = evaluate_model_performance(
    bert_cls_classification, tweets, labels_binary, bert_cls_embeddings, mlb)
(bert_avg_precision, bert_avg_recall, bert_avg_f1,
 bert_avg_accuracy, bert_avg_hamming_loss, bert_avg_running_time) = evaluate_model_performance(
    bert_avg_classification, tweets, labels_binary, bert_avg_embeddings, mlb)
(sbert_precision, sbert_recall, sbert_f1,
 sbert_accuracy, sbert_hamming_loss, sbert_running_time) = evaluate_model_performance(
    sbert_classification, tweets, labels_binary, sbert_embeddings, mlb)
(sbert_glove_precision, sbert_glove_recall, sbert_glove_f1,
 sbert_glove_accuracy, sbert_glove_hamming_loss, sbert_glove_running_time) = evaluate_model_performance(
    sbert_glove_classification, tweets, labels_binary, sbert_glove_embeddings, mlb)
(bart_nli_precision, bart_nli_recall, bart_nli_f1,
 bart_nli_accuracy, bart_nli_hamming_loss, bart_nli_running_time) = evaluate_model_performance(
    bart_nli_classification, tweets, labels_binary, label_names, mlb)
(sbert_kg_precision, sbert_kg_recall, sbert_kg_f1,
 sbert_kg_accuracy, sbert_kg_hamming_loss, sbert_kg_running_time) = evaluate_model_performance(
    sbert_kg_classification, tweets, labels_binary, sbert_kg_embeddings, mlb)

# Gather the scores into one table with one row per model. Every column list
# is written in the same model order as the 'Model' column.
evaluation_metrics = pd.DataFrame({
    'Model': [
        'GloVe', 'BERT-CLS', 'BERT-AVG', 'S-BERT',
        'S-BERT-GloVe', 'BART-NLI', 'S-BERT-KG',
    ],
    'Precision': [
        glove_precision, bert_cls_precision, bert_avg_precision, sbert_precision,
        sbert_glove_precision, bart_nli_precision, sbert_kg_precision,
    ],
    'Recall': [
        glove_recall, bert_cls_recall, bert_avg_recall, sbert_recall,
        sbert_glove_recall, bart_nli_recall, sbert_kg_recall,
    ],
    # The S-BERT entry is sbert_f1; the previous spelling `ert_f1` was an
    # undefined name and raised NameError when the table was built.
    'F1 Score': [
        glove_f1, bert_cls_f1, bert_avg_f1, sbert_f1,
        sbert_glove_f1, bart_nli_f1, sbert_kg_f1,
    ],
    'Accuracy': [
        glove_accuracy, bert_cls_accuracy, bert_avg_accuracy, sbert_accuracy,
        sbert_glove_accuracy, bart_nli_accuracy, sbert_kg_accuracy,
    ],
    'Hamming Loss': [
        glove_hamming_loss, bert_cls_hamming_loss, bert_avg_hamming_loss, sbert_hamming_loss,
        sbert_glove_hamming_loss, bart_nli_hamming_loss, sbert_kg_hamming_loss,
    ],
    'Running Time (s)': [
        glove_running_time, bert_cls_running_time, bert_avg_running_time, sbert_running_time,
        sbert_glove_running_time, bart_nli_running_time, sbert_kg_running_time,
    ],
})

# Print evaluation metrics for multiclass classification
print("Evaluation Metrics for Zero-Shot Multiclass Classification:")
print(evaluation_metrics)

# Multilabel pass: call evaluate_model_performance once per zero-shot model.
# Each call is unpacked into six values (named precision, recall, F1, accuracy,
# Hamming loss and running time). The GloVe and BART-NLI calls receive
# label_names; every other model receives its own *_embeddings variable.
# NOTE(review): every call here takes exactly the same arguments as the
# corresponding call in the multiclass section of this cell, so apart from
# running time this table presumably repeats the multiclass one -- confirm
# whether evaluate_model_performance needs a multilabel-specific input.
(glove_precision, glove_recall, glove_f1,
 glove_accuracy, glove_hamming_loss, glove_running_time) = evaluate_model_performance(
    glove_avg_classification, tweets, labels_binary, label_names, mlb)
(bert_cls_precision, bert_cls_recall, bert_cls_f1,
 bert_cls_accuracy, bert_cls_hamming_loss, bert_cls_running_time) = evaluate_model_performance(
    bert_cls_classification, tweets, labels_binary, bert_cls_embeddings, mlb)
(bert_avg_precision, bert_avg_recall, bert_avg_f1,
 bert_avg_accuracy, bert_avg_hamming_loss, bert_avg_running_time) = evaluate_model_performance(
    bert_avg_classification, tweets, labels_binary, bert_avg_embeddings, mlb)
(sbert_precision, sbert_recall, sbert_f1,
 sbert_accuracy, sbert_hamming_loss, sbert_running_time) = evaluate_model_performance(
    sbert_classification, tweets, labels_binary, sbert_embeddings, mlb)
(sbert_glove_precision, sbert_glove_recall, sbert_glove_f1,
 sbert_glove_accuracy, sbert_glove_hamming_loss, sbert_glove_running_time) = evaluate_model_performance(
    sbert_glove_classification, tweets, labels_binary, sbert_glove_embeddings, mlb)
(bart_nli_precision, bart_nli_recall, bart_nli_f1,
 bart_nli_accuracy, bart_nli_hamming_loss, bart_nli_running_time) = evaluate_model_performance(
    bart_nli_classification, tweets, labels_binary, label_names, mlb)
(sbert_kg_precision, sbert_kg_recall, sbert_kg_f1,
 sbert_kg_accuracy, sbert_kg_hamming_loss, sbert_kg_running_time) = evaluate_model_performance(
    sbert_kg_classification, tweets, labels_binary, sbert_kg_embeddings, mlb)

# Gather the scores into one table with one row per model. Every column list
# is written in the same model order as the 'Model' column.
evaluation_metrics = pd.DataFrame({
    'Model': [
        'GloVe', 'BERT-CLS', 'BERT-AVG', 'S-BERT',
        'S-BERT-GloVe', 'BART-NLI', 'S-BERT-KG',
    ],
    'Precision': [
        glove_precision, bert_cls_precision, bert_avg_precision, sbert_precision,
        sbert_glove_precision, bart_nli_precision, sbert_kg_precision,
    ],
    'Recall': [
        glove_recall, bert_cls_recall, bert_avg_recall, sbert_recall,
        sbert_glove_recall, bart_nli_recall, sbert_kg_recall,
    ],
    'F1 Score': [
        glove_f1, bert_cls_f1, bert_avg_f1, sbert_f1,
        sbert_glove_f1, bart_nli_f1, sbert_kg_f1,
    ],
    'Accuracy': [
        glove_accuracy, bert_cls_accuracy, bert_avg_accuracy, sbert_accuracy,
        sbert_glove_accuracy, bart_nli_accuracy, sbert_kg_accuracy,
    ],
    'Hamming Loss': [
        glove_hamming_loss, bert_cls_hamming_loss, bert_avg_hamming_loss, sbert_hamming_loss,
        sbert_glove_hamming_loss, bart_nli_hamming_loss, sbert_kg_hamming_loss,
    ],
    'Running Time (s)': [
        glove_running_time, bert_cls_running_time, bert_avg_running_time, sbert_running_time,
        sbert_glove_running_time, bart_nli_running_time, sbert_kg_running_time,
    ],
})

# Display the multilabel results table.
print("Evaluation Metrics for Zero-Shot Multilabel Classification:", evaluation_metrics, sep="\n")

# Generate 2-D t-SNE visualizations of the embeddings. One TSNE estimator
# (fixed random_state) is reused; each fit_transform call refits it.
tsne = TSNE(n_components=2, random_state=42)

# One colour value per point. labels_binary is the 2-D indicator matrix from
# MultiLabelBinarizer; passed directly as `c`, matplotlib would either reject
# it or read each row as an RGB(A) colour. Collapse it to the index of the
# first positive label in each row (rows with no positive label map to 0).
point_colors = np.asarray(labels_binary).argmax(axis=1)

# Embeddings for the GloVe-AVG model.
# NOTE(review): this looks up glove_model rows by the tweets' row index rather
# than by the words of each tweet -- confirm this is the intended input.
glove_avg_embeddings = glove_model.loc[tweets.index].values

# Embeddings for the S-BERT, S-BERT-GloVe and S-BERT-KG models.
# NOTE(review): `tweets` is the whole DataFrame returned by read_csv;
# encode() presumably expects the tweet texts -- confirm which column to pass.
sbert_embeddings = sbert_model.encode(tweets)
# Rebinds sbert_glove_embeddings; it is not plotted below.
sbert_glove_embeddings = sbert_glove_model.encode(tweets)
sbert_kg_embeddings = sbert_kg_model.encode(tweets)

# t-SNE scatter plot for the GloVe-AVG model
glove_avg_tsne = tsne.fit_transform(glove_avg_embeddings)
plt.scatter(glove_avg_tsne[:, 0], glove_avg_tsne[:, 1], c=point_colors)
plt.title("t-SNE Visualization of GloVe-AVG Embeddings")
plt.show()

# t-SNE scatter plot for the S-BERT model
sbert_tsne = tsne.fit_transform(sbert_embeddings)
plt.scatter(sbert_tsne[:, 0], sbert_tsne[:, 1], c=point_colors)
plt.title("t-SNE Visualization of S-BERT Embeddings")
plt.show()

# t-SNE scatter plot for the S-BERT-KG model
sbert_kg_tsne = tsne.fit_transform(sbert_kg_embeddings)
plt.scatter(sbert_kg_tsne[:, 0], sbert_kg_tsne[:, 1], c=point_colors)
plt.title("t-SNE Visualization of S-BERT-KG Embeddings")
plt.show()