<a href="https://colab.research.google.com/github/tlokeshkumar1/nlp/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Python code for S-BERT-KG embedding alignment
from sentence_transformers import SentenceTransformer
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd

#Load the data from two csv files and merge them into a single dataframe
RESULTS_DIR = 'drive/MyDrive'
tweets_df1 = pd.read_csv(RESULTS_DIR + '/D11.csv', encoding='latin1', engine='python')
tweets_df2 = pd.read_csv(RESULTS_DIR + '/D2.csv', encoding='latin1', engine='python')
df = pd.concat([tweets_df1, tweets_df2], ignore_index=True)

# Load pre-trained S-BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Define a function to embed a text using S-BERT
def embed_text(text):
    # Embed the text using S-BERT
    if isinstance(text, str):
        embeddings = model.encode(text)
    else:
        embeddings = model.encode([t for t in text if isinstance(t, str)])
    # Convert the embeddings to a PyTorch tensor
    tensor = torch.FloatTensor(embeddings)
    # Return the tensor
    return tensor

# Define a function to create a PyTorch Geometric graph from a knowledge graph
def create_kg_graph(kg, num_nodes):
    # Get the nodes and edges from the knowledge graph
    nodes = list(kg.keys())
    edges = []
    for node1 in kg.keys():
        for node2 in kg[node1]:
            if isinstance(node2, list):
                node2 = ', '.join(node2)
            if node1 in nodes and node2 in nodes:
                edges.append((nodes.index(node1), nodes.index(node2)))
    # Add dummy nodes to the graph so that it has the same number of nodes as the size of the weight tensor
    for i in range(num_nodes - len(nodes)):
        nodes.append(f'dummy{i}')
        for j in range(len(nodes)-1):
            edges.append((j, len(nodes)-1))
    # Create the PyTorch Geometric graph data object
    data = Data(x=torch.eye(len(nodes)), edge_index=torch.LongTensor(edges).t())
    # Return the graph data object
    return data

# Define a knowledge graph
kg = {
    'tweet': ['Advice','China','Mask','News','Transportation','USA','Vaccine'],
    'Advice': ['Stay at home','wash hands','wear mask','social distancing'],
    'China': ['Wuhan','China Coronavirus Updates','China news','other tweets related to China'],
    'Mask': ['Mask shortage','wear mask','mask types','N50','N95','3M8210','3M9001','3M9322','3M9501'],
    'News': ['Coronavirus updates','news','rules'],
    'Transportation': ['Flights','traffic','traveling'],
    'USA': ['U.S. Coronavirus Updates','COVID19','U.S. news','United States','US','USA'],
    'Vaccine': ['Vaccine news','vaccine progress','vaccine injection'],
}

# Define a GCN model
class GCN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, out_channels)
        self.conv2 = GCNConv(out_channels, out_channels)

    def forward(self, x, edge_index):
        # Add dummy nodes to x so that it has the same number of nodes as edge_index has edges
        x = torch.cat([x, torch.zeros((edge_index.size(1)-x.size(0), x.size(1)), device=x.device)], dim=0)
        x = self.conv1(x, edge_index)
        x = torch.nn.functional.relu(x)
        # Remove the dummy nodes from the output tensor
        x = x[:x.size(0)-edge_index.size(1)]
        x = self.conv2(x, edge_index)
        return x

# Define a function to train the GCN model
def train_gcn(model, data, embeddings, epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.MSELoss()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(data.x, data.edge_index)
        loss = criterion(output, embeddings)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# Embed the text and create the PyTorch Geometric graph data object
text = df["hashtags"].apply(lambda x: ', '.join(x.strip("[]").split(", ")) if isinstance(x, str) else '')

embeddings = embed_text(text)
# Reshape the embeddings tensor to have shape (num_tweets + num_dummy_nodes, embedding_size)
num_dummy_nodes = embeddings.size(0) - len(kg)
embeddings = embeddings.reshape(len(kg) + num_dummy_nodes, -1)
graph_data = create_kg_graph(kg, embeddings.size(0))

# Train the GCN model
model = GCN(embeddings.shape[1], 64)
train_gcn(model, graph_data, embeddings, 100)

# Get the aligned embeddings
aligned_embeddings = model(graph_data.x, graph_data.edge_index)
print(aligned_embeddings)

In [None]:
#python code for S-BERT-KG zero shot text classification
from sentence_transformers import SentenceTransformer
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd

# Load pre-trained S-BERT model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Define a function to embed a text using S-BERT
def embed_text(text):
    # Embed the text using S-BERT
    embeddings = model.encode(text)
    # Convert the embeddings to a PyTorch tensor
    tensor = torch.FloatTensor(embeddings)
    # Return the tensor
    return tensor

def embed_node(node):
    # Embed a node using the S-BERT model
    return sbert_model.encode(node) 

# Define a function to create a PyTorch Geometric graph from a knowledge graph
def create_kg_graph(kg):
    # Get the nodes and edges from the knowledge graph
    nodes = []
    edges = []
    for node1 in kg.keys():
        if node1 not in nodes:
            nodes.append(node1)
        for node2 in kg[node1]:
            if node2 not in nodes:
                nodes.append(node2)
            edges.append((nodes.index(node1), nodes.index(node2)))
    # Embed the nodes using the S-BERT model
    embeddings = [embed_node(node) for node in nodes]
    embeddings = [torch.from_numpy(embedding) for embedding in embeddings]
    embeddings = torch.stack(embeddings)
    # Create the PyTorch Geometric graph data object
    data = Data(x=embeddings, edge_index=torch.LongTensor(edges).transpose(0, 1))
    # Return the graph data object
    return data

class GCN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, hidden_size)
        self.lin = torch.nn.Linear(hidden_size, input_size)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.nn.functional.relu(x)
        x = self.conv2(x, edge_index)
        x = self.lin(x)
        return x

# Define a function to train a GCN model on a knowledge graph and return the aligned embeddings
def train_gcn(kg, text):
    # Embed the text and create the PyTorch Geometric graph data object
    embeddings = embed_text(text)
    graph_data = create_kg_graph(kg)
    # Define a GCN model and train it on the embeddings and graph data
    model = GCN(embeddings.shape[1], 768)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.MSELoss()
    for epoch in range(100):
        optimizer.zero_grad()
        output = model(graph_data.x, graph_data.edge_index)
        loss = criterion(output, embeddings)
        loss.backward()
        optimizer.step()
    # Embed the nodes using the trained GCN model
    aligned_embeddings = model(graph_data.x, graph_data.edge_index)
    # Return the aligned embeddings
    return aligned_embeddings

# Define a function to classify a text using a knowledge graph and aligned embeddings
def classify_text(kg, text, target_embeddings):
    # Create the PyTorch Geometric graph data object
    graph_data = create_kg_graph(kg)
    # Check if the number of nodes in the knowledge graph is greater than or equal to the number of labels
    if graph_data.x.shape[0] >= target_embeddings.shape[0]:
        return list(kg.keys())[0]
    # Create label embeddings that match the size of the target embeddings
    zero_vectors = torch.zeros(target_embeddings.shape[0] - graph_data.x.shape[0], graph_data.x.shape[1])
    label_embeddings = torch.cat([graph_data.x, zero_vectors], dim=0)
    # Get the aligned embeddings
    aligned_embeddings = train_gcn(kg, text)
    # Compute the cosine similarities between the aligned embeddings and the label embeddings
    similarities = torch.nn.functional.cosine_similarity(aligned_embeddings, label_embeddings)
    # Get the index of the label with the highest similarity
    label_index = similarities.argmax().item()
    # Return the predicted label
    predicted_label = list(kg.keys())[label_index]
    return predicted_label

# Define a knowledge graph
kg = {
    'tweet': ['Advice','China','Mask','News','Transportation','USA','Vaccine'],
    'Advice': ['Stay at home','wash hands','wear mask','social distancing'],
    'China': ['Wuhan','China Coronavirus Updates','China news','other tweets related to China'],
    'Mask': ['Mask shortage','wear mask','mask types','N50','N95','3M8210','3M9001','3M9322','3M9501'],
    'News': ['Coronavirus updates','news','rules'],
    'Transportation': ['Flights','traffic','traveling'],
    'USA': ['U.S. Coronavirus Updates','COVID19','U.S. news','United States','US','USA'],
    'Vaccine': ['Vaccine news','vaccine progress','vaccine injection'],
}
labels = embed_text(['Advice','China','Mask','News','Transportation','USA','Vaccine'])

#Load the data from two csv files and merge them into a single dataframe
RESULTS_DIR = 'drive/MyDrive'
tweets_df1 = pd.read_csv(RESULTS_DIR + '/D11.csv', encoding='latin1', engine='python')
tweets_df2 = pd.read_csv(RESULTS_DIR + '/D2.csv', encoding='latin1', engine='python')
df = pd.concat([tweets_df1, tweets_df2], ignore_index=True)

# Classify a text using the knowledge graph and label embeddings
text = df["hashtags"].apply(lambda x: ', '.join(x.strip("[]").split(", ")) if isinstance(x, str) else '')
predicted_label = classify_text(kg, text, labels)
print(predicted_label)

tweet


In [None]:
#Fixed code for S-BERT-KG embedding alignment
from sentence_transformers import SentenceTransformer
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd

#Load the data from two csv files and merge them into a single dataframe
RESULTS_DIR = 'drive/MyDrive'
tweets_df1 = pd.read_csv(RESULTS_DIR + '/D11.csv', encoding='latin1', engine='python')
tweets_df2 = pd.read_csv(RESULTS_DIR + '/D2.csv', encoding='latin1', engine='python')
df = pd.concat([tweets_df1, tweets_df2], ignore_index=True)

#Load pre-trained S-BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

#Define a function to embed a text using S-BERT
def embed_text(text):
  # Embed the text using S-BERT
  if isinstance(text, str):
      embeddings = model.encode(text)
  else:
    embeddings = model.encode([t for t in text if isinstance(t, str)])
    # Convert the embeddings to a PyTorch tensor
    tensor = torch.FloatTensor(embeddings)
  # Return the tensor
  return tensor

#Define a function to create a PyTorch Geometric graph from a knowledge graph
def create_kg_graph(kg):
  # Get the nodes and edges from the knowledge graph
  nodes = list(kg.keys())
  edges = []
  for node1 in kg.keys():
    for node2 in kg[node1]:
      if isinstance(node2, list):
        node2 = ', '.join(node2)
      if node1 in nodes and node2 in nodes:
        edges.append((nodes.index(node1), nodes.index(node2)))
        # Create the PyTorch Geometric graph data object
        data = Data(x=torch.eye(len(nodes)), edge_index=torch.LongTensor(edges).t())
  # Return the graph data object
  return data

#Define a knowledge graph
kg = {
'tweet': ['Advice','China','Mask','News','Transportation','USA','Vaccine'],
'Advice': ['Stay at home','wash hands','wear mask','social distancing'],
'China': ['Wuhan','China Coronavirus Updates','China news','other tweets related to China'],
'Mask': ['Mask shortage','wear mask','mask types','N50','N95','3M8210','3M9001','3M9322','3M9501'],
'News': ['Coronavirus updates','news','rules'],
'Transportation': ['Flights','traffic','traveling'],
'USA': ['U.S. Coronavirus Updates','COVID19','U.S. news','United States','US','USA'],
'Vaccine': ['Vaccine news','vaccine progress','vaccine injection'],
}

#Define a GCN model
class GCN(torch.nn.Module):
  def init(self, in_channels, out_channels):
    super(GCN, self).init()
    self.conv1 = GCNConv(in_channels, out_channels)
    self.conv2 = GCNConv(out_channels, out_channels)

  def forward(self, x, edge_index):
    x = self.conv1(x, edge_index)
    x = torch.nn.functional.relu(x)
    x = self.conv2(x, edge_index)
    return x

#Define a function to train the GCN model
def train_gcn(model, data, embeddings, epochs):
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
  criterion = torch.nn.MSELoss()
  for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(data.x, data.edge_index)
    loss = criterion(output, embeddings)
    loss.backward()
    optimizer.step()
  print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

#Embed the text and create the PyTorch Geometric graph data object
text = df["hashtags"].apply(lambda x: ', '.join(x.strip("[]").split(", ")) if isinstance(x, str) else '')

embeddings = embed_text(text)

#Reshape the embeddings tensor to have shape (num_tweets, embedding_size)
embeddings = embeddings.reshape(-1, embeddings.shape[-1])
graph_data = create_kg_graph(kg)

#Train the GCN model
model = GCN(embeddings.shape[1], 64)
train_gcn(model, graph_data, embeddings, 100)

#Get the aligned embeddings
aligned_embeddings = model(graph_data.x, graph_data.edge_index)
print(aligned_embeddings)

TypeError: ignored

In [None]:
pip install --upgrade tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install --upgrade transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

#Load the pre-trained S-BERT model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")

#Load the data from two csv files and merge them into a single dataframe
RESULTS_DIR = 'drive/MyDrive'
tweets_df1 = pd.read_csv(RESULTS_DIR + '/D11.csv', encoding='latin1', engine='python')
tweets_df2 = pd.read_csv(RESULTS_DIR + '/D2.csv', encoding='latin1', engine='python')
df = pd.concat([tweets_df1, tweets_df2], ignore_index=True)

#Read the string attributes of hashtags and convert them into lists
df["hashtags"] = df["hashtags"].apply(lambda x: x.strip("[]").split(", ") if isinstance(x, str) else [])

#Define the set of possible labels
labels = ["Vaccine", "USA", "Transportation", "News", "Mask", "China", "Advice"]

#Define the set of tweets to be categorized
tweets = df["hashtags"].tolist()

#Define the set of informative tweets for each label
advice = ["Stay at home", "wash hands", "wear mask", "social distancing"]
china = ["Wuhan", "China Coronavirus Updates", "China news", "other tweets related to China"]
mask = ["Mask shortage", "wear mask", "mask types", "N50", "N95", "3M8210", "3M9001", "3M9322", "3M9501"]
news = ["Coronavirus updates", "news", "rules"]
transportation = ["Flights", "traffic", "traveling"]
usa = ["U.S. Coronavirus Updates", "COVID19", "U.S. news", "United States", "US", "USA"]
vaccine = ["Vaccine news", "vaccine progress", "vaccine injection"]

#Define the set of informative sentences for each label
sentences = advice + china + mask + news + transportation + usa + vaccine

#Embed the sentences and labels using the pre-trained S-BERT model
sentence_embeddings = model(tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)
label_embeddings = model(tokenizer(labels, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)

#Embed the tweets using the pre-trained S-BERT model
tweet_embeddings = model(tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)

#Construct a knowledge graph embedding space with the RE method
knowledge_graph = np.concatenate((sentence_embeddings.detach().numpy(), label_embeddings.detach().numpy()), axis=0)
projection_matrix = np.linalg.pinv(knowledge_graph.T @ knowledge_graph) @ knowledge_graph.T @ tweet_embeddings.detach().numpy()

#Classify the tweets based on their cosine similarity with the label embeddings
for i, tweet in enumerate(tweets):
  tweet_embedding = tweet_embeddings[i].detach().numpy()
  label_scores = cosine_similarity(tweet_embedding.reshape(1, -1) @ projection_matrix, label_embeddings.detach().numpy() @ projection_matrix)
  predicted_labels = [labels[j] for j in np.argsort(label_scores[0])[::-1][:3]]
  print(f"Tweet: {tweet}")
  print(f"Predicted labels: {predicted_labels}\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


AttributeError: ignored

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Load the data from two csv files and merge them into a single dataframe
RESULTS_DIR = 'drive/MyDrive'
tweets_df1 = pd.read_csv(RESULTS_DIR + '/D11.csv', encoding='latin1', engine='python')
tweets_df2 = pd.read_csv(RESULTS_DIR + '/D2.csv', encoding='latin1', engine='python')
df = pd.concat([tweets_df1, tweets_df2], ignore_index=True)

# Read the string attributes of hashtags and convert them into lists
df["hashtags"] = df["hashtags"].apply(lambda x: x.strip("[]").split(", ") if isinstance(x, str) else [])

# Define the set of possible labels
labels = ["Vaccine", "USA", "Transportation", "News", "Mask", "China", "Advice"]

# Define the set of tweets to be categorized
tweets = df["hashtags"].tolist()

# Define the set of informative tweets for each label
advice = ["Stay at home", "wash hands", "wear mask", "social distancing"]
china = ["Wuhan", "China Coronavirus Updates", "China news", "other tweets related to China"]
mask = ["Mask shortage", "wear mask", "mask types", "N50", "N95", "3M8210", "3M9001", "3M9322", "3M9501"]
news = ["Coronavirus updates", "news", "rules"]
transportation = ["Flights", "traffic", "traveling"]
usa = ["U.S. Coronavirus Updates", "COVID19", "U.S. news", "United States", "US", "USA"]
vaccine = ["Vaccine news", "vaccine progress", "vaccine injection"]

# Define the set of informative sentences for each label
sentences = advice + china + mask + news + transportation + usa + vaccine

# Embed the sentences and labels using the pre-trained BERT model
sentence_embeddings = model(tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)
label_embeddings = model(tokenizer(labels, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)

# Embed the tweets using the pre-trained BERT model
tweet_embeddings = model(tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")).last_hidden_state.mean(dim=1)

# Construct a knowledge graph embedding space with the RE method
knowledge_graph = np.concatenate((sentence_embeddings.detach().numpy(), label_embeddings.detach().numpy()), axis=0)
projection_matrix = np.linalg.pinv(knowledge_graph.T @ knowledge_graph) @ knowledge_graph.T @ tweet_embeddings.detach().numpy()

# Classify the tweets based on their cosine similarity with the label embeddings
for i, tweet in enumerate(tweets):
    tweet_embedding = tweet_embeddings[i].detach().numpy()
    label_scores = cosine_similarity(tweet_embedding.reshape(1, -1) @ projection_matrix, label_embeddings.detach().numpy() @ projection_matrix)
    predicted_labels = [labels[j] for j in np.argsort(label_scores[0])[::-1][:3]]
    print(f"Tweet: {tweet}")
    print(f"Predicted labels: {predicted_labels}\n")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: ignored