<a href="https://colab.research.google.com/github/tlokeshkumar1/nlp/blob/master/one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from retrofitting import Retrofitting

# Load the hashtag column of both tweet dumps. Missing hashtags are replaced
# by '' so that every item handed to the sentence encoder is a string.
X = (
    pd.concat([pd.read_csv('D11.csv'), pd.read_csv('D2.csv')])['hashtags']
    .fillna('')
    .astype(str)
    .values
)

# Seed phrases describing each category (kept in the original string format).
Advice = ["['Stay at home']","['wash hands']","['wear mask']","['social distancing']"]
China = ["['Wuhan']","['China Coronavirus Updates']","['China news']","['other tweets related to China']"]
Mask = ["['Mask shortage']","['wear mask']","['mask types']","['N50']","['N95']","['3M8210']","['3M9001']","['3M9322']","['3M9501']"]
News = ["['Coronavirus updates']","['news']","['rules']"]
Transportation = ["['Flights']","['traffic']","['traveling']"]
USA = ["['U.S. Coronavirus Updates']","['COVID19']","['U.S. news']","['United States']","['US']","['USA']"]
Vaccine = ["['Vaccine news']","['vaccine progress']","['vaccine injection']"]

# Category name -> seed phrases. Insertion order defines the row order of L.
categories = {
    'Advice': Advice,
    'China': China,
    'Mask': Mask,
    'News': News,
    'Transportation': Transportation,
    'USA': USA,
    'Vaccine': Vaccine,
}

# Flat list of all seed phrases (same order as Advice + China + ... + Vaccine).
L = [phrase for phrases in categories.values() for phrase in phrases]

# phrase_labels[i] is the category that owns L[i]. The similarity matrix has
# one column per *phrase*, so the argmax must be mapped through this array and
# not through the 7-element list of category names.
phrase_labels = np.array(
    [name for name, phrases in categories.items() for _ in phrases]
)

# Unique category names, in the same order as the phrase groups above.
labels = np.array(list(categories))

# Initialize a pre-trained BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed the sentences and labels
X_emb = model.encode(list(X))
L_emb = model.encode(L)

# Load the external knowledge graph (ConceptNet)
cn_data = pd.read_csv('conceptnet.csv', header=None, sep='\t', names=['start', 'relation', 'end'])

# Create the retrofitting model
retro = Retrofitting(model=model, cn_data=cn_data, l2_normalize=True, use_all_singular_values=True)

# Retrofit the sentence and label embeddings to the knowledge graph space.
# NOTE(review): fit_transform is called twice, which presumably re-fits on the
# label embeddings; if Retrofitting offers a plain transform(), the second call
# should probably use it so both sets share one mapping -- confirm against the
# retrofitting package.
X_emb_kg = np.asarray(retro.fit_transform(X_emb))
L_emb_kg = np.asarray(retro.fit_transform(L_emb))

# Normalise rows explicitly so that the dot product below is a true cosine
# similarity (assumes both arrays are 2-D, one row per item -- TODO confirm).
# The floor on the norm avoids dividing by zero for all-zero embeddings.
X_unit = X_emb_kg / np.maximum(np.linalg.norm(X_emb_kg, axis=1, keepdims=True), 1e-12)
L_unit = L_emb_kg / np.maximum(np.linalg.norm(L_emb_kg, axis=1, keepdims=True), 1e-12)

# Classify each tweet as the category of its most similar seed phrase.
# Ties (e.g. 'wear mask' appears under both Advice and Mask) resolve to the
# first matching phrase, because argmax returns the lowest index.
cosine_sim = X_unit @ L_unit.T
predictions = phrase_labels[np.argmax(cosine_sim, axis=1)].tolist()

# Print the predicted labels
print(predictions)


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Seed phrases describing each tweet category
Advice = ["['Stay at home']","['wash hands']","['wear mask']","['social distancing']"]
China = ["['Wuhan']","['China Coronavirus Updates']","['China news']","['other tweets related to China']"]
Mask = ["['Mask shortage']","['wear mask']","['mask types']","['N50']","['N95']","['3M8210']","['3M9001']","['3M9322']","['3M9501']"]
News = ["['Coronavirus updates']","['news']","['rules']"]
Transportation = ["['Flights']","['traffic']","['traveling']"]
USA = ["['U.S. Coronavirus Updates']","['COVID19']","['U.S. news']","['United States']","['US']","['USA']"]
Vaccine = ["['Vaccine news']","['vaccine progress']","['vaccine injection']"]

# Category name -> seed phrases; insertion order fixes the embedding row order.
categories = {
    'Advice': Advice,
    'China': China,
    'Mask': Mask,
    'News': News,
    'Transportation': Transportation,
    'USA': USA,
    'Vaccine': Vaccine,
}
category_phrases = [phrase for phrases in categories.values() for phrase in phrases]
# phrase_categories[i] is the category owning category_phrases[i]; needed
# because the similarity matrix has one column per phrase, not per category.
phrase_categories = [name for name, phrases in categories.items() for _ in phrases]

# Create a dataframe with all the tweets. DataFrame.append was removed in
# pandas 2.0; concat with ignore_index=True also replaces the reset_index call.
tweets_df = pd.concat(
    [pd.read_csv('tweets1.csv'), pd.read_csv('tweets2.csv')],
    ignore_index=True,
)

# Extract the first quoted item from the hashtags column; rows without a
# match (or with missing hashtags) become ''. expand=False yields a Series.
tweets_df['text'] = tweets_df['hashtags'].str.extract(r"'(.*?)'", expand=False).fillna('')

# Define the S-BERT model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Compute embeddings for all the tweets (a plain list of str is passed rather
# than a Series so the encoder never depends on the DataFrame index)
tweet_embeddings = sbert_model.encode(tweets_df['text'].tolist())

# Compute embeddings for all the category seed phrases
category_embeddings = sbert_model.encode(category_phrases)

# Compute the cosine similarity between each tweet embedding and each phrase embedding
similarity_matrix = cosine_similarity(tweet_embeddings, category_embeddings)

# Assign each tweet to the category of its most similar seed phrase.
# argmax returns the lowest index on ties, so a phrase shared by two
# categories (e.g. 'wear mask') resolves to the category listed first.
predicted_labels = [
    phrase_categories[phrase_index]
    for phrase_index in similarity_matrix.argmax(axis=1)
]

# Add the predicted labels to the dataframe
tweets_df['predicted_label'] = predicted_labels


In [None]:
import numpy as np
import scipy.sparse as sp
from retrofitting import Retrofitter

# Load pretrained word embeddings (e.g., GloVe)
word_vectors = np.load('glove_word_vectors.npy')
word_vocab = np.load('glove_word_vocab.npy')

# Word -> row index. np.load returns an ndarray, which has no .index(), and a
# dict makes each lookup O(1) instead of scanning the vocabulary per edge.
# setdefault keeps the first occurrence if the vocabulary contains duplicates.
word_to_index = {}
for position, vocab_word in enumerate(word_vocab):
    word_to_index.setdefault(vocab_word, position)

# Load ConceptNet edges (tab-separated: two node columns, then a weight)
edges = []
with open('conceptnet_edges.txt', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            continue  # skip blank or malformed lines instead of crashing
        edges.append((fields[1], fields[0], float(fields[2])))  # reverse edge direction for retrofitting

# Accumulate edge weights per (row, col) pair, ignoring self-loops and edges
# whose endpoints are not both in the vocabulary.
edge_weights = {}
for u, v, w in edges:
    j = word_to_index.get(u)
    k = word_to_index.get(v)
    if j is None or k is None or j == k:
        continue
    edge_weights[(j, k)] = edge_weights.get((j, k), 0.0) + w

# Convert edges to sparse matrix. The reshape keeps edge_indices 2 x N even
# when no edge survived filtering, so the matrix is then simply empty.
num_words = len(word_vocab)
edge_indices = np.array(list(edge_weights), dtype=np.int64).reshape(-1, 2).T
edge_data = np.array(list(edge_weights.values()), dtype=float)
edge_matrix = sp.csr_matrix(
    (edge_data, (edge_indices[0], edge_indices[1])),
    shape=(num_words, num_words),
)

# Retrofit word embeddings with ConceptNet
retrofitter = Retrofitter(word_vectors, edge_matrix)
retrofitted_vectors = retrofitter.fit(alpha=0.5, beta=0.5, num_iters=10)

# Save retrofitted word embeddings
np.save('retrofitted_word_vectors.npy', retrofitted_vectors)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.linear_model import LinearRegression

# Set up the S-BERT tokenizer/encoder pair and the GloVe lookup table
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
glove_embeddings = {}

# Fill the table from the GloVe text file: each line is a token followed by
# its vector components, stored here as a float32 array keyed by the token.
with open('glove.6B.300d.txt', 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = coefs

# Define knowledge graph embedding function
def get_knowledge_graph_embeddings(vocab):
    """Load ConceptNet embeddings for the words in ``vocab``.

    Reads ``conceptnet_embeddings.txt`` from the working directory, where each
    non-empty line is a word followed by whitespace-separated vector values.

    Args:
        vocab: Iterable of words (hashable) to keep.

    Returns:
        Dict mapping each word of ``vocab`` found in the file to its float32
        vector. Entries appear in file order; words missing from the file are
        absent from the result.
    """
    # A set gives O(1) membership tests regardless of the type of ``vocab``.
    wanted = set(vocab)
    knowledge_graph = {}
    with open('conceptnet_embeddings.txt', 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line: nothing to parse
            word = values[0]
            if word in wanted:
                knowledge_graph[word] = np.asarray(values[1:], dtype='float32')
    return knowledge_graph

# Define function to get embeddings for a list of words
def get_embeddings(words):
    """Return the GloVe vectors for ``words`` as a float32 tensor.

    Args:
        words: Iterable of tokens to look up in ``glove_embeddings``.

    Returns:
        A tensor with one 300-dimensional row per word. Out-of-vocabulary
        words get an all-zero row. An empty input yields an empty tensor.
    """
    # The zero vector uses float32 like the loaded GloVe rows, so the result
    # dtype does not depend on whether any word was out of vocabulary.
    zero_vector = np.zeros(300, dtype='float32')
    vectors = [glove_embeddings.get(word, zero_vector) for word in words]
    if not vectors:
        return torch.tensor(vectors)
    # Stack once in NumPy: torch.tensor on a list of ndarrays is the slow path.
    stacked = np.stack(vectors).astype('float32', copy=False)
    return torch.from_numpy(stacked)

# Shared S-BERT encoder used by the projection and prediction functions below
def _encode_texts(texts):
    """Return mean-pooled S-BERT embeddings for ``texts`` as a NumPy array.

    The batch is padded, so the attention mask is passed to the model and also
    used to exclude padding positions from the mean. Mean pooling over token
    embeddings is the pooling the ``bert-base-nli-mean-tokens`` checkpoint is
    meant to be used with.
    """
    encoded = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only; no autograd graph needed
        token_embeddings = model(**encoded).last_hidden_state
    mask = encoded["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()


# Define function to learn projection matrix
def learn_projection_matrix(vocab):
    """Fit a linear map from S-BERT space to knowledge-graph embedding space.

    Args:
        vocab: Iterable of words used as training pairs.

    Returns:
        Matrix ``P`` such that ``np.dot(sbert_embeddings, P)`` lies in the
        knowledge-graph embedding space.

    Raises:
        ValueError: If no word of ``vocab`` has a knowledge-graph embedding.
    """
    vocab = list(vocab)
    # Get knowledge graph embeddings for vocab
    QV = get_knowledge_graph_embeddings(vocab)
    # Keep only words that have a target vector, in one fixed order, so that
    # row i of the inputs and row i of the targets describe the same word.
    words = [word for word in vocab if word in QV]
    if not words:
        raise ValueError("no vocabulary word has a knowledge graph embedding")
    # Get S-BERT embeddings for the retained words
    fV = _encode_texts(words)
    targets = np.stack([QV[word] for word in words])
    # Learn projection matrix. No intercept is fitted because callers apply P
    # as a pure matrix product and would otherwise silently drop the offset.
    reg = LinearRegression(fit_intercept=False).fit(fV, targets)
    P = reg.coef_.T
    return P


# Define function to get embeddings for a list of labels
def get_label_embeddings(labels, P):
    """Embed ``labels`` with S-BERT and project them with ``P``.

    Args:
        labels: Iterable of label strings.
        P: Projection matrix as returned by ``learn_projection_matrix``.

    Returns:
        Array with one projected embedding row per label.
    """
    # Get S-BERT embeddings for labels
    f_labels = _encode_texts(labels)
    # Project S-BERT embeddings into knowledge graph embedding space
    return np.dot(f_labels, P)


# Define function to predict labels for a list of tweets
def predict_labels(tweets, labels, P):
    """Assign to each tweet the label with the highest cosine similarity.

    Args:
        tweets: Iterable of tweet strings.
        labels: Iterable of candidate label strings.
        P: Projection matrix as returned by ``learn_projection_matrix``.

    Returns:
        List with one element of ``labels`` per tweet; empty if ``tweets`` is
        empty.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    tweets = list(tweets)
    labels = list(labels)
    if not labels:
        raise ValueError("labels must not be empty")
    if not tweets:
        return []
    # Get S-BERT embeddings for tweets and project them into the knowledge
    # graph embedding space
    f_tweets = np.dot(_encode_texts(tweets), P)
    # Get label embeddings and calculate cosine similarity with tweets
    f_labels = get_label_embeddings(labels, P)
    tweet_norms = np.linalg.norm(f_tweets, axis=1)[:, np.newaxis]
    label_norms = np.linalg.norm(f_labels, axis=1)
    # The floor on the denominator avoids NaN when an embedding is all zeros.
    denominator = np.maximum(tweet_norms * label_norms, 1e-12)
    cos_sim = np.dot(f_tweets, f_labels.T) / denominator
    # Get predicted labels
    predicted_labels = [labels[i] for i in np.argmax(cos_sim, axis=1)]
    return predicted_labels
