##### Code to compare the intermediate topics (i.e. the `next_keyword`) selected by different methods.

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import requests
import re
import time
import nltk
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the NLTK 'stopwords' corpus, then keep the English stop words in a
# set so that the membership checks done while filtering keywords are cheap.
nltk.download('stopwords')
stop_words = {stop_word for stop_word in stopwords.words('english')}

In [None]:
# Input table of keyword pairs; the analysis loop reads its first two columns.
ANALYSIS_INPUT_CSV = "/content/drive/MyDrive/BTech_Project/analysis1.csv"
# Checkpoint name shared by the tokenizer and the encoder so they stay matched.
BERT_CHECKPOINT = 'bert-base-uncased'

df = pd.read_csv(ANALYSIS_INPUT_CSV)

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
def get_bert_embedding(word):
    """Embed ``word`` with the module-level ``tokenizer`` and ``model``.

    The text is tokenized into PyTorch tensors, run through the model with
    gradient tracking disabled, and the last-layer hidden state at token
    position 0 is returned (presumably the [CLS] position, given that a
    BertTokenizer is loaded in this notebook).

    Args:
        word: Text to embed.

    Returns:
        The position-0 hidden state with singleton dimensions squeezed out,
        converted to a NumPy array.
    """
    encoded = tokenizer(word, return_tensors='pt')

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().numpy()

def cosine_similarity(word1, word2):
    """Cosine similarity between the BERT embeddings of two pieces of text.

    Args:
        word1: First text; embedded with ``get_bert_embedding``.
        word2: Second text; embedded with ``get_bert_embedding``.

    Returns:
        The dot product of the two embeddings divided by the product of their
        Euclidean norms.
    """
    vec_a, vec_b = get_bert_embedding(word1), get_bert_embedding(word2)

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_relatedness(word1, word2):
    """Return the ConceptNet relatedness score between two English terms.

    Calls the ``/relatedness`` endpoint of api.conceptnet.io for the nodes
    ``/c/en/<word1>`` and ``/c/en/<word2>``.

    Args:
        word1: First English term.
        word2: Second English term.

    Returns:
        The ``value`` field of the JSON response. ``0`` is returned, after
        printing the word pair, when the request fails, the HTTP status is not
        200, the body is not valid JSON, or ``value`` is missing/null. The
        result is therefore always comparable with other scores.
    """
    from urllib.parse import quote

    # Percent-encode the terms so characters such as '&', '#' or '?' cannot
    # change the structure of the query string.
    node1 = quote(str(word1), safe="")
    node2 = quote(str(word2), safe="")
    url = f"http://api.conceptnet.io/relatedness?node1=/c/en/{node1}&node2=/c/en/{node2}"

    relatedness = None
    try:
        # A timeout keeps one stalled connection from hanging a long batch run.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            relatedness = response.json().get("value")
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON body: handled like a non-200 reply.
        relatedness = None

    if relatedness is None:
        print(word1, word2)
        return 0
    return relatedness

In [None]:
def get_related_keywords_hop1(kw):
    """Collect single-word English neighbours of ``kw`` from ConceptNet.

    Pages through ``http://api.conceptnet.io/c/en/<kw>`` 1000 edges at a time.
    For every edge whose two endpoints are both English, the endpoint label
    that differs from ``kw`` is taken as the neighbour. Labels containing a
    space or hyphen are dropped, as are stop words.

    Args:
        kw: Keyword whose ConceptNet edges are fetched.

    Returns:
        A set containing ``kw`` itself plus the lower-cased neighbours.
    """
    related_kws = {kw}
    page_size = 1000
    offset = 0

    while True:
        url = f"http://api.conceptnet.io/c/en/{kw}?limit={page_size}&offset={offset}"
        response = requests.get(url)

        if response.status_code != 200:
            # Stop paging on an error status. Advancing the offset and trying
            # again would never terminate if the server keeps failing.
            print(kw, response.status_code)
            break

        edges = response.json().get("edges", [])
        if not edges:
            break

        for edge in edges:
            start = edge.get("start", {})
            end = edge.get("end", {})

            if start.get("language", "") != "en" or end.get("language", "") != "en":
                continue

            start_word = start.get("label")
            connected_word = start_word if start_word != kw else end.get("label")

            # An endpoint without a label cannot be used as a keyword.
            if not connected_word:
                continue

            # Keep single-token labels only.
            if len(re.split(' |-', connected_word)) != 1:
                continue

            # Lower-case before the stop-word test so "The" is filtered the
            # same way as "the".
            connected_word = connected_word.lower()
            if connected_word not in stop_words:
                related_kws.add(connected_word)

        offset += page_size

    return related_kws


def get_related_keywords_hop2(related_kws_hop1):
    """Expand a keyword collection by one more ConceptNet hop.

    For every keyword in ``related_kws_hop1`` this pages through
    ``http://api.conceptnet.io/c/en/<keyword>`` 1000 edges at a time. For each
    edge whose two endpoints are both English, it adds the endpoint label that
    differs from the keyword, provided the label has no space or hyphen.
    Unlike the hop-1 lookup, no stop-word filtering is applied here.

    Args:
        related_kws_hop1: Iterable of keywords to expand.

    Returns:
        A set holding every input keyword plus the lower-cased neighbours.
    """
    related_kws_hop2 = set()

    for seed_kw in related_kws_hop1:
        related_kws_hop2.add(seed_kw)

        offset = 0
        while True:
            url = f"http://api.conceptnet.io/c/en/{seed_kw}?limit=1000&offset={offset}"
            response = requests.get(url)

            if response.status_code != 200:
                # Give up on this keyword. Advancing the offset and trying
                # again would never terminate if the server keeps failing.
                print(seed_kw, response.status_code)
                break

            edges = response.json().get("edges", [])
            if not edges:
                break

            for edge in edges:
                start = edge.get("start", {})
                end = edge.get("end", {})

                if start.get("language", "") != "en" or end.get("language", "") != "en":
                    continue

                start_word = start.get("label")
                connected_word = start_word if start_word != seed_kw else end.get("label")

                # An endpoint without a label cannot be used as a keyword.
                if not connected_word:
                    continue

                # Keep single-token labels only.
                if len(re.split(' |-', connected_word)) != 1:
                    continue

                related_kws_hop2.add(connected_word.lower())

            offset += 1000

    return related_kws_hop2

In [None]:
def next_keyword_cossim(candidate_kws, target_kw):
    """Pick the candidate whose BERT cosine similarity to ``target_kw`` is highest.

    Args:
        candidate_kws: Iterable of candidate keywords.
        target_kw: Keyword every candidate is compared against.

    Returns:
        The best-scoring candidate, or ``""`` when there are no candidates.
        Among equal scores the candidate seen first in iteration order wins.
    """
    scored = [(kw, cosine_similarity(kw, target_kw)) for kw in candidate_kws]

    if not scored:
        return ""

    # Stable descending sort: the head of the list is the best match.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[0][0]

def next_keyword_relatedness(candidate_kws, target_kw, delay=0.8):
    """Pick the candidate with the highest ConceptNet relatedness to ``target_kw``.

    Args:
        candidate_kws: Iterable of candidate keywords.
        target_kw: Keyword every candidate is compared against.
        delay: Seconds to sleep before each ``get_relatedness`` call, to space
            out the API requests. Defaults to the previously hard-coded 0.8.

    Returns:
        The best-scoring candidate, or ``""`` when there are no candidates.
        Among equal scores the candidate seen first in iteration order wins.
    """
    kw_simvalues = []
    for word in candidate_kws:
        time.sleep(delay)
        curr_sim = get_relatedness(word, target_kw)
        # A missing score would make the comparison below raise TypeError
        # (None vs. float), so count it as "not related".
        kw_simvalues.append((word, 0 if curr_sim is None else curr_sim))

    if not kw_simvalues:
        return ""

    # max() returns the first maximal element, which matches taking the head
    # of a stable descending sort.
    return max(kw_simvalues, key=lambda pair: pair[1])[0]

In [None]:
# One list per output column; each gets one entry per input row and is
# attached to df after the loop.
og_cossim = []
og_relatedness = []
intersection = []
cossim = []
relatedness = []

for index, row in df.iterrows():
    # Read the two keywords by position with .iloc. Plain row[0] / row[1] on a
    # label-indexed Series relies on the deprecated positional fallback.
    # presumably column 0 is the starting keyword and column 1 the target - TODO confirm
    start_kw, target_kw = row.iloc[0], row.iloc[1]

    rw1 = get_related_keywords_hop1(start_kw)
    rw2 = get_related_keywords_hop1(target_kw)
    intersecting_words = rw1.intersection(rw2)

    if intersecting_words:
        # "(1-1)": the two hop-1 neighbourhoods already overlap.
        intersection.append(str(len(intersecting_words)) + " (1-1)")
    else:
        # "(1-2)": widen only the second keyword's neighbourhood by one hop.
        rw2 = get_related_keywords_hop2(rw2)
        intersecting_words = rw1.intersection(rw2)
        if intersecting_words:
            intersection.append(str(len(intersecting_words)) + " (1-2)")
        else:
            intersection.append("0 (1-2)")

    # "og_*": best candidate among all hop-1 neighbours of the first keyword.
    og_cossim.append(next_keyword_cossim(rw1, target_kw))
    og_relatedness.append(next_keyword_relatedness(rw1, target_kw))
    # Same two selection methods, restricted to the shared neighbours.
    cossim.append(next_keyword_cossim(intersecting_words, target_kw))
    relatedness.append(next_keyword_relatedness(intersecting_words, target_kw))

df["og_cossim"] = og_cossim
df["og_relatedness"] = og_relatedness
df["intersection"] = intersection
df["cossim"] = cossim
df["relatedness"] = relatedness

df.to_csv("/content/drive/MyDrive/BTech_Project/analysis.csv")

0
1
2
3
