In [1]:
!pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, pyvis
Successfully installed jedi-0.19.1 pyvis-0.3.2


In [2]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json

# Load the term records (dicts read later via their 'id', 'label' and 'term' keys).
# The encoding is stated explicitly so decoding does not depend on the platform default.
with open('/content/drive/MyDrive/Upwork_collaborations/itg_db_public_term_embeddings.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from transformers import BertTokenizer, BertModel
import torch
import string

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Text Preprocessing
def preprocess(text):
    """Strip ASCII punctuation from ``text`` and lowercase what remains.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any character from ``string.punctuation``, lowercased.
    """
    punctuation_chars = set(string.punctuation)
    without_punctuation = "".join(ch for ch in text if ch not in punctuation_chars)
    return without_punctuation.lower()

# Reduce every record to an (id, label, cleaned term) tuple
processed_data = [
    (record['id'], record['label'], preprocess(record['term']))
    for record in data
]

# Load the pretrained BERT tokenizer, and the matching model placed on the selected device
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased').to(device)

# Function to get BERT embeddings
def get_bert_embedding(text):
    """Embed ``text`` with BERT by mean-pooling the last hidden layer over all tokens.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the pooled embedding; for a single input string
        its shape is (1, hidden_size).
    """
    # truncation=True caps the input at the tokenizer's maximum length, so an
    # over-long text is shortened instead of failing inside the model.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: no gradients are tracked, so the result converts to NumPy directly
    with torch.no_grad():
        outputs = model(**inputs)

    # Average over the token dimension, then move the result back to the CPU
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Embed every cleaned term. Each call returns one row, and stacking the rows keeps
# the matrix 2-D (n_terms, hidden_size) even when there is only a single term
# (a bare .squeeze() would collapse that case to 1-D and break the steps below).
term_vectors = np.vstack([get_bert_embedding(term[2]) for term in processed_data])

# Normalize the vectors row by row (important for cosine similarity)
normalized_vectors = normalize(term_vectors)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [5]:
# KNN using cosine similarity
# The index is fitted on the normalized term vectors; 10 is the default number
# of neighbours returned per query unless a query asks for a different count.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(normalized_vectors)

In [6]:
# Function to find nearest neighbors for a given term
def find_nearest(term, n_neighbors=10):
    """Return the indexed terms closest to ``term`` by cosine distance.

    Args:
        term: Query text. It is cleaned with ``preprocess`` before embedding so
            it is treated the same way as the indexed terms.
        n_neighbors: Maximum number of neighbours to return; capped at the
            number of indexed terms.

    Returns:
        A list of ``(id, label, term, distance)`` tuples in the order returned
        by the KNN index.
    """
    # The index was built from preprocess()-ed terms, so clean the query the same way
    query_vector = get_bert_embedding(preprocess(term)).reshape(1, -1)
    query_vector_normalized = normalize(query_vector)

    # Asking for more neighbours than there are indexed terms would raise
    n_neighbors = min(n_neighbors, len(processed_data))
    distances, indices = knn.kneighbors(query_vector_normalized, n_neighbors=n_neighbors)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        term_id, label, matched_term = processed_data[idx]
        results.append((term_id, label, matched_term, distance))

    return results

In [7]:
import spacy

# Adding an option for tf-idf based stop words generation
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_stop_words_tfidf(corpus, max_features=50):
    """Build a stop-word set from the vocabulary a ``TfidfVectorizer`` keeps for ``corpus``.

    The vectorizer removes scikit-learn's built-in English stop words and keeps
    at most ``max_features`` terms (per the scikit-learn documentation, those
    with the highest term frequency across the corpus). Every kept term is
    returned; because the result is an unordered set, ranking the terms by IDF
    beforehand has no effect and is not done.

    Args:
        corpus: Iterable of documents (strings) to fit the vectorizer on.
        max_features: Maximum number of vocabulary terms to keep.

    Returns:
        A set with the kept vocabulary terms.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    vectorizer.fit(corpus)
    return set(vectorizer.get_feature_names_out())

In [8]:
# Concatenate every raw term into one document and derive the stop words from it
all_text = " ".join(entry['term'] for entry in data)
stop_words = generate_stop_words_tfidf([all_text])

In [None]:
# Inspect the generated stop words (sorted so the displayed order is the same on every run)
sorted(stop_words)

In [9]:
# spaCy pipelines already loaded, keyed by model name
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Load the spaCy pipeline ``name`` once and reuse it on later calls."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def normalize_text_extended(text, custom_stop_words=None, use_tfidf=True, corpus=None):
    """Lemmatize ``text`` and drop punctuation and stop-word tokens.

    Args:
        text: The text to normalize.
        custom_stop_words: Optional iterable of extra stop words to remove.
        use_tfidf: When true and ``corpus`` is given, stop words are also
            generated from ``corpus`` with ``generate_stop_words_tfidf``.
        corpus: Optional iterable of documents used to generate stop words.

    Returns:
        The lemmas of the kept tokens joined by single spaces.
    """
    # Loading the model is expensive and this function is called in a loop,
    # so the pipeline is loaded once and cached.
    nlp = _get_spacy_model()

    if corpus is not None and use_tfidf:
        active_stop_words = generate_stop_words_tfidf(corpus)
    else:
        active_stop_words = set()

    # Add the caller's stop words to the local set (the argument itself is not modified)
    if custom_stop_words is not None:
        active_stop_words.update(custom_stop_words)

    doc = nlp(text)

    # Filtering is done on the token's surface text (case-sensitive); the lemma is what is kept
    lemmatized = [
        token.lemma_
        for token in doc
        if token.text not in string.punctuation and token.text not in active_stop_words
    ]

    return ' '.join(lemmatized)

In [10]:
import random

def generate_distinct_colors(num_colors):
    """Return ``num_colors`` random hex colours with no duplicates.

    Colours are sampled without replacement from the 24-bit RGB space, so no
    two entries are equal (independent random draws could repeat a value).

    Args:
        num_colors: Number of colours to generate; zero or a negative value
            yields an empty list.

    Returns:
        A list of ``"#rrggbb"`` strings.

    Raises:
        ValueError: If more than 2**24 colours are requested, since that many
            distinct 24-bit colours do not exist.
    """
    count = max(num_colors, 0)
    return ["#{:06x}".format(value) for value in random.sample(range(0x1000000), count)]

In [11]:
import re

def filter_strings(input_strings):
    """Return the strings from ``input_strings`` that the rejection pattern does not match.

    The pattern is applied at the start of each string, so a string is dropped
    when it begins with a digit or is only one or two characters long.

    Args:
        input_strings: Iterable of strings to check.

    Returns:
        A list with the kept strings, in their original order.
    """
    rejected = re.compile(r'\b\d+x\d+\b|\d+|^.{1,2}$')

    kept = []
    for candidate in input_strings:
        if rejected.match(candidate) is None:
            kept.append(candidate)
    return kept

In [12]:
# Quick check of filter_strings on a mix of numeric, short and ordinary strings
strings = [
    '33x', '2abbxvb1', "44.x24", "333", 'ab',
    "1000x540x3343", "dentist", "ali", "strategy", "pak21",
]
print(filter_strings(strings))

['dentist', 'ali', 'strategy', 'pak21']


In [13]:
import networkx as nx

# Directed graph that the following code fills with terms and their neighbours
G = nx.DiGraph()

graph_data = processed_data

# Unique labels, taken from the second field of every record
labels = {record[1] for record in graph_data}

# One generated colour per label
distinct_colors = generate_distinct_colors(len(labels))
label_color = dict(zip(labels, distinct_colors))

# for i, node in enumerate(graph_data):
    # term = normalize_text_extended(node[2], custom_stop_words= stop_words)
    # term = filter_strings([node[2]])
    # term_text = " ".join(term).strip()

    # if len(term_text)>2:
      # G.add_node(str(node[2]), n_id = str(node[0]), color=label_color[node[1]])

# Build the graph: one coloured node per cleaned term, with edges to the tokens
# found in the text of its nearest neighbours.
for each in graph_data:
    # Remove the words rejected by filter_strings from the term
    term_text = " ".join(filter_strings(each[2].split(" "))).strip()
    if not term_text:
        continue

    # Register the node under the same name the edges below start from, so the
    # label colour lands on the connected node rather than on a separate one.
    G.add_node(term_text, color=label_color[each[1]])

    # Tokens of the neighbouring terms, lemmatized and stripped of stop words
    neighbors = find_nearest(term_text)
    n_processed = normalize_text_extended(" ".join([n[2] for n in neighbors]), custom_stop_words=stop_words)

    for idx, n in enumerate(n_processed.split(" ")):
        final_n = " ".join(filter_strings([n])).strip()
        if final_n:
            # The first token gets weight 3, the second 2, every later one 1
            G.add_edge(term_text, final_n, weight=max(3 - idx, 1))

# Give reciprocal edges the combined weight of both directions.
# The original weights are snapshotted first so each pair is summed exactly once;
# updating in place while iterating adds the already-combined value a second time
# when the reverse edge is visited.
original_weights = {(u, v): attrs['weight'] for u, v, attrs in G.edges(data=True)}
for (u, v), weight in original_weights.items():
    if (v, u) in original_weights:
        G[u][v]['weight'] = weight + original_weights[(v, u)]

# Node size is proportional to the node's degree: size = scale * degree
scale = 3
d = {node: scale * degree for node, degree in G.degree}

# Store the computed sizes on the graph as the 'size' node attribute
nx.set_node_attributes(G, d, 'size')

In [14]:
from pyvis.network import Network

# Network view with a dark background and white labels, spanning the full cell width
net = Network(height='625px',
        width = "100%",
        bgcolor='#222222',
        font_color='white',
        # cdn_resources='inline',
    )

# Load the networkx graph G built above into the pyvis network
net.from_nx(G)

# Display options passed as a JSON string: node and edge appearance, interaction
# controls (hover, navigation buttons) and the forceAtlas2Based physics solver.
net.set_options("""{
    "nodes": {
            "borderWidth": 0,
            "color": {
            "border": "#2B7CE9",
            "background": "#97C2FC",
            "highlight": {
                    "border": "#66fa89",
                    "background": "#66fa89"
            }
            },
            "font": {
            "size": 15,
            "face": "verdana"
            },
            "scaling": {
            "min": 89
            }
    },
    "edges": {
            "color": {
            "inherit": false
            },
            "selfReference": {
            "angle": 0.7853981633974483
            },
            "smooth": {
            "forceDirection": "none"
            }
    },
    "interaction": {
            "hover": true,
            "hoverConnectedEdges": true,
            "multiselect": false,
            "navigationButtons": true
    },
    "physics": {
            "forceAtlas2Based": {
            "centralGravity": 0.05,
            "springLength": 100
            },
            "minVelocity": 0.75,
            "solver": "forceAtlas2Based"
    }
    }
    """)

In [17]:
# Folder on the mounted Drive used for exported graphs
path = '/content/drive/MyDrive/Upwork_collaborations'
# net.save_graph(f'{path}/pyvis_graph.html')
# Write the graph to an HTML file; the relative name resolves against the current working directory
net.save_graph('pyvis_graph.html')

In [18]:
# Save a dated copy of the graph into the Drive folder held in `path`
net.save_graph(f'{path}/pyvis_graph_20_jan.html')
