In [2]:
import json
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

In [9]:
import json

# Load the keyword records produced by the earlier mapping step.
with open("Final_mapped_keywords_mapped.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Gather every distinct technology string; records whose "Technologies"
# field is missing, null or empty contribute nothing.
tech_set = {
    tech
    for entry in data
    for tech in (entry.get("Technologies") or ())
}

# Sorted so that downstream steps see the terms in a stable order.
terms = sorted(tech_set)
print(f"Total unique technologies: {len(terms)}")

Total unique technologies: 7251


In [19]:
import re

def normalize(term: str) -> str:
    """Return a lowercase, single-spaced form of ``term`` made only of a-z, 0-9 and spaces.

    Every run of characters outside ``a-z``, ``0-9`` and space is replaced by
    one space, whitespace runs are collapsed, and the result is stripped.
    A term with no ASCII letters or digits therefore becomes ``""``.
    """
    lowered = term.lower()
    # Punctuation, symbols and non-ASCII characters act as word separators.
    separated = re.sub(r"[^a-z0-9 ]+", " ", lowered)
    # Collapse the separators introduced above, then trim both ends.
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip()

# Normalized form of every term, in the same order as `terms`
normed = list(map(normalize, terms))

# Keep only (original, normalized) pairs whose normalized text is non-empty
filtered = [pair for pair in zip(terms, normed) if pair[1]]

# Split the surviving pairs back into two parallel tuples
terms_filt, normed_filt = zip(*filtered)

print(f"Filtered out {len(terms) - len(terms_filt)} empty‐norm terms; remaining {len(terms_filt)}")

Filtered out 0 empty‐norm terms; remaining 7251


In [21]:
# Vectorize the filtered normalized strings with TF‑IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the normalized terms, then encode
# each term as one sparse TF-IDF row.
# NOTE: scikit-learn's default token pattern ignores one-character tokens, so
# a term made up solely of such tokens ends up as an all-zero row.
vectorizer = TfidfVectorizer()
vectorizer.fit(normed_filt)
X = vectorizer.transform(normed_filt)

print("TF‑IDF matrix shape:", X.shape)

TF‑IDF matrix shape: (7251, 5019)


In [23]:
import numpy as np

# Number of stored non-zero entries in each TF-IDF row
row_nnz = X.getnnz(axis=1)
# True for rows that carry at least one feature and can therefore be clustered
mask = row_nnz > 0
zero_count = np.sum(np.logical_not(mask))
print(f"Filtered out {zero_count} zero‑vector terms; clustering on {mask.sum()} terms")


# Keep the originals, their normalized forms and the feature rows aligned
# by applying the same row mask to all three.
terms_cluster = [term for term, keep in zip(terms_filt, mask) if keep]
normed_cluster = [norm for norm, keep in zip(normed_filt, mask) if keep]
# Densified because the clustering step below is fed a plain array
X_cluster = X[mask].toarray()

Filtered out 14 zero‑vector terms; clustering on 7237 terms


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Average-linkage agglomerative clustering on cosine distance.
# The number of clusters is not fixed in advance (n_clusters=None, as
# scikit-learn requires when distance_threshold is given); merging stops
# once the linkage distance between clusters reaches 0.8.
clusterer = AgglomerativeClustering(
    distance_threshold=0.8,
    n_clusters=None,
    linkage="average",
    metric="cosine",
)
clusterer.fit(X_cluster)
# One integer cluster label per row of X_cluster
labels = clusterer.labels_

print(f"Found {labels.max() + 1} clusters")

In [28]:
# Chunk 6: Build the canonical mapping, including zero‑vector terms

from collections import defaultdict

# Bucket each clustered (original, normalized) pair under its cluster label
clusters = defaultdict(list)
for lbl, pair in zip(labels, zip(terms_cluster, normed_cluster)):
    clusters[lbl].append(pair)

# Every member of a cluster maps to that cluster's representative: the member
# with the shortest normalized text (first one wins on ties).
canonical = {}
for members in clusters.values():
    shortest = min(members, key=lambda pair: len(pair[1]))
    rep = shortest[0]
    canonical.update((orig, rep) for orig, _ in members)

# Terms that were left out of clustering (zero-vector rows) map to themselves
for orig in terms_filt:
    canonical.setdefault(orig, orig)

# Example sanity checks (a result of None means the term is not a key of `canonical`)
print("  'Firebase Realtime Database' →", canonical.get("Firebase Realtime Database"))
print("  'Python scripts'              →", canonical.get("Python scripts"))

  'Firebase Realtime Database' → Firebase Realtime Database
  'Python scripts'              → None


In [None]:
import pandas as pd

# One row per surviving term: its original text, its normalized form and the
# canonical term chosen for it (falling back to the term itself).
records = [
    {
        "Original Term": orig,
        "Normalized": normalize(orig),
        "Canonical": canonical.get(orig, orig),
    }
    for orig in terms_filt
]

df = pd.DataFrame(records)

# Persist the mapping as a spreadsheet, without the DataFrame index column
out_path = "tech_mapping_threshold_0.8.xlsx"
df.to_excel(out_path, index=False)

print(f"Wrote mapping to {out_path}")

In [None]:
import json
import pandas as pd

excel_path = "tech_mapping_threshold_0.8.xlsx"
# Read every cell back as the literal string that was written.
# With pandas' default NA handling, cells such as "NA", "N/A", "null" or
# "nan" are parsed as float NaN: those terms would vanish from the mapping
# keys, and any term whose canonical form is one of them would be replaced
# by NaN in the output JSON.
df_map = pd.read_excel(excel_path, dtype=str, keep_default_na=False)

# Original Term → Canonical
mapping = dict(zip(df_map["Original Term"], df_map["Canonical"]))

with open("Final_mapped_keywords_mapped.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for entry in data:
    techs = entry.get("Technologies")
    if techs:
        # Terms absent from the mapping are kept unchanged. Several originals
        # can collapse onto the same canonical term, so drop the repeats while
        # preserving first-seen order (dict keys keep insertion order).
        entry["Technologies"] = list(
            dict.fromkeys(mapping.get(t, t) for t in techs)
        )

# Save out the cleaned JSON
out_path = "Clusted_technologies_keywords.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"Wrote clustered/cleaned file to {out_path}")

In [None]:
import json

# Re-load the rewritten records to measure how many distinct terms remain
with open("Clusted_technologies_keywords.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Distinct technologies across all records; a missing or null
# "Technologies" field is treated as an empty list.
tech_set = {
    tech
    for entry in data
    for tech in (entry.get("Technologies") or [])
}

print(f"Total unique technologies: {len(tech_set)}")