In [2]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, which one.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: Tesla T4


In [1]:
!git clone https://github.com/tanisha0804/Industry-Academia-alignment.git

Cloning into 'Industry-Academia-alignment'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 35 (delta 5), reused 22 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 2.23 MiB | 7.06 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [3]:
!pip install keybert sentence-transformers scikit-learn

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keybert
Successfully installed keybert-0.9.0


In [4]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import json
from pathlib import Path

In [6]:
# Load the cleaned Phase 1 text and merge the two university handbooks.

# Root of the repository cloned earlier in this notebook (cloned into the
# current working directory). Later cells build their output paths from
# repo_root too, so it must point at the clone itself.
# Previously this was Path(".") joined with an absolute "/content/..." string;
# pathlib discards the left operand when the right one is absolute, so
# repo_root was silently ignored for the input and wrong for the outputs.
repo_root = Path("Industry-Academia-alignment")
input_path = repo_root / "outputs" / "processed_data" / "phase1_cleaned_text.json"

# Explicit encoding so the read does not depend on the platform default.
with open(input_path, encoding="utf-8") as f:
    data = json.load(f)

industry_text = data["industry_text"]
# Both handbook editions are concatenated into a single university corpus.
university_text = data["handbook_2022_26"] + " " + data["handbook_2023_27"]

**KeyBERT** – a keyword/keyphrase extraction method.

It converts the text and its candidate phrases into embeddings, then selects the phrases whose embeddings are most semantically similar to the text as its keywords.

In [7]:
# One sentence-embedding model is shared by KeyBERT and the later clustering step.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
def extract_candidate_skills(text, top_n=50):
    """Return the keyphrases KeyBERT extracts from ``text``, without their scores.

    Uses the notebook-level ``kw_model`` with 1- to 3-word n-grams and English
    stop-word filtering.

    Args:
        text: Document to extract keyphrases from.
        top_n: Number of keyphrases requested from KeyBERT.

    Returns:
        List of keyphrase strings in the order KeyBERT returned them.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=top_n,
    )

    # Each result is a (phrase, score) pair; only the phrase is kept.
    phrases = []
    for phrase, _score in scored_phrases:
        phrases.append(phrase)
    return phrases

# Extract candidate phrases from both corpora (industry first, then university).
industry_candidates, university_candidates = map(
    extract_candidate_skills, (industry_text, university_text)
)

# Preview the first ten phrases of each list.
(industry_candidates[:10], university_candidates[:10])


(['trainee software job',
  'software job',
  'developer intern job',
  'automation intern job',
  'developer intern aveva',
  'ai job role',
  'ai consultant job',
  'software developer intern',
  'software job role',
  'software engineer intern'],
 ['computer science engineering',
  'engineering computer science',
  'engineering computer',
  'design computer science',
  'computer science engineer',
  'computer science related',
  'computing technologies design',
  'studies computer science',
  'engineering international computer',
  'higher studies computer'])

In [9]:
def normalize_skills(skill_list, eps=0.3):
    """Group near-duplicate skill phrases and pick one canonical phrase per group.

    Phrases are embedded with the notebook-level ``embedding_model`` and
    clustered with DBSCAN using cosine distance. Because ``min_samples=1``,
    a phrase without close neighbours forms its own single-item cluster
    instead of being labelled as noise.

    Args:
        skill_list: Iterable of skill phrases (strings).
        eps: Maximum cosine distance between two phrases for DBSCAN to treat
            them as neighbours.

    Returns:
        A ``(normalized, clusters)`` tuple. ``clusters`` maps each integer
        cluster label to the list of phrases in that cluster; ``normalized``
        maps the same label to the longest phrase of the cluster. Both are
        empty dicts when ``skill_list`` is empty.
    """
    skill_list = list(skill_list)
    if not skill_list:
        # DBSCAN cannot be fitted on zero samples, so short-circuit here.
        return {}, {}

    embeddings = embedding_model.encode(skill_list)
    labels = DBSCAN(eps=eps, min_samples=1, metric="cosine").fit(embeddings).labels_

    clusters = {}
    for skill, label in zip(skill_list, labels):
        # Cast the NumPy integer label to a plain int so the dict keys are
        # ordinary Python values (e.g. directly usable as JSON keys).
        clusters.setdefault(int(label), []).append(skill)

    # Pick longest phrase as canonical name (the first one wins on ties).
    normalized = {
        label: max(phrases, key=len)
        for label, phrases in clusters.items()
    }

    return normalized, clusters

# Collapse near-duplicate phrases separately for each corpus.
industry_normalized, industry_clusters = normalize_skills(
    skill_list=industry_candidates
)
university_normalized, university_clusters = normalize_skills(
    skill_list=university_candidates
)

In [12]:
# Directory that receives the Phase 2 JSON files; created if it is missing.
output_dir = repo_root / "outputs" / "processed_data"
output_dir.mkdir(parents=True, exist_ok=True)

def convert_keys_to_str(d):
    """Return a new dict with every key of ``d`` passed through ``str()``.

    Values are carried over unchanged. If two keys share the same string
    form, the one that comes later in ``d`` overwrites the earlier one.
    """
    stringified = {}
    for key, value in d.items():
        stringified[str(key)] = value
    return stringified

# Convert ALL dicts that have numeric keys, since JSON object keys are strings.
industry_normalized_json = convert_keys_to_str(industry_normalized)
university_normalized_json = convert_keys_to_str(university_normalized)
industry_clusters_json = convert_keys_to_str(industry_clusters)
university_clusters_json = convert_keys_to_str(university_clusters)

# Output file name -> payload, listed in the order the files are written.
phase2_outputs = {
    "industry_skills.json": industry_normalized_json,
    "university_skills.json": university_normalized_json,
    "industry_skill_clusters.json": industry_clusters_json,
    "university_skill_clusters.json": university_clusters_json,
}

for filename, payload in phase2_outputs.items():
    with open(output_dir / filename, "w") as f:
        json.dump(payload, f, indent=2)

print("Phase 2 outputs saved successfully.")


Phase 2 outputs saved successfully.
