# In [None]:
import re
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the spaCy pipeline used for tokenization, lemmatization and NER.
nlp = spacy.load("en_core_web_sm")

# NLTK distributes its corpora separately from the package, so on a fresh
# environment stopwords.words() raises LookupError. Fetch the corpus once
# and retry instead of failing at startup.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

# Load SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# === 1. Preprocessing Function ===
def preprocess_text(text):
    """Clean and normalize a free-text bio.

    The text is lowercased, every run of non-letter characters (punctuation,
    digits, underscores, whitespace) is collapsed into a single space, and
    the result is tokenized with spaCy. Tokens whose text is in
    ``stop_words`` are dropped and the remaining tokens are lemmatized.

    Args:
        text: Raw bio string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Replace non-letter runs with a space instead of deleting them, so that
    # "AI/ML" or "back-end" remain two words rather than fusing into one
    # token, and contraction fragments become separate tokens that the
    # stopword filter can drop if they are listed.
    # For str patterns \W and \d are Unicode-aware, so accented letters are
    # kept rather than being stripped out of the middle of a word.
    text = re.sub(r"[\W\d_]+", " ", text.lower()).strip()

    # NLP processing with spaCy
    doc = nlp(text)

    # Stopword removal and lemmatization
    processed_tokens = [
        token.lemma_
        for token in doc
        if token.text not in stop_words and not token.is_punct
    ]

    return " ".join(processed_tokens)

# === 2. Named Entity Extraction ===
def extract_entities(text):
    """Collect named entities from ``text`` and group them into two buckets.

    Entities labelled ``ORG`` are reported as organizations; entities
    labelled ``NORP``, ``PRODUCT`` or ``WORK_OF_ART`` are reported as skills.
    Entities with any other label are ignored.

    Args:
        text: Raw text to run through the spaCy pipeline.

    Returns:
        A dict with keys ``"skills"`` and ``"organizations"``, each mapping
        to a list of entity strings in the order they occur in the text.
    """
    # Entity label -> name of the result bucket that receives it.
    bucket_for_label = {
        "ORG": "organizations",
        "NORP": "skills",
        "PRODUCT": "skills",
        "WORK_OF_ART": "skills",
    }
    grouped = {"skills": [], "organizations": []}

    for entity in nlp(text).ents:
        bucket = bucket_for_label.get(entity.label_)
        if bucket is not None:
            grouped[bucket].append(entity.text)

    return grouped

# === 3. Entity-Enhanced Bio Processing ===
def preprocess_text_with_entities(text):
    """Return the cleaned bio followed by bracketed lists of its entities.

    The bio is cleaned with ``preprocess_text`` and entities are extracted
    from the raw ``text`` with ``extract_entities``. A ``[skills: ...]``
    and/or ``[organizations: ...]`` suffix is appended only when the
    corresponding list is non-empty.

    Args:
        text: Raw bio string.

    Returns:
        The cleaned text, optionally followed by the entity suffixes.
    """
    enriched = preprocess_text(text)
    found = extract_entities(text)

    # (suffix opener, entity names) in the order they are appended.
    suffixes = (
        (" [skills: ", found["skills"]),
        (" [organizations: ", found["organizations"]),
    )
    for opener, names in suffixes:
        if names:
            enriched = enriched + opener + ", ".join(names) + "]"

    return enriched

# === 4. Sample Data ===
# Short display label -> free-text bio.
bios = {
    "ML Eng": "Dr. Jane Doe is a Machine Learning Engineer at Google, specializing in NLP & AI.",
    "Cyber Analyst": "John Smith is a cybersecurity analyst working on network security and penetration testing.",
    "Financial Analyst": "Alice is a financial analyst with expertise in investment banking and risk management.",
    "Software Eng": "Bob is a software engineer focusing on backend development and distributed systems.",
}

# Reference topics embedded alongside the bios.
topics = [
    "Machine Learning",
    "Artificial Intelligence",
    "Cybersecurity",
    "Finance",
    "Software Engineering",
]

# === 5. Apply Preprocessing & Extract Entities ===
# Dict iteration order is insertion order, so processed_bios lines up with
# the keys of `bios`.
processed_bios = list(map(preprocess_text_with_entities, bios.values()))

# === 6. Generate Embeddings ===
bio_embeddings = model.encode(processed_bios, convert_to_tensor=False)
topic_embeddings = model.encode(topics, convert_to_tensor=False)

# Stack bios first, topics second, so both are projected by the same fit.
all_embeddings = np.vstack((bio_embeddings, topic_embeddings))

# === 7. Dimensionality Reduction ===
# PCA is used here; TSNE(n_components=2, perplexity=5) is an alternative
# reducer that may give better clustering.
dim_reduction = PCA(n_components=2)
reduced_embeddings = dim_reduction.fit_transform(all_embeddings)

# Undo the stacking: the first len(bios) rows are bios, the rest are topics.
bio_reduced, topic_reduced = np.split(reduced_embeddings, [len(bios)])

# === 8. Visualization ===
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid")

# Bios: blue circles.
bio_x, bio_y = bio_reduced[:, 0], bio_reduced[:, 1]
plt.scatter(bio_x, bio_y, c="blue", label="Bios", alpha=0.7, edgecolors="k")

# Topics: larger red crosses.
topic_x, topic_y = topic_reduced[:, 0], topic_reduced[:, 1]
plt.scatter(topic_x, topic_y, c="red", label="Topics", marker="X", s=100, edgecolors="k")

# Label each bio point with its key from `bios` (rows follow key order).
for label, (x, y) in zip(bios, bio_reduced):
    plt.annotate(f"{label}", (x, y), fontsize=9)

# Label each topic point with its topic string.
for label, (x, y) in zip(topics, topic_reduced):
    plt.annotate(label, (x, y), fontsize=10, fontweight="bold")

plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("Visualization of Bio & Topic Embeddings (With Entity Enrichment)")
plt.legend()
plt.show()
