In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
from sentence_transformers import SentenceTransformer

# Set custom cache directory to avoid permission issues
import os
os.environ['TRANSFORMERS_CACHE'] = './cache'
os.environ['SENTENCE_TRANSFORMERS_HOME'] = './cache'

# ----------------------------
# Load Internship Dataset
# ----------------------------
df = pd.read_csv("processed_data.csv")
df['Skills_List'] = df['Skills'].apply(lambda x: [i.strip() for i in str(x).split(',')])
df['Experience_Years'] = df['Experience_Years'].fillna(0)

# ----------------------------
# Skill + Location + Experience Features
# ----------------------------
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['Skills_List'])

location_encoded = pd.get_dummies(df['Location'], prefix='Loc')

scaler = StandardScaler()
experience = df[['Experience_Years']].values
experience_scaled = scaler.fit_transform(experience)

non_title_features = hstack([
    csr_matrix(skills_encoded),
    csr_matrix(location_encoded.values),
    csr_matrix(experience_scaled)
])

# ----------------------------
# SentenceTransformer for Title Embeddings
# ----------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all titles
title_texts = df['Title'].tolist()
title_embeddings = model.encode(title_texts)

# ----------------------------
# Load Test Dataset
# ----------------------------
test_df = pd.read_csv("test_candidates_dataset.csv")

# ----------------------------
# Recommendation + Evaluation
# ----------------------------
def get_top_k_indices(title_query, skills, experience, location, k=5):
    # Encode query title
    query_embedding = model.encode([title_query])

    # Compute cosine similarity with all title embeddings
    title_sim_scores = cosine_similarity(query_embedding, title_embeddings)[0]

    # Process skills
    filtered_skills = [skill.strip() for skill in skills.split(',') if skill.strip() in mlb.classes_]
    skills_vec = mlb.transform([filtered_skills])

    # Location
    location_vec = pd.get_dummies([location], prefix='Loc').reindex(columns=location_encoded.columns, fill_value=0)
    location_sparse = csr_matrix(location_vec.values.astype(float))

    # Experience
    exp_scaled = scaler.transform([[experience]])
    exp_sparse = csr_matrix(exp_scaled)

    user_other_features = hstack([skills_vec, location_sparse, exp_sparse])
    job_sim = cosine_similarity(user_other_features, non_title_features).flatten()

    # Combine scores
    final_scores = 0.6 * title_sim_scores + 0.4 * job_sim
    top_k = final_scores.argsort()[-k:][::-1]
    return top_k

# ----------------------------
# Precision@5 Evaluation
# ----------------------------
relevant_count = 0

for _, row in test_df.iterrows():
    career = row["Recommended_Career"].lower()
    top_indices = get_top_k_indices(
        title_query=career,
        skills=row["Skills"],
        experience=1,
        location="remote"
    )
    recommended_titles = df.iloc[top_indices]["Title"].str.lower().tolist()
    match = any(career in title for title in recommended_titles)

    if match:
        relevant_count += 1

precision_at_5 = relevant_count / len(test_df)
print(f"\n🎯 Precision@5: {precision_at_5:.2%}")



  from .autonotebook import tqdm as notebook_tqdm




🎯 Precision@5: 33.00%


In [11]:
def get_top_k_indices(title_query, skills, experience, location, k=5):
    # Encode query title using the same model as for titles
    query_embedding = model.encode([title_query])

    # Compute cosine similarity with all title embeddings
    title_sim_scores = cosine_similarity(query_embedding, title_embeddings)[0]

    # Process skills
    filtered_skills = [skill.strip() for skill in skills.split(',') if skill.strip() in mlb.classes_]
    skills_vec = mlb.transform([filtered_skills])

    # Location
    location_vec = pd.get_dummies([location], prefix='Loc').reindex(columns=location_encoded.columns, fill_value=0)
    location_sparse = csr_matrix(location_vec.values.astype(float))

    # Experience
    exp_scaled = scaler.transform([[experience]])
    exp_sparse = csr_matrix(exp_scaled)

    user_other_features = hstack([skills_vec, location_sparse, exp_sparse])
    job_sim = cosine_similarity(user_other_features, non_title_features).flatten()

    # Combine scores
    final_scores = 0.7 * title_sim_scores + 0.3 * job_sim
    top_k = final_scores.argsort()[-k:][::-1]

    # Output scores for top recommendations
    recommended_titles = df.iloc[top_k]["Title"].str.lower().tolist()
    recommended_scores = final_scores[top_k]
    return top_k, recommended_titles, recommended_scores

# Ensure model is correctly defined
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all titles
title_texts = df['Title'].tolist()
title_embeddings = model.encode(title_texts)


In [12]:
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import ndcg_score

def calculate_map(df, test_df):
    ap_scores = []
    for _, row in test_df.iterrows():
        career = row["Recommended_Career"].lower()
        top_indices, _, _ = get_top_k_indices(
            title_query=career,
            skills=row["Skills"],
            experience=1,
            location="remote"
        )
        
        # Ensure top_indices is a list or array of indices
        recommended_titles = df.loc[top_indices, "Title"].str.lower().tolist()
        
        # Calculate relevance
        relevance = [1 if career in title else 0 for title in recommended_titles]
        
        # Corrected average_precision_score usage
        # You need to pass the true labels and the predicted probabilities or scores
        # Here, we assume relevance is the true label and we need to calculate the predicted scores
        # For simplicity, let's use the relevance as scores (not ideal but for demonstration)
        ap = average_precision_score(relevance, relevance)
        
        ap_scores.append(ap)
    
    # Calculate MAP
    map_score = sum(ap_scores) / len(ap_scores)
    return map_score

def calculate_ndcg(df, test_df):
    ndcg_scores = []
    for _, row in test_df.iterrows():
        career = row["Recommended_Career"].lower()
        top_indices, _, _ = get_top_k_indices(
            title_query=career,
            skills=row["Skills"],
            experience=1,
            location="remote"
        )
        
        recommended_titles = df.loc[top_indices, "Title"].str.lower().tolist()
        relevance = [2 if career in title else 1 for title in recommended_titles]  # Assuming graded relevance
        
        # Ideal DCG
        ideal_dcg = dcg_at_k(sorted(relevance, reverse=True), k=len(relevance))
        
        # Actual DCG
        actual_dcg = dcg_at_k(relevance, k=len(relevance))
        
        # Calculate NDCG
        if ideal_dcg == 0:
            ndcg = 0
        else:
            ndcg = actual_dcg / ideal_dcg
        
        ndcg_scores.append(ndcg)
    
    # Calculate NDCG
    ndcg_score = sum(ndcg_scores) / len(ndcg_scores)
    return ndcg_score

def dcg_at_k(relevance, k):
    dcg = 0.
    for i in range(min(k, len(relevance))):
        dcg += (2**relevance[i] - 1) / np.log2(i+2)
    return dcg

# Calculate MAP and NDCG
map_score = calculate_map(df, test_df)
ndcg_score = calculate_ndcg(df, test_df)

print(f"MAP: {map_score:.2f}, NDCG: {ndcg_score:.2f}")




MAP: 0.37, NDCG: 0.99


In [14]:
def get_reputation(company_name):
    # Example implementation: This could be a lookup in a database or API call
    # For simplicity, let's assume we have a dictionary mapping company names to reputations
    company_reputations = {
        "Google": 5,
        "Microsoft": 4.5,
        "Amazon": 4.8,
        # Add more companies here...
    }
    
    return company_reputations.get(company_name, 0)  # Default to 0 if company not found

# Now you can use this function
df['Company_Reputation'] = df['Company'].apply(lambda x: get_reputation(x))


In [16]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
from sentence_transformers import SentenceTransformer
import numpy as np
import requests

# Load the processed dataset
df = pd.read_csv("processed_data.csv")
df['Skills_List'] = df['Skills'].fillna("").apply(lambda x: [i.strip() for i in str(x).split(',')])
df['Experience_Years'] = df['Experience_Years'].fillna(0)

# Convert Experience from string to numeric using a regex-based function
def convert_experience(exp_str):
    if pd.isna(exp_str):
        return 0
    exp_str = exp_str.lower()
    if "+" in exp_str:
        return float(re.findall(r"\d+", exp_str)[0]) + 1
    numbers = list(map(int, re.findall(r"\d+", exp_str)))
    if len(numbers) == 2:
        return sum(numbers) / 2
    elif len(numbers) == 1:
        return numbers[0]
    return 0

df['Experience'] = df['Experience'].apply(convert_experience)

# Vectorizers
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all titles
title_texts = df['Title'].tolist()
title_embeddings = model.encode(title_texts)

# Skill encoding
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['Skills_List'])

# Location encoding
location_encoded = pd.get_dummies(df['Location'], prefix='Loc')

# Experience scaling
scaler = StandardScaler()
experience = df[['Experience_Years']].values
experience_scaled = scaler.fit_transform(experience)

# Combine all features
non_title_features = hstack([
    csr_matrix(skills_encoded),
    csr_matrix(location_encoded.values),
    csr_matrix(experience_scaled)
])

# Mock company reputation function
def get_reputation(company_name):
    mock_reputations = {
        "Google": 5,
        "Microsoft": 4.8,
        "Amazon": 4.7,
    }
    return mock_reputations.get(company_name, 0)

# Add company reputation as a feature
df['Company_Reputation'] = df['Company'].apply(lambda x: get_reputation(x))

# Scale company reputation
reputation_scaled = scaler.fit_transform(df[['Company_Reputation']])

# Update non_title_features to include company reputation
non_title_features = hstack([
    csr_matrix(skills_encoded),
    csr_matrix(location_encoded.values),
    csr_matrix(experience_scaled),
    csr_matrix(reputation_scaled)
])

# Load test dataset
test_df = pd.read_csv("test_candidates_dataset.csv")

# Function to get top k indices
def get_top_k_indices(title_query, skills, experience, location, k=5):
    # Encode query title
    query_embedding = model.encode([title_query])
    
    # Compute cosine similarity with all title embeddings
    title_sim_scores = cosine_similarity(query_embedding, title_embeddings)[0]
    
    # Process skills
    filtered_skills = [skill.strip() for skill in skills.split(',') if skill.strip() in mlb.classes_]
    skills_vec = mlb.transform([filtered_skills])
    
    # Location
    location_vec = pd.get_dummies([location], prefix='Loc').reindex(columns=location_encoded.columns, fill_value=0)
    location_sparse = csr_matrix(location_vec.values.astype(float))
    
    # Experience
    exp_scaled = scaler.transform([[experience]])
    exp_sparse = csr_matrix(exp_scaled)
    
    # Company reputation
    reputation = get_reputation(location)
    reputation_scaled = scaler.transform([[reputation]])
    reputation_sparse = csr_matrix(reputation_scaled)
    
    user_other_features = hstack([skills_vec, location_sparse, exp_sparse, reputation_sparse])
    job_sim = cosine_similarity(user_other_features, non_title_features).flatten()
    
    # Combine scores
    final_scores = 0.7 * title_sim_scores + 0.3 * job_sim
    top_k = final_scores.argsort()[-k:][::-1]
    
    return top_k

# Calculate MAP and NDCG
def calculate_map(df, test_df):
    ap_scores = []
    for _, row in test_df.iterrows():
        career = row["Recommended_Career"].lower()
        top_indices = get_top_k_indices(
            title_query=career,
            skills=row["Skills"],
            experience=1,
            location="remote"
        )
        
        recommended_titles = df.loc[top_indices, "Title"].str.lower().tolist()
        relevance = [1 if career in title else 0 for title in recommended_titles]
        
        # Corrected average_precision_score usage
        ap = sum([relevance[i] for i in range(len(relevance)) if relevance[i] == 1]) / len(relevance) if any(relevance) else 0
        
        ap_scores.append(ap)
    
    # Calculate MAP
    map_score = sum(ap_scores) / len(ap_scores)
    return map_score

def calculate_ndcg(df, test_df):
    ndcg_scores = []
    for _, row in test_df.iterrows():
        career = row["Recommended_Career"].lower()
        top_indices = get_top_k_indices(
            title_query=career,
            skills=row["Skills"],
            experience=1,
            location="remote"
        )
        
        recommended_titles = df.loc[top_indices, "Title"].str.lower().tolist()
        relevance = [2 if career in title else 1 for title in recommended_titles]  # Assuming graded relevance
        
        # Ideal DCG
        ideal_dcg = dcg_at_k(sorted(relevance, reverse=True), k=len(relevance))
        
        # Actual DCG
        actual_dcg = dcg_at_k(relevance, k=len(relevance))
        
        # Calculate NDCG
        if ideal_dcg == 0:
            ndcg = 0
        else:
            ndcg = actual_dcg / ideal_dcg
        
        ndcg_scores.append(ndcg)
    
    # Calculate NDCG
    ndcg_score = sum(ndcg_scores) / len(ndcg_scores)
    return ndcg_score

def dcg_at_k(relevance, k):
    dcg = 0.
    for i in range(min(k, len(relevance))):
        dcg += (2**relevance[i] - 1) / np.log2(i+2)
    return dcg

# Calculate MAP and NDCG
map_score = calculate_map(df, test_df)
ndcg_score = calculate_ndcg(df, test_df)

print(f"MAP: {map_score:.2f}, NDCG: {ndcg_score:.2f}")


NameError: name 're' is not defined