In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

# ----------------------------
# Load Processed Dataset
# ----------------------------
df = pd.read_csv("processed_data.csv")

# Prepare features
# Turn the comma-separated Skills text into a list of trimmed skill names.
# A missing cell becomes an empty list (str(NaN) would otherwise create a bogus
# "nan" skill) and blank fragments left by stray commas are dropped.
# NOTE(review): the Skills values printed in this notebook's output contain no
# commas; if the column is not comma-delimited, each row collapses to a single
# label here - confirm the delimiter used in processed_data.csv.
df['Skills_List'] = df['Skills'].apply(
    lambda raw: [] if pd.isna(raw)
    else [name for name in (part.strip() for part in str(raw).split(',')) if name]
)
# Missing Experience_Years values are replaced with 0.
df['Experience_Years'] = df['Experience_Years'].fillna(0)

# ----------------------------
# Feature Engineering
# ----------------------------
# TF-IDF on Title
# TfidfVectorizer rejects NaN documents, so missing titles are encoded as empty
# text (an all-zero TF-IDF row) instead of aborting the fit.
title_vectorizer = TfidfVectorizer()
title_tfidf = title_vectorizer.fit_transform(df['Title'].fillna('').astype(str))

# Skills with MultiLabelBinarizer (one 0/1 column per distinct skill label)
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['Skills_List'])

# One-hot encode Location. An explicit float dtype keeps this block numeric on
# every pandas version (pandas 2.x returns bool columns by default).
location_encoded = pd.get_dummies(df['Location'], prefix='Loc', dtype=float)

# Experience scaling
# NOTE(review): standardised values are unbounded and can be negative, unlike
# the non-negative TF-IDF / one-hot blocks, so this single column can carry a
# large share of the cosine similarity - consider a bounded scaler or a weight.
scaler = StandardScaler()
experience = df[['Experience_Years']].values
experience_scaled = scaler.fit_transform(experience)

# Combine features column-wise: title | skills | location | experience.
# CSR is requested once here so later similarity calls do not have to convert
# the stacked matrix again.
combined_features = hstack(
    [title_tfidf, skills_encoded, location_encoded.values, experience_scaled],
    format='csr',
)

# ----------------------------
# User Input
# ----------------------------
# Candidate profile to match against the internship listings
user_interest_title = 'data science intern'
user_skills = ['python', 'machine learning', 'data analysis']
user_experience = 1
user_location = 'hybrid'

# Title -> TF-IDF row using the vocabulary fitted on the listings
user_title_vec = title_vectorizer.transform([user_interest_title])

# Skills -> multi-hot row; labels the binarizer was not fitted on are dropped first
known_skills = set(mlb.classes_)
user_skills_filtered = list(filter(known_skills.__contains__, user_skills))
user_skills_vec = mlb.transform([user_skills_filtered])

# Location -> one-hot row laid out like the listings' location columns
# (columns the user's location does not produce are filled with 0)
user_location_vec = (
    pd.get_dummies([user_location], prefix='Loc')
    .reindex(columns=location_encoded.columns, fill_value=0)
)
user_location_sparse = csr_matrix(user_location_vec.to_numpy().astype(float))

# Experience -> scaled with the scaler fitted on the listings
user_exp_scaled = scaler.transform([[user_experience]])
user_exp_sparse = csr_matrix(user_exp_scaled)

# Stack the four parts in the same order as combined_features
user_combined = hstack(
    [user_title_vec, user_skills_vec, user_location_sparse, user_exp_sparse]
)

# ----------------------------
# Recommendation Logic
# ----------------------------
# One cosine score per row of combined_features, flattened to 1-D.
similarities = cosine_similarity(user_combined, combined_features).ravel()
# Sort ascending, reverse to best-first, then keep the first ten positions.
top_indices = similarities.argsort()[::-1][:10]
recommendations = df.iloc[top_indices]

# ----------------------------
# Show Results
# ----------------------------
print("\n📌 Top Internship Recommendations:\n")
for _, row in recommendations.iterrows():
    # A missing Title/Company cell is a float NaN with no .title() method;
    # show a placeholder instead of raising AttributeError mid-listing.
    title_text, company_text = (
        cell.title() if isinstance(cell, str) else "N/A"
        for cell in (row['Title'], row['Company'])
    )
    print(f"🔹 {title_text} at {company_text}")
    print(f"📍 Location: {row['Location']} | 🧑‍💼 Experience: {row['Experience_Years']} yrs")
    print(f"🛠️ Skills: {row['Skills']}")
    print(f"🔗 Link: {row['Link']}")
    print("-" * 60)




📌 Top Internship Recommendations:

🔹 Intern at Rohan Group
📍 Location: pune | 🧑‍💼 Experience: 0 yrs
🛠️ Skills: hr recruitment onboarding hrsd
🔗 Link: https://www.naukri.com/job-listings-intern-rohan-group-pune-0-to-0-years-210325017583
------------------------------------------------------------
🔹 Recruitment Intern at Roanuz
📍 Location: chennai | 🧑‍💼 Experience: 0 yrs
🛠️ Skills: business administration interpersonal skills intern social media campus recruitment hr recruitment administration
🔗 Link: https://www.naukri.com/job-listings-recruitment-intern-roanuz-chennai-0-to-3-years-200325509161
------------------------------------------------------------
🔹 Intern  Talent Acquisition at Sattva Consulting
📍 Location: gurugram | 🧑‍💼 Experience: 0 yrs
🛠️ Skills: stakeholder engagement data analysis excel intern talent acquisition staffing mis consulting
🔗 Link: https://www.naukri.com/job-listings-intern-talent-acquisition-sattva-consulting-gurugram-0-to-1-years-030425501947
---------------

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

# ----------------------------
# Load Internship Dataset
# ----------------------------
df = pd.read_csv("processed_data.csv")
# Turn the comma-separated Skills text into a list of trimmed skill names.
# A missing cell becomes an empty list (str(NaN) would otherwise create a bogus
# "nan" skill) and blank fragments left by stray commas are dropped.
# NOTE(review): the Skills values printed in this notebook's output contain no
# commas; if the column is not comma-delimited, each row collapses to a single
# label here - confirm the delimiter used in processed_data.csv.
df['Skills_List'] = df['Skills'].apply(
    lambda raw: [] if pd.isna(raw)
    else [name for name in (part.strip() for part in str(raw).split(',')) if name]
)
# Missing Experience_Years values are replaced with 0.
df['Experience_Years'] = df['Experience_Years'].fillna(0)

# ----------------------------
# Feature Engineering
# ----------------------------
# Title -> TF-IDF. TfidfVectorizer rejects NaN documents, so missing titles are
# encoded as empty text (an all-zero row) instead of aborting the fit.
title_vectorizer = TfidfVectorizer()
title_tfidf = title_vectorizer.fit_transform(df['Title'].fillna('').astype(str))

# Skills -> one 0/1 column per distinct skill label.
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['Skills_List'])

# Location -> one-hot. An explicit float dtype keeps this block numeric on every
# pandas version (pandas 2.x returns bool columns by default).
location_encoded = pd.get_dummies(df['Location'], prefix='Loc', dtype=float)

# Experience -> standardised.
# NOTE(review): standardised values are unbounded and can be negative, unlike
# the non-negative TF-IDF / one-hot blocks, so this single column can carry a
# large share of the cosine similarity - consider a bounded scaler or a weight.
scaler = StandardScaler()
experience = df[['Experience_Years']].values
experience_scaled = scaler.fit_transform(experience)

# Column order: title | skills | location | experience.
# CSR is requested once here because the matrix is compared against every test
# candidate; this avoids converting the stacked matrix again on each call.
combined_features = hstack(
    [title_tfidf, skills_encoded, location_encoded.values, experience_scaled],
    format='csr',
)

# ----------------------------
# Load Test Dataset
# ----------------------------
# Candidates used to score the recommender; the evaluation code reads the
# "Skills" and "Recommended_Career" columns of each row.
test_candidates_path = "test_candidates_dataset.csv"
test_df = pd.read_csv(test_candidates_path)

# ----------------------------
# Recommendation + Accuracy Logic
# ----------------------------
def get_recommendations_for_candidate(skills, location, experience, title, top_n=10):
    """Rank the internship listings for one candidate and return the best matches.

    The candidate is encoded with the transformers fitted on the listings
    (``title_vectorizer``, ``mlb``, the columns of ``location_encoded`` and
    ``scaler``), stacked in the same column order as ``combined_features``, and
    compared with every listing by cosine similarity.

    Args:
        skills: Comma-separated skill names. A missing or non-string value is
            treated as "no skills". Each name is matched against
            ``mlb.classes_`` exactly first and case-insensitively second;
            names that match nothing are dropped.
        location: Location label. A label that produces no known ``Loc_*``
            column yields an all-zero location block.
        experience: Years of experience, scaled with the fitted ``scaler``.
        title: Free-text title the candidate is interested in.
        top_n: Maximum number of listings to return. Values of 0 or less
            return an empty result.

    Returns:
        A 1-D integer array of row positions into the listings, ordered from
        most to least similar.
    """
    # Set lookup for exact matches, plus a lower-cased alias per label so that
    # e.g. "Python" still finds a "python" label.
    exact_labels = set(mlb.classes_)
    labels_by_lower = {}
    for label in mlb.classes_:
        labels_by_lower.setdefault(str(label).lower(), label)

    filtered_skills = []
    # A missing Skills cell arrives as a float NaN, which has no .split().
    if isinstance(skills, str):
        for fragment in skills.split(','):
            name = fragment.strip()
            if name in exact_labels:
                filtered_skills.append(name)
            elif name.lower() in labels_by_lower:
                filtered_skills.append(labels_by_lower[name.lower()])
    skills_vec = mlb.transform([filtered_skills])

    title_vec = title_vectorizer.transform([title])

    # Align the one-hot location with the listings' columns; unseen -> zeros.
    location_vec = pd.get_dummies([location], prefix='Loc').reindex(
        columns=location_encoded.columns, fill_value=0
    )
    location_sparse = csr_matrix(location_vec.values.astype(float))

    exp_scaled = scaler.transform([[experience]])
    exp_sparse = csr_matrix(exp_scaled)

    user_vec = hstack([title_vec, skills_vec, location_sparse, exp_sparse])
    sim = cosine_similarity(user_vec, combined_features).flatten()

    # Reverse first, then truncate: slicing with [-top_n:] would return every
    # row when top_n is 0 because [-0:] is the whole array.
    top_indices = sim.argsort()[::-1][:max(top_n, 0)]
    return top_indices

# ----------------------------
# Evaluate Accuracy
# ----------------------------
# Every candidate is scored with the same assumed location and experience; only
# "Skills" and "Recommended_Career" are read from the test rows. Dedicated names
# are used so the `experience` array built during feature engineering is not
# overwritten by a scalar.
assumed_location = "remote"
assumed_experience = 1

correct = 0
evaluated = 0  # rows with a usable label; the rest are left out of the ratio

for _, row in test_df.iterrows():
    career = row["Recommended_Career"]
    # Skip rows without a usable label: NaN has no .lower(), and a blank label
    # is a substring of every title, which would count as a guaranteed hit.
    if not isinstance(career, str) or not career.strip():
        continue
    interest = career.strip().lower()
    evaluated += 1

    # A missing Skills cell (NaN) is passed on as "no skills".
    skills = row["Skills"] if isinstance(row["Skills"], str) else ""

    recommended_indices = get_recommendations_for_candidate(
        skills, assumed_location, assumed_experience, interest
    )
    # Blank out missing titles so the substring test never sees a float NaN.
    recommended_titles = (
        df.iloc[recommended_indices]["Title"].fillna("").astype(str).str.lower().values
    )

    # A hit means the label appears inside at least one recommended title.
    if any(interest in title for title in recommended_titles):
        correct += 1

# Guard the ratio: an empty (or fully unlabeled) test set would divide by zero.
accuracy = correct / evaluated if evaluated else 0.0
print(f"\n✅ Recommendation Accuracy on Test Dataset: {accuracy:.2%}")



✅ Recommendation Accuracy on Test Dataset: 12.00%
