In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import pickle
import os

In [None]:
# --------------------------
# 3. Load Dataset
# --------------------------
# Directory that holds the three CSV files. Set the CAREER_DATA_DIR environment
# variable to read from a different location; when it is unset or empty the
# original Windows path below is used unchanged.
data_path = os.environ.get("CAREER_DATA_DIR") or (
    r"C:\Users\MASTER COMPUTERS\Desktop\websitetask\facebook\Documents\Downloads\archive"
)

# os.path.join inserts the separator of the host OS instead of a hard-coded
# backslash, so the same cell also works outside Windows.
postings = pd.read_csv(os.path.join(data_path, "linkedin_job_postings.csv"))
summaries = pd.read_csv(os.path.join(data_path, "job_summary.csv"))
skills_df = pd.read_csv(os.path.join(data_path, "job_skills.csv"))

# Merge datasets: inner joins on job_link keep only links present in all three files.
df = (
    postings
    .merge(summaries, on="job_link", how="inner")
    .merge(skills_df, on="job_link", how="inner")
)
# Text used downstream: summary and skills joined by a space, missing values as ''.
df['text'] = df['job_summary'].fillna('') + " " + df['job_skills'].fillna('')
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
# --------------------------
# 4. Data Cleaning
# --------------------------
# Remove rows whose text repeats another row's, then rows whose text is blank
# once surrounding whitespace is stripped.
df = df.drop_duplicates(subset=['text'])
df = df.loc[df['text'].str.strip().ne('')]
print(f"Cleaned dataset shape: {df.shape}")

# Visualize top 20 job titles (counts are computed first, then drawn)
top_titles = df['job_title'].value_counts().head(20)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_titles.values, y=top_titles.index)
plt.title("Top 20 Job Titles")
plt.xlabel("Count")
plt.ylabel("Job Title")
plt.show()

In [None]:
# --------------------------
# 5. TF-IDF Vectorization
# --------------------------
# max_features=5000 caps the vocabulary size; English stop words are removed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit on the cleaned text and densify the result in one go.
X = vectorizer.fit_transform(df['text']).toarray()
print(f"TF-IDF feature matrix shape: {X.shape}")

# float32 tensor copy of the dense matrix, used as the network input
X_tensor = torch.tensor(data=X, dtype=torch.float32)


In [None]:
# --------------------------
# 6. Define Deep NLP Model
# --------------------------
class CareerNN(nn.Module):
    """Two-layer feed-forward network that maps a feature vector to an embedding.

    Layout: Linear(input_dim -> 512) -> ReLU -> Dropout(0.2) -> Linear(512 -> embedding_dim).

    Args:
        input_dim: Number of input features expected by the first linear layer.
        embedding_dim: Number of output features of the last linear layer.
    """

    def __init__(self, input_dim, embedding_dim=128):
        super().__init__()
        hidden_units = 512
        # Attribute names double as state_dict() key prefixes, so they are part
        # of the saved-checkpoint format.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=hidden_units, out_features=embedding_dim)

    def forward(self, x):
        """Apply fc1, ReLU, dropout and fc2 in that order and return the result."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The input width follows the number of columns of X.
model = CareerNN(input_dim=X.shape[1], embedding_dim=128)
model = model.to(device)
print(model)

In [None]:
# --------------------------
# 7. Self-Supervised Training
# --------------------------
# Full-batch training: each epoch is a single forward/backward pass over all of
# X_tensor. With E = model(X), the loss is the mean squared error between
# (E @ E.T) @ X and X.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 5

X_tensor = X_tensor.to(device)

for epoch in range(epochs):
    model.train()  # keep dropout active during optimisation
    optimizer.zero_grad()
    embeddings = model(X_tensor)
    # Mathematically the same product as (embeddings @ embeddings.T) @ X_tensor
    # (equal up to floating-point rounding), but grouped so that the
    # n_rows x n_rows matrix is never built: the intermediate here has only
    # embedding_dim x n_features entries.
    reconstructed = embeddings @ (embeddings.T @ X_tensor)
    loss = criterion(reconstructed, X_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

In [None]:
# --------------------------
# 8. Generate Career Embeddings
# --------------------------
# Inference only: eval() puts the model in evaluation mode and no_grad() skips
# gradient tracking while the whole feature matrix is embedded.
model.eval()
with torch.no_grad():
    career_embeddings = model(X_tensor).cpu().numpy()

# Save embeddings next to the input data; os.path.join supplies the separator
# of the host OS.
with open(os.path.join(data_path, "career_embeddings.pkl"), "wb") as f:
    pickle.dump(career_embeddings, f)
print("Career embeddings saved!")

In [None]:
# --------------------------
# 9. Recommendation Function
# --------------------------
def recommend_jobs(user_input, model, vectorizer, career_embeddings, df, top_k=5):
    """Return the rows of ``df`` whose embeddings are most similar to a text query.

    The query is vectorised with ``vectorizer``, embedded with ``model`` and
    compared with every row of ``career_embeddings`` by cosine similarity.

    Args:
        user_input: Query text.
        model: Module that maps the vectorised query to an embedding. It is
            switched to evaluation mode by this call and left that way.
        vectorizer: Fitted object whose ``transform`` output supports
            ``toarray()``.
        career_embeddings: 2-D array of embeddings; row ``i`` is matched to
            ``df.iloc[i]``.
        df: DataFrame positionally aligned with ``career_embeddings``.
        top_k: Maximum number of rows to return; ``0`` yields an empty frame.

    Returns:
        A copy of the selected rows, best match first, with an added
        ``score`` column holding the cosine similarity.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Place the query on the device that holds the model's weights instead of
    # relying on a notebook-level `device` variable. A model without
    # parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    user_vec = vectorizer.transform([user_input]).toarray()
    user_tensor = torch.tensor(user_vec, dtype=torch.float32).to(model_device)

    model.eval()
    with torch.no_grad():
        user_emb = model(user_tensor).cpu().numpy()

    sims = cosine_similarity(user_emb, career_embeddings)[0]
    # Reverse first, then truncate. Slicing with [-top_k:] before reversing
    # returns the whole array when top_k == 0, because [-0:] is [0:].
    top_idx = sims.argsort()[::-1][:top_k]

    results = df.iloc[top_idx].copy()
    results['score'] = sims[top_idx]
    return results


In [None]:
# --------------------------
# 10. Test Recommendations
# --------------------------
user_query = "I enjoy analyzing data and building AI models"
recommendations = recommend_jobs(
    user_input=user_query,
    model=model,
    vectorizer=vectorizer,
    career_embeddings=career_embeddings,
    df=df,
    top_k=5,
)
# Display only the title, company and similarity score of each match.
recommendations[['job_title', 'company', 'score']]

In [None]:
# --------------------------
# 11. Visualize Top Recommendations
# --------------------------
# Horizontal bars: similarity score on x, job title on y.
plt.figure(figsize=(8, 6))
sns.barplot(data=recommendations, x='score', y='job_title', palette='viridis')
plt.title("Top Job Recommendations")
plt.xlabel("Similarity Score")
plt.ylabel("Job Title")
plt.show()

In [None]:
# --------------------------
# 12. Save Model & Vectorizer
# --------------------------
# Only the weights (state_dict) are stored, so loading requires the CareerNN
# class definition. os.path.join supplies the separator of the host OS instead
# of a hard-coded backslash.
torch.save(model.state_dict(), os.path.join(data_path, "career_model.pth"))
with open(os.path.join(data_path, "tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
print("Model and vectorizer saved!")
