In [1]:
!pip install sentence-transformers pandas
!pip install huggingface_hub[hf_xet]



In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Toy catalogue of three job postings, stored column-wise (one list per
# column) so it can be handed straight to pd.DataFrame.
data = dict(
    job_id=[1, 2, 3],
    job_title=['Data Scientist', 'Full Stack Developer', 'ML Engineer'],
    job_description=[
        "We are looking for a data scientist with experience in Python, machine learning, and statistics.",
        "Seeking a full-stack developer with expertise in JavaScript, React, and backend development.",
        "Hiring ML engineer skilled in deep learning, TensorFlow, and cloud deployment.",
    ],
)

In [3]:
# Turn the column-wise postings dict into a DataFrame (one row per job).
# The bare name on the last line renders the table as the cell's output.
jobs_df = pd.DataFrame(data=data)
jobs_df

Unnamed: 0,job_id,job_title,job_description
0,1,Data Scientist,We are looking for a data scientist with exper...
1,2,Full Stack Developer,Seeking a full-stack developer with expertise ...
2,3,ML Engineer,"Hiring ML engineer skilled in deep learning, T..."


In [4]:
# Sample resume used as the query text. The leading and trailing newlines
# (and the trailing space on the first sentence) are kept so the string is
# identical to the original triple-quoted literal.
resume_text = (
    "\n"
    "Experienced ML engineer with a strong background in Python, deep learning, TensorFlow, and cloud deployment. \n"
    "Worked on end-to-end machine learning projects and production-ready solutions.\n"
)

In [5]:
# Name of the sentence-embedding checkpoint, pulled out so it is easy to swap.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the encoder once; the cells below reuse it for both the resume and the
# job descriptions.
model = SentenceTransformer(MODEL_NAME)

In [6]:
# Embed every job description in one batched call, then the resume, using the
# same model for both. convert_to_tensor=True asks for tensor output, which
# the similarity step below consumes.
job_texts = jobs_df['job_description'].tolist()
job_embeddings = model.encode(job_texts, convert_to_tensor=True)

resume_embedding = model.encode(resume_text, convert_to_tensor=True)

In [7]:
# Cosine similarity between the resume and each job embedding. Only one
# resume was encoded, so row 0 of the result holds all of its scores.
score_matrix = util.pytorch_cos_sim(resume_embedding, job_embeddings)
similarities = score_matrix[0]

# Move the scores to host memory and store them next to their postings.
jobs_df['similarity_score'] = similarities.cpu().numpy()

In [8]:
# Rank postings from best to worst match for the resume.
recommended_jobs = jobs_df.sort_values('similarity_score', ascending=False)

# Display the three best matches, title and score only.
recommended_jobs.head(3)[['job_title', 'similarity_score']]

Unnamed: 0,job_title,similarity_score
2,ML Engineer,0.839547
0,Data Scientist,0.583997
1,Full Stack Developer,0.442784


In [None]:
# Industry classifier on the job-postings CSV: load the data, train a random
# forest to predict 'Industry' from 'Key Skills', then plot a confusion matrix
# and a one-vs-rest ROC curve for a single class.
df = pd.read_csv('marketing_sample_for_naukri_com-jobs__20190701_20190830__30k_data.csv')

# Rows missing either the feature column or the target column are unusable.
df = df.dropna(subset=['Key Skills', 'Industry'])

# Placeholder features: every distinct 'Key Skills' string becomes its own
# indicator column, so two postings share a feature only when their strings
# are identical. Replace with real feature engineering before trusting results.
X = pd.get_dummies(df['Key Skills'])
y = df['Industry']

# Indicator-matrix encoding of the target. lb.classes_ fixes the class order
# used for the confusion-matrix axes below.
lb = LabelBinarizer()
y_bin = lb.fit_transform(y)

# Train/test split (y_train / y_test stay in indicator form).
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.2, random_state=42)

# Recover exactly one label per row. Fitting the forest on the indicator matrix
# would make it a multi-output problem: predict_proba would then return a list
# of per-class arrays (so `y_proba[:, 0]` fails) and a row predicted as all
# zeros would silently collapse to class 0 under argmax. inverse_transform also
# copes with a two-class target, where the indicator matrix has one column.
y_train_labels = lb.inverse_transform(y_train)
y_test_labels = lb.inverse_transform(y_test)

# Seeded so that re-running the notebook reproduces the same model.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train_labels)

# y_pred holds class labels; y_proba has one column per entry of clf.classes_.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# Confusion Matrix. Passing labels= keeps the matrix square over *all* classes,
# so it always matches display_labels even if a class is absent from the test
# split or never predicted.
cm = confusion_matrix(y_test_labels, y_pred, labels=lb.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lb.classes_)
disp.plot(xticks_rotation='vertical')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve (one-vs-rest for the first class the classifier knows about).
# Column 0 of y_proba corresponds to clf.classes_[0], so the ground truth is
# built against that same class to keep scores and labels aligned.
roc_class = clf.classes_[0]
roc_truth = (y_test_labels == roc_class).astype(int)
fpr, tpr, _ = roc_curve(roc_truth, y_proba[:, 0])
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (First Class)')
plt.legend(loc='lower right')
plt.show()