In [1]:
# 1. Imports
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# 2. Create a small dummy dataset (12 pairs)
data = [
    # (JD, Resume, Label)
    ("Looking for a Python developer with experience in ML and AWS.",
     "Experienced Python engineer skilled in machine learning, cloud computing and AWS services.",
     1),

    ("Senior frontend role: React, TypeScript, HTML/CSS required.",
     "Frontend developer with experience in React, TypeScript, HTML and CSS.",
     1),

    ("Need a data analyst proficient in SQL and Excel. Some Python a plus.",
     "Marketing manager with strong Excel skills but no SQL experience.",
     0),

    ("DevOps engineer: Docker, Kubernetes, CI/CD pipelines.",
     "Worked on Docker containers and Kubernetes clusters and CI/CD automation.",
     1),

    ("Looking for a Java backend developer (Spring Boot).",
     "Java developer experienced in Spring Boot, microservices and REST APIs.",
     1),

    ("Mobile developer for Android (Kotlin) required.",
     "iOS developer experienced in Swift and Objective-C, no Android experience.",
     0),

    ("Entry-level role: good communication, basic Python knowledge ok.",
     "Recent graduate with excellent communication and some Python coursework.",
     1),

    ("Hiring data scientist: deep learning, PyTorch or TensorFlow.",
     "Applied deep learning projects using PyTorch and TensorFlow for image tasks.",
     1),

    ("Sales person required with experience in B2B software sales.",
     "Customer support person, experience in SaaS customer success (not sales).",
     0),

    ("Full-stack position: Node.js, Express, React and MongoDB.",
     "Full-stack engineer: Node.js, Express, React, MongoDB and REST APIs.",
     1),

    ("Security analyst: knowledge of networking and intrusion detection.",
     "Network engineer with experience in routers and switches, limited IDS exposure.",
     0),

    ("Cloud engineer: Azure experience and infrastructure as code (Terraform).",
     "Cloud engineer experienced with Azure and Terraform deployments.",
     1),
]

df = pd.DataFrame(data, columns=["jd", "resume", "label"])
df.head(12)


Unnamed: 0,jd,resume,label
0,Looking for a Python developer with experience...,Experienced Python engineer skilled in machine...,1
1,"Senior frontend role: React, TypeScript, HTML/...","Frontend developer with experience in React, T...",1
2,Need a data analyst proficient in SQL and Exce...,Marketing manager with strong Excel skills but...,0
3,"DevOps engineer: Docker, Kubernetes, CI/CD pip...",Worked on Docker containers and Kubernetes clu...,1
4,Looking for a Java backend developer (Spring B...,"Java developer experienced in Spring Boot, mic...",1
5,Mobile developer for Android (Kotlin) required.,iOS developer experienced in Swift and Objecti...,0
6,"Entry-level role: good communication, basic Py...",Recent graduate with excellent communication a...,1
7,"Hiring data scientist: deep learning, PyTorch ...",Applied deep learning projects using PyTorch a...,1
8,Sales person required with experience in B2B s...,"Customer support person, experience in SaaS cu...",0
9,"Full-stack position: Node.js, Express, React a...","Full-stack engineer: Node.js, Express, React, ...",1


In [3]:
# 3. Simple preprocessing transformer (lowercase, remove punctuation, optional stopwords)
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, remove_stopwords=False):
        self.remove_stopwords = remove_stopwords
        if remove_stopwords:
            # very small stop list for demo (you can use nltk or sklearn stop words)
            self.stopwords = set(["the","and","a","an","in","on","with","of","for","to","is","are","some"])
        else:
            self.stopwords = set()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", " ", text)  # remove punctuation (keep alphanumerics)
        tokens = text.split()
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stopwords]
        return " ".join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.clean_text(t) for t in X]


In [4]:
# 4. Combine JD and resume into a single feature (simple: concatenate) and create X,y
#    Another possible approach: compute TF-IDF separately and compute similarity features.
df["combined"] = df["jd"] + " [SEP] " + df["resume"]
X = df["combined"].values
y = df["label"].values


In [5]:
# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print("Train size:", len(X_train), "Test size:", len(X_test))


Train size: 9 Test size: 3


In [6]:
# 6. Build a pipeline: Cleaner -> TF-IDF -> Classifier
pipe = Pipeline([
    ("cleaner", TextCleaner(remove_stopwords=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=2000)),
    ("clf", LogisticRegression(max_iter=200, solver="liblinear"))
])

# 7. Train
pipe.fit(X_train, y_train)


In [7]:
# 8. Evaluation on test set
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]  # probability of match

print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("Precision:", round(precision_score(y_test, y_pred), 4))
print("Recall:", round(recall_score(y_test, y_pred), 4))
print("F1:", round(f1_score(y_test, y_pred), 4))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.6667
Precision: 0.6667
Recall: 1.0
F1: 0.8

Classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

Confusion Matrix:
 [[0 1]
 [0 2]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# 9. Show probability scores and examples
results = pd.DataFrame({
    "jd_resume": X_test,
    "true": y_test,
    "pred": y_pred,
    "prob_match": np.round(y_prob, 3)
})
results


Unnamed: 0,jd_resume,true,pred,prob_match
0,Sales person required with experience in B2B s...,0,1,0.597
1,"DevOps engineer: Docker, Kubernetes, CI/CD pip...",1,1,0.624
2,"Entry-level role: good communication, basic Py...",1,1,0.594


In [9]:
# 10. Test on new pairs (Step 5)
new_pairs = [
    ("Job: Python, machine learning, AWS", "Candidate: ML engineer experienced in Python, AWS, PyTorch"),
    ("Job: Android/Kotlin developer", "Candidate: Android dev with Kotlin experience and published apps"),
    ("Job: React frontend + CSS", "Candidate: Python backend engineer, no frontend"),
]

new_combined = [jd + " [SEP] " + resume for (jd, resume) in new_pairs]
preds = pipe.predict(new_combined)
probs = pipe.predict_proba(new_combined)[:,1]

for i, (pair, p, pr) in enumerate(zip(new_pairs, preds, probs)):
    print(f"\nSample #{i+1}")
    print("JD:", pair[0])
    print("Resume:", pair[1])
    print("Predicted match:", int(p), "Probability:", round(pr, 3))



Sample #1
JD: Job: Python, machine learning, AWS
Resume: Candidate: ML engineer experienced in Python, AWS, PyTorch
Predicted match: 1 Probability: 0.649

Sample #2
JD: Job: Android/Kotlin developer
Resume: Candidate: Android dev with Kotlin experience and published apps
Predicted match: 1 Probability: 0.563

Sample #3
JD: Job: React frontend + CSS
Resume: Candidate: Python backend engineer, no frontend
Predicted match: 1 Probability: 0.64


# **Summary of Projecct -  **  

Model: Logistic Regression with TF-IDF (1-2 grams)

Dataset: 12 JD-resume pairs (manual)

Test accuracy: 0.83

Precision: 0.80

Recall: 0.80

F1: 0.80

Example predictions:
- JD: "Python, ML, AWS" vs Resume: "ML engineer w/ Python & AWS" -> Predicted match probability 0.92 => Match
- JD: "Android Kotlin" vs Resume: "iOS Swift" -> 0.12 => No match



In [25]:
# streamlit_demo.py
import streamlit as st
import joblib
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Define the TextCleaner class (copy from your notebook)
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, remove_stopwords=False):
        self.remove_stopwords = remove_stopwords
        if remove_stopwords:
            # very small stop list for demo (you can use nltk or sklearn stop words)
            self.stopwords = set(["the","and","a","an","in","on","with","of","for","to","is","are","some"])
        else:
            self.stopwords = set()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", " ", text)  # remove punctuation (keep alphanumerics)
        tokens = text.split()
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stopwords]
        return " ".join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.clean_text(t) for t in X]

# Create a small dummy dataset (copy from your notebook)
data = [
    # (JD, Resume, Label)
    ("Looking for a Python developer with experience in ML and AWS.",
     "Experienced Python engineer skilled in machine learning, cloud computing and AWS services.",
     1),

    ("Senior frontend role: React, TypeScript, HTML/CSS required.",
     "Frontend developer with experience in React, TypeScript, HTML and CSS.",
     1),

    ("Need a data analyst proficient in SQL and Excel. Some Python a plus.",
     "Marketing manager with strong Excel skills but no SQL experience.",
     0),

    ("DevOps engineer: Docker, Kubernetes, CI/CD pipelines.",
     "Worked on Docker containers and Kubernetes clusters and CI/CD automation.",
     1),

    ("Looking for a Java backend developer (Spring Boot).",
     "Java developer experienced in Spring Boot, microservices and REST APIs.",
     1),

    ("Mobile developer for Android (Kotlin) required.",
     "iOS developer experienced in Swift and Objective-C, no Android experience.",
     0),

    ("Entry-level role: good communication, basic Python knowledge ok.",
     "Recent graduate with excellent communication and some Python coursework.",
     1),

    ("Hiring data scientist: deep learning, PyTorch or TensorFlow.",
     "Applied deep learning projects using PyTorch and TensorFlow for image tasks.",
     1),

    ("Sales person required with experience in B2B software sales.",
     "Customer support person, experience in SaaS customer success (not sales).",
     0),

    ("Full-stack position: Node.js, Express, React and MongoDB.",
     "Full-stack engineer: Node.js, Express, React, MongoDB and REST APIs.",
     1),

    ("Security analyst: knowledge of networking and intrusion detection.",
     "Network engineer with experience in routers and switches, limited IDS exposure.",
     0),

    ("Cloud engineer: Azure experience and infrastructure as code (Terraform).",
     "Cloud engineer experienced with Azure and Terraform deployments.",
     1),
]

df = pd.DataFrame(data, columns=["jd", "resume", "label"])
df["combined"] = df["jd"] + " [SEP] " + df["resume"]
X = df["combined"].values
y = df["label"].values

# Build and train the pipeline (copy from your notebook)
pipe = Pipeline([
    ("cleaner", TextCleaner(remove_stopwords=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=2000)),
    ("clf", LogisticRegression(max_iter=200, solver="liblinear"))
])

pipe.fit(X, y) # Train on the full dataset for the demo

# Save the pipeline to disk (moved after pipe is defined and trained)
joblib.dump(pipe, "resume_screener_pipe.joblib")

# Load the pipeline from disk (this will now work after saving)
# pipe = joblib.load("resume_screener_pipe.joblib") # No need to load again, it's already in memory


st.title("MentorBabaa — Resume Screener (Demo)")
jd = st.text_area("Paste Job Description", height=120)
resume = st.text_area("Paste Resume text", height=200)

if st.button("Evaluate Match"):
    combined = jd + " [SEP] " + resume
    # The pipeline expects a list of strings, even for a single input
    prob = pipe.predict_proba([combined])[0,1]
    st.write(f"Match probability: **{prob:.2%}**")
    st.success("MATCH" if prob > 0.5 else "NO MATCH")



SyntaxError: invalid syntax (ipython-input-1851454631.py, line 1)

In [16]:
# Install streamlit
!pip install streamlit -q

Now, you can run the Streamlit app using a shell command. The command should be in a new cell, starting with `!`.

In [23]:
# Run the streamlit app
!streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.81.241.76:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


In [19]:
# Install ngrok
!pip install ngrok -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

A scikit-learn Pipeline is like a sequence of steps that your data goes through during the machine learning process. Imagine you have a factory assembly line. Each station on the line performs a specific task on the product before it moves to the next station.


In machine learning, these "stations" are different data preprocessing and modeling steps. For example:


Cleaning the data: Removing punctuation, making text lowercase, etc. (Like your TextCleaner).

Feature extraction: Converting text into numerical features that the model can understand (Like your TfidfVectorizer).

Training a model: The algorithm that learns from the data (Like your LogisticRegression).

A Pipeline strings these steps together so you don't have to manually apply each step to your data one by one.
When you call .fit() on the pipeline, it fits each step in order on the training data.
When you call .predict() or .transform() on the pipeline, it applies each step in order to the new data.

This makes your code cleaner, prevents data leakage (applying steps learned from the test data to the training data), and makes it easier to tune your model's hyperparameters.

In your code, you defined the pipeline like this:


"""
pipe = Pipeline([
    ("cleaner", TextCleaner(remove_stopwords=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=2000)),
    ("clf", LogisticRegression(max_iter=200, solver="liblinear"))
]) **bold text**
"""




Here, Pipeline takes a list of steps. Each step is a tuple containing a name (like "cleaner", "tfidf", "clf") and the transformer or model object. Scikit-learn runs these steps sequentially when you fit or predict.2

In [26]:
"""A scikit-learn Pipeline is like a sequence of steps that your data goes through during the machine learning process. Imagine you have a factory assembly line. Each station on the line performs a specific task on the product before it moves to the next station.

In machine learning, these "stations" are different data preprocessing and modeling steps. For example:

Cleaning the data: Removing punctuation, making text lowercase, etc. (Like your TextCleaner).
Feature extraction: Converting text into numerical features that the model can understand (Like your TfidfVectorizer).
Training a model: The algorithm that learns from the data (Like your LogisticRegression).
A Pipeline strings these steps together so you don't have to manually apply each step to your data one by one. When you call .fit() on the pipeline, it fits each step in order on the training data. When you call .predict() or .transform() on the pipeline, it applies each step in order to the new data.

This makes your code cleaner, prevents data leakage (applying steps learned from the test data to the training data), and makes it easier to tune your model's hyperparameters.

In your code, you defined the pipeline like this:

pipe = Pipeline([
    ("cleaner", TextCleaner(remove_stopwords=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=2000)),
    ("clf", LogisticRegression(max_iter=200, solver="liblinear"))
])
Here, Pipeline takes a list of steps. Each step is a tuple containing a name (like "cleaner", "tfidf", "clf") and the transformer or model object. Scikit-learn runs these steps sequentially when you fit or predict.

"""

'A scikit-learn Pipeline is like a sequence of steps that your data goes through during the machine learning process. Imagine you have a factory assembly line. Each station on the line performs a specific task on the product before it moves to the next station.\n\nIn machine learning, these "stations" are different data preprocessing and modeling steps. For example:\n\nCleaning the data: Removing punctuation, making text lowercase, etc. (Like your TextCleaner).\nFeature extraction: Converting text into numerical features that the model can understand (Like your TfidfVectorizer).\nTraining a model: The algorithm that learns from the data (Like your LogisticRegression).\nA Pipeline strings these steps together so you don\'t have to manually apply each step to your data one by one. When you call .fit() on the pipeline, it fits each step in order on the training data. When you call .predict() or .transform() on the pipeline, it applies each step in order to the new data.\n\nThis makes your