In [1]:
from google.colab import files
uploaded = files.upload()


Saving UpdatedResumeDataSet.csv.zip to UpdatedResumeDataSet.csv.zip


In [2]:
import zipfile, os

# make directory to store extracted data
os.makedirs("data/resume-dataset", exist_ok=True)

# extract zip
with zipfile.ZipFile("UpdatedResumeDataSet.csv.zip", "r") as zip_ref:
    zip_ref.extractall("data/resume-dataset")

# list files to verify extraction
os.listdir("data/resume-dataset")


['UpdatedResumeDataSet.csv']

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# download NLTK data (first time only)
nltk.download('punkt')
nltk.download('stopwords')

# load dataset
df = pd.read_csv("data/resume-dataset/UpdatedResumeDataSet.csv")

# check columns
print("Columns:", df.columns.tolist())
print("\nSample data:\n", df.head(2))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Columns: ['Category', 'Resume']

Sample data:
        Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    # remove emails, urls, non-alphabets
    text = re.sub(r'\S+@\S+', ' ', str(text))
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower()
    # tokenize and remove stopwords
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
    return ' '.join(tokens)

# Apply cleaning function
df['cleaned_resume'] = df['Resume'].apply(clean_text)

# Show before vs after example
print("Before:\n", df['Resume'][0][:300])
print("\nAfter Cleaning:\n", df['cleaned_resume'][0][:300])

# Check how many resumes cleaned
print("\nTotal resumes cleaned:", len(df))


Before:
 Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language pr

After Cleaning:
 skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction

Total resumes cleaned: 962


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # limit features for simplicity

# Fit and transform the cleaned resumes
X = vectorizer.fit_transform(df['cleaned_resume'])

# Labels
y = df['Category']

print("Feature matrix shape:", X.shape)
print("Sample feature names:", vectorizer.get_feature_names_out()[:10])


Feature matrix shape: (962, 1000)
Sample feature names: ['ability' 'acceptance' 'access' 'according' 'account' 'accounting'
 'accounts' 'achieve' 'achieved' 'achieving']


In [7]:
from sklearn.model_selection import train_test_split

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 769
Testing samples: 193


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9637305699481865

Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      0.33      0.50         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      0.40      0.57         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      0.92      0.96        12
       

In [9]:
import joblib

# Save the trained model
joblib.dump(model, "resume_classifier_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [10]:
# Load the saved model and vectorizer
model = joblib.load("resume_classifier_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Example: New resume text
new_resume = """
Experienced Python developer with expertise in machine learning, data analysis,
and web development. Skilled in pandas, numpy, scikit-learn, and Django framework.
"""

# Clean the text (same function as before)
cleaned_resume = clean_text(new_resume)

# Transform using TF-IDF
X_new = vectorizer.transform([cleaned_resume])

# Predict category
predicted_category = model.predict(X_new)
print("Predicted Category:", predicted_category[0])


Predicted Category: Data Science
