In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('job_skills_dataset_corrected.csv')

# Show basic info
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        5000 non-null   object
 1   Job Category     5000 non-null   object
 2   Company          5000 non-null   object
 3   Salary           5000 non-null   object
 4   Skills Required  5000 non-null   object
dtypes: object(5)
memory usage: 195.4+ KB
None
                      Job Title       Job Category    Company  Salary  \
0  Digital Marketing Specialist  Digital Marketing       Meta  12 LPA   
1              Business Analyst  Business Analysis    Infosys  10 LPA   
2  Digital Marketing Specialist  Digital Marketing      Cisco   8 LPA   
3               DevOps Engineer             DevOps  Microsoft  24 LPA   
4              Business Analyst  Business Analysis        IBM  10 LPA   

                                     Skills Required  
0  SEM, Email Marketing, Content Marke

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Clean and normalize skills
df['Skills Required'] = df['Skills Required'].str.lower().str.replace(r'[^\w\s,]', '', regex=True)

# Feature and target
X_raw = df['Skills Required']
y_raw = df['Job Title']

# Vectorize skills
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
X = vectorizer.fit_transform(X_raw)

# Encode job titles
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [3]:
X_train.shape, X_test.shape, len(label_encoder.classes_)

((4000, 117), (1000, 117), 12)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, report, conf_matrix


(0.999,
 '                              precision    recall  f1-score   support\n\n                 AI Engineer       1.00      1.00      1.00        86\n           Backend Developer       1.00      1.00      1.00        87\n            Business Analyst       1.00      1.00      1.00        84\n              Cloud Engineer       0.98      1.00      0.99        64\n                Data Analyst       1.00      1.00      1.00        87\n      Database Administrator       1.00      1.00      1.00        79\n             DevOps Engineer       1.00      0.99      0.99        73\nDigital Marketing Specialist       1.00      1.00      1.00        78\n          Frontend Developer       1.00      1.00      1.00        84\n              Java Developer       1.00      1.00      1.00        83\n            Security Analyst       1.00      1.00      1.00       105\n       Senior Data Scientist       1.00      1.00      1.00        90\n\n                    accuracy                           1.00    

In [5]:
# Define a test function to predict job title from a custom skill string
def predict_job_title(skill_string):
    skill_string = skill_string.lower()
    skill_vector = vectorizer.transform([skill_string])
    prediction = lr_model.predict(skill_vector)
    job_title = label_encoder.inverse_transform(prediction)[0]
    return job_title

# Test with a few custom skill inputs
test_inputs = [
    "AWS, Docker, Kubernetes, CI/CD, Linux",
    "SEO, Google Ads, Email Marketing, Content Writing",
    "Power BI, Business Analysis, SQL, Requirement Gathering",
    "React, JavaScript, CSS, HTML",
    "Python, Machine Learning, Deep Learning, NLP"
]

predictions = {skills: predict_job_title(skills) for skills in test_inputs}
predictions


{'AWS, Docker, Kubernetes, CI/CD, Linux': 'Cloud Engineer',
 'SEO, Google Ads, Email Marketing, Content Writing': 'Digital Marketing Specialist',
 'Power BI, Business Analysis, SQL, Requirement Gathering': 'Business Analyst',
 'React, JavaScript, CSS, HTML': 'Frontend Developer',
 'Python, Machine Learning, Deep Learning, NLP': 'AI Engineer'}

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Train Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_acc = accuracy_score(y_test, rf_model.predict(X_test))

# Train SVM
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_acc = accuracy_score(y_test, svm_model.predict(X_test))

print(f"Logistic Regression Accuracy: {accuracy}")
print(f"Random Forest Accuracy: {rf_acc}")
print(f"SVM Accuracy: {svm_acc}")


Logistic Regression Accuracy: 0.999
Random Forest Accuracy: 0.999
SVM Accuracy: 0.999


In [7]:
import joblib
# Replace this cell:
# vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))

# With this:
from utils import comma_tokenizer


vectorizer = TfidfVectorizer(tokenizer=comma_tokenizer)
vectorizer.fit(X_raw) 
# Save vectorizer and label encoder
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# Save all models
joblib.dump(lr_model, 'logistic_regression_model.pkl')
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(svm_model, 'svm_model.pkl')



['svm_model.pkl']

In [8]:
import pandas as pd

# Load both datasets
df1 = pd.read_csv("job_skills_dataset_corrected.csv")
df2 = pd.read_csv("learning_resources_dataset.csv")

skill_set = set()

# Extract from job_skills_dataset_corrected.csv
for row in df1["Skills Required"]:
    for skill in str(row).split(","):
        cleaned = skill.strip().lower()
        if cleaned:
            skill_set.add(cleaned)

# Attempt from learning_resources_dataset.csv if skill-related column exists
for col in df2.columns:
    if "skill" in col.lower():
        for entry in df2[col]:
            for skill in str(entry).split(","):
                cleaned = skill.strip().lower()
                if cleaned:
                    skill_set.add(cleaned)

# Save as unique_skills.txt
with open("unique_skills.txt", "w") as f:
    for skill in sorted(skill_set):
        f.write(skill + "\n")

print("✅ Skill list generated with", len(skill_set), "unique entries.")


✅ Skill list generated with 66 unique entries.
