### Import all necessary libraries

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Read/Load the dataset

In [69]:
# Read the job dataset CSV into a DataFrame, then print its dimensions and
# column labels as a quick sanity check of what was loaded.
DATASET_PATH = "job_dataset.csv"
data = pd.read_csv(DATASET_PATH)
print(data.shape)
print(data.columns)

(19000, 11)
Index(['Title', 'Experience', 'Location', 'Skill_1', 'Skill_2', 'Skill_3',
       'Skill_4', 'Skill_5', 'Skill_6', 'Skill_7', 'Skill_8'],
      dtype='object')


### Combine all skills and define features (X)

In [70]:
# Build one space-joined "document" of skills per row; this is the text that
# TF-IDF vectorises later.
# Any missing skill cell (NaN) is replaced with '' BEFORE the string cast:
# astype(str) on NaN yields the literal text "nan", which would otherwise be
# learned as if it were a real skill token. Padding with '' also matches how
# the prediction helpers represent an absent skill.
skill_columns = ['Skill_1', 'Skill_2', 'Skill_3', 'Skill_4',
                 'Skill_5', 'Skill_6', 'Skill_7', 'Skill_8']
data['Skills'] = (
    data[skill_columns]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)
X = data['Skills']

### Define the target (y) column

In [71]:
# The label to predict is the value in the 'Title' column of each row.
TARGET_COLUMN = 'Title'
y = data[TARGET_COLUMN]

### Splitting the data into train and test sets

In [72]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Apply TF-IDF (Term Frequency-Inverse Document Frequency) to transform the skills into numerical features.

In [73]:
# Learn the vocabulary and IDF weights from the training documents only, then
# project both splits into that same feature space. The test split is never
# used for fitting.
tfidf = TfidfVectorizer()
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Convert target column to numeric values

In [74]:
# Encode titles as integer codes using ONE category list shared by both splits.
# Converting each split to 'category' independently derives the codes from
# whatever titles happen to appear in that split, so the same integer could
# stand for different titles in train and test, silently corrupting the
# accuracy comparison. The list is taken from the training titles, which is
# also what the prediction helpers use to decode model output.
# A test title never seen in training is encoded as -1.
title_dtype = pd.CategoricalDtype(
    categories=y_train.astype('category').cat.categories
)
y_train_numeric = y_train.astype(title_dtype).cat.codes
y_test_numeric = y_test.astype(title_dtype).cat.codes

### Training a Random Forest Classifier Model

In [75]:
# Random forest of 100 trees with a fixed seed, trained on the TF-IDF features
# of the training split against the integer-encoded titles.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train_tfidf, y_train_numeric)

### Make predictions on test set and evaluate accuracy

In [76]:
# Score the held-out split: predict a code for each test row and compare it
# with the encoded true title.
# NOTE(review): the recorded run reports 100.00%. Worth confirming that the
# dataset does not contain identical skill rows on both sides of the split.
y_pred = rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test_numeric, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 100.00%


### Getting unique skills 

In [77]:
# Stack the eight skill columns into one long Series, drop empty cells, and
# keep each distinct skill once (in order of first appearance).
skill_cols = [f'Skill_{position}' for position in range(1, 9)]
all_skills = pd.concat([data[column] for column in skill_cols]).dropna()
unique_skills_list = all_skills.unique().tolist()
print(unique_skills_list)

['Git', 'Data Structures', 'Algorithms', 'OOP', 'Java', 'Python', 'SQL', 'Excel', 'Power BI', 'Statistics', 'Tableau', 'Visualization', 'NLP', 'PyTorch', 'Scikit-learn', 'TensorFlow', 'Computer Vision', 'Deep Learning', 'Penetration Testing', 'Firewalls', 'Linux', 'Network Security', 'Ethical Hacking', 'Incident Response', 'Kubernetes', 'AWS', 'Docker', 'Terraform', 'Networking', 'DevOps', 'Google Cloud', 'Azure', 'HTML', 'React', 'MongoDB', 'JavaScript', 'CSS', 'Express.js', 'Node.js', 'Express', 'Forecasting', 'Budgeting', 'Financial Modeling', 'Investment Analysis', 'Private Equity', 'Mergers', 'Risk Analysis', 'IPO', 'Corporate Finance', 'SEO', 'PPC', 'Google Analytics', 'Social Media', 'Google Ads', 'Email Marketing', 'Conflict Resolution', 'Payroll', 'Employee Relations', 'Training', 'Onboarding', 'HR Policies', 'Recruitment', 'Surgery', 'Pathology', 'Diagnosis', 'Emergency Medicine', 'Patient Care', 'Public Health', 'Lesson Planning', 'Classroom Management', 'Student Assessment'

### Defining functions to make predictions using the model

In [80]:
def predictTitle(new_data):
    """Predict the single most likely job title for a collection of skills.

    Skills that do not appear in ``unique_skills_list`` are reported with a
    printed message and left out of the model input.

    Parameters
    ----------
    new_data : iterable of str, or str
        Skill names. Any number of skills is accepted; a lone string is
        treated as one skill rather than being split into characters.

    Returns
    -------
    The title from the training-set categories that the model predicts.
    If no skill is recognised, the model is still queried with an empty
    document and its answer is returned.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(new_data, str):
        new_data = [new_data]
    skills = list(new_data)

    # Split the input into skills the dataset knows and skills it does not.
    recognized_skills = [skill for skill in skills if skill in unique_skills_list]
    missing_skills = [skill for skill in skills if skill not in unique_skills_list]
    if missing_skills:
        print(f"The following skills are not present in the dataset and will be ignored: {missing_skills}")

    # Join only the recognised skills. The previous fixed Skill_1..Skill_8
    # slots raised IndexError for fewer than eight skills and silently
    # discarded everything after the eighth.
    new_data_combined = ' '.join(recognized_skills)

    new_data_tfidf = tfidf.transform([new_data_combined])

    predicted_label_numeric = rf_model.predict(new_data_tfidf)

    # Decode the integer code with the training-set category list, which is
    # the list the training codes were derived from.
    predicted_label = y_train.astype('category').cat.categories[predicted_label_numeric[0]]

    return predicted_label

In [89]:
def predictTop3Titles(new_data):
    """Return the three most probable job titles for a collection of skills.

    Skills that do not appear in ``unique_skills_list`` are reported with a
    printed message and left out of the model input.

    Parameters
    ----------
    new_data : iterable of str, or str
        Skill names. Any number of skills is accepted; a lone string is
        treated as one skill rather than being split into characters.

    Returns
    -------
    list of (title, str)
        Up to three ``(title, "NN.NN%")`` pairs ordered from most to least
        probable according to ``rf_model.predict_proba``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(new_data, str):
        new_data = [new_data]
    skills = list(new_data)

    # Split the input into skills the dataset knows and skills it does not.
    recognized_skills = [skill for skill in skills if skill in unique_skills_list]
    missing_skills = [skill for skill in skills if skill not in unique_skills_list]
    if missing_skills:
        print(f"The following skills are not present in the dataset and will be ignored: {missing_skills}")

    # Join only the recognised skills. The previous fixed Skill_1..Skill_8
    # slots raised IndexError for fewer than eight skills and silently
    # discarded everything after the eighth.
    new_data_combined = ' '.join(recognized_skills)

    # Transform the new data using the already-fitted TF-IDF vectoriser.
    new_data_tfidf = tfidf.transform([new_data_combined])

    # One row of class probabilities; column i belongs to rf_model.classes_[i].
    predicted_probs = rf_model.predict_proba(new_data_tfidf)[0]

    # Positions of the three largest probabilities, highest first.
    top_3_indices = predicted_probs.argsort()[-3:][::-1]

    # Map column positions to the encoded labels through classes_ before
    # decoding, instead of assuming column position equals category code.
    top_3_codes = rf_model.classes_[top_3_indices]
    top_3_titles = y_train.astype('category').cat.categories[top_3_codes]
    top_3_probs = predicted_probs[top_3_indices] * 100  # Convert to percentage

    # Pair every title with its probability formatted as a percentage string.
    return [(title, f"{prob:.2f}%") for title, prob in zip(top_3_titles, top_3_probs)]


### Testing using different inputs

In [90]:
# Machine-learning flavoured profile, ranked with the top-3 helper.
skills = ['Python', 'SQL', 'Machine Learning', 'Deep Learning',
          'TensorFlow', 'NLP', 'Statistics', 'R']
print(f"Predicted title: {predictTop3Titles(skills)}")

The following skills are not present in the dataset and will be ignored: ['Machine Learning', 'R']
Predicted title: [('Machine Learning Engineer', '42.00%'), ('Software Engineer', '14.00%'), ('Data Analyst', '11.00%')]


In [91]:
# Mechanical-engineering flavoured profile, ranked with the top-3 helper.
skills = ['AutoCAD', 'SolidWorks', 'Finite Element Analysis (FEA)', 'Matlab',
          'ANSYS', '3D Modeling', 'Mechanical Design', 'Thermodynamics']
print(f"Predicted title: {predictTop3Titles(skills)}")

The following skills are not present in the dataset and will be ignored: ['Finite Element Analysis (FEA)', 'Matlab', 'ANSYS', '3D Modeling']
Predicted title: [('Mechanical Engineer', '36.00%'), ('Full Stack Developer', '10.00%'), ('Cloud Engineer', '8.00%')]


In [92]:
# Marketing flavoured profile, ranked with the top-3 helper.
skills = ['SEO', 'Google Analytics', 'Content Marketing', 'Social Media Marketing',
          'Email Marketing', 'PPC Advertising', 'Market Research', 'Brand Management']
print(f"Predicted title: {predictTop3Titles(skills)}")

The following skills are not present in the dataset and will be ignored: ['Content Marketing', 'Social Media Marketing', 'PPC Advertising', 'Market Research']
Predicted title: [('Marketing Manager', '37.00%'), ('Cloud Engineer', '12.00%'), ('Hotel Manager', '8.00%')]


In [84]:
# Human-resources flavoured profile, using the single-title helper.
skills = ['Recruitment', 'Talent Management', 'Employee Relations', 'Onboarding',
          'Compensation & Benefits', 'HRIS', 'Performance Management', 'Conflict Resolution']

print(f"Predicted title: {predictTitle(skills)}")

The following skills are not present in the dataset and will be ignored: ['Talent Management', 'Compensation & Benefits', 'HRIS']
Predicted title: HR Manager


In [85]:
# Pharmacy flavoured profile, using the single-title helper.
skills = ['Pharmaceutical Care', 'Medication Therapy Management', 'Pharmacology',
          'Patient Counseling', 'Prescription Dispensing', 'Drug Interactions',
          'Clinical Pharmacolog', 'Inventory Management']
print(f"Predicted title: {predictTitle(skills)}")

The following skills are not present in the dataset and will be ignored: ['Pharmaceutical Care', 'Medication Therapy Management', 'Patient Counseling', 'Prescription Dispensing', 'Drug Interactions', 'Clinical Pharmacolog', 'Inventory Management']
Predicted title: Doctor
