### Import all necessary libraries

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Read/Load the dataset

In [2]:
# Read the CSV into a DataFrame, then show its (rows, columns) shape and its column labels.
data = pd.read_csv("job_dataset.csv")
print(data.shape, data.columns, sep="\n")

(19000, 11)
Index(['Title', 'Experience', 'Location', 'Skill_1', 'Skill_2', 'Skill_3',
       'Skill_4', 'Skill_5', 'Skill_6', 'Skill_7', 'Skill_8'],
      dtype='object')


### Combine all skills and define the features (X)

In [3]:
# Collapse the eight Skill_* columns into one space-separated text field per row.
# Every value is cast to str before joining, so the join never sees a non-string.
data['Skills'] = (
    data[[
        'Skill_1', 'Skill_2', 'Skill_3', 'Skill_4',
        'Skill_5', 'Skill_6', 'Skill_7', 'Skill_8',
    ]]
    .apply(lambda row: ' '.join(row.astype(str)), axis=1)
)
# The combined text column is the model input.
X = data['Skills']

### Define the target (y) column

In [4]:
# The job title is the label the classifier learns to predict.
y = data.loc[:, 'Title']

### Splitting the data into train and test sets

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Apply TF-IDF (Term Frequency-Inverse Document Frequency) to transform the skills into numerical features.

In [6]:
# Learn the TF-IDF vocabulary from the training split only, then reuse that same
# fitted vectorizer for the test split so no test information leaks into the features.
#
# token_pattern is overridden on purpose: the library default only keeps tokens of
# two or more word characters, which silently discards one-letter skills such as
# 'R' or 'C'. This pattern keeps every run of one or more word characters.
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Convert target column to numeric values

In [7]:
# Encode titles as integer codes using ONE shared category list derived from the
# training labels. Encoding each split independently assigns codes by position in
# that split's own sorted set of titles, so the same title could receive different
# codes in train and test whenever the two splits do not contain exactly the same
# titles -- which would make the accuracy comparison meaningless.
title_dtype = pd.CategoricalDtype(
    categories=y_train.astype('category').cat.categories
)
y_train_numeric = y_train.astype(title_dtype).cat.codes
# A title that never occurs in the training split is encoded as -1 here, so it can
# never be counted as a correct prediction.
y_test_numeric = y_test.astype(title_dtype).cat.codes

### Training a Random Forest Classifier Model

In [8]:
# Random forest with 100 trees; the fixed seed keeps training reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Train on the TF-IDF features and the integer-encoded titles.
rf_model.fit(X_train_tfidf, y_train_numeric)

### Make predictions on test set and evaluate accuracy

In [12]:
# Predict integer title codes for the held-out rows.
y_pred = rf_model.predict(X_test_tfidf)
# Fraction of held-out rows whose predicted code equals the encoded true title.
accuracy = accuracy_score(y_true=y_test_numeric, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 100.00%


### Define a function to make predictions using the trained model

In [20]:
def predictTitle(new_data):
    """Predict the job title that best matches a collection of skills.

    The skills are joined into a single space-separated string, vectorized with
    the already-fitted ``tfidf`` vectorizer, classified with ``rf_model``, and the
    predicted integer code is mapped back to a title through the categories of
    ``y_train``.

    Parameters
    ----------
    new_data : iterable of skills, or a single str
        Any number of skills (not only eight). Each item is converted with
        ``str`` before joining, mirroring the ``astype(str)`` cast applied to the
        training rows. A bare string is treated as one skill rather than being
        iterated character by character.

    Returns
    -------
    The title from ``y_train``'s categories at the predicted code.

    Raises
    ------
    ValueError
        If ``new_data`` contains no skills.
    """
    # A str is itself iterable; without this guard it would be split into characters.
    if isinstance(new_data, str):
        new_data = [new_data]

    skills_as_text = [str(skill) for skill in new_data]
    if not skills_as_text:
        raise ValueError("new_data must contain at least one skill")

    new_data_combined = ' '.join(skills_as_text)

    # transform() expects an iterable of documents, hence the one-element list.
    new_data_tfidf = tfidf.transform([new_data_combined])

    predicted_label_numeric = rf_model.predict(new_data_tfidf)

    # The model was trained on category codes of y_train, so the same category
    # index is used to translate the predicted code back to a title.
    title_categories = y_train.astype('category').cat.categories

    return title_categories[predicted_label_numeric[0]]

### Testing using different inputs

In [38]:
# Example input: a data-science / machine-learning skill set.
skills = [
    'Python', 'SQL', 'Machine Learning', 'Deep Learning',
    'TensorFlow', 'NLP', 'Statistics', 'R',
]
print(f"Predicted title: {predictTitle(skills)}")

Predicted title: Machine Learning Engineer


In [40]:
# Example input: a mechanical-engineering skill set.
skills = [
    'AutoCAD', 'SolidWorks', 'Finite Element Analysis (FEA)', 'Matlab',
    'ANSYS', '3D Modeling', 'Mechanical Design', 'Thermodynamics',
]
print(f"Predicted title: {predictTitle(skills)}")

Predicted title: Mechanical Engineer


In [44]:
# Example input: a digital-marketing skill set.
skills = [
    'SEO', 'Google Analytics', 'Content Marketing', 'Social Media Marketing',
    'Email Marketing', 'PPC Advertising', 'Market Research', 'Brand Management',
]
print(f"Predicted title: {predictTitle(skills)}")

Predicted title: Marketing Manager


In [47]:
# Example input: a human-resources skill set.
skills = [
    'Recruitment', 'Talent Management', 'Employee Relations', 'Onboarding',
    'Compensation & Benefits', 'HRIS', 'Performance Management', 'Conflict Resolution',
]
print(f"Predicted title: {predictTitle(skills)}")

Predicted title: HR Manager


In [48]:
# Example input: a pharmacy skill set.
skills = [
    'Pharmaceutical Care', 'Medication Therapy Management', 'Pharmacology', 'Patient Counseling',
    'Prescription Dispensing', 'Drug Interactions', 'Clinical Pharmacology', 'Inventory Management',
]
print(f"Predicted title: {predictTitle(skills)}")

Predicted title: Doctor
