In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

Load dataset

In [5]:
data = pd.read_csv('career_dataset.csv')
print(data.head())
print(data.info())

   CandidateID           Name  Age   Education  \
0            1       John Doe   28  Bachelor's   
1            2     Jane Smith   32    Master's   
2            3    Bob Johnson   24  Bachelor's   
3            4    Emily Davis   26  Bachelor's   
4            5  Michael Brown   30    Master's   

                                      Skills                Interests  \
0      Python;Data Analysis;Machine Learning  Technology;Data Science   
1         Java;System Design;Cloud Computing  Software Development;AI   
2  Graphic Design;UI/UX;Adobe Creative Suite       Arts;Digital Media   
3            Python;Deep Learning;Statistics            Healthcare;AI   
4     Project Management;Communication;Agile      Business;Management   

  Recommended_Career  Recommendation_Score  
0     Data Scientist                  0.95  
1  Software Engineer                  0.90  
2        UX Designer                  0.88  
3      AI Researcher                  0.93  
4    Project Manager               

Preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

In [7]:
data = data.drop(['CandidateID', 'Name'], axis=1)

In [8]:
le_edu = LabelEncoder()
data['Education'] = le_edu.fit_transform(data['Education'])

In [9]:
mlb_skills = MultiLabelBinarizer()
data_skills = mlb_skills.fit_transform(data['Skills'].str.split(';'))
skills_df = pd.DataFrame(data_skills, columns=mlb_skills.classes_)
data = pd.concat([data, skills_df], axis=1)
data = data.drop('Skills', axis=1)
mlb_interests = MultiLabelBinarizer()
data_interests = mlb_interests.fit_transform(data['Interests'].str.split(';'))
interests_df = pd.DataFrame(data_interests, columns=mlb_interests.classes_)
data = pd.concat([data, interests_df], axis=1)
data = data.drop('Interests', axis=1)

Features and Target

In [10]:
X = data.drop(['Recommended_Career', 'Recommendation_Score'], axis=1)
le_career = LabelEncoder()
y = le_career.fit_transform(data['Recommended_Career'])


Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Train

In [12]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


Evaluate the Model

In [14]:
import numpy as np
unique_test_classes = np.unique(y_test)
print(unique_test_classes)
print([le_career.classes_[i] for i in unique_test_classes])


[ 2  3  4  5  6  7  8 10 12 13 15 16 17 18 19 20 21 23 24 25 26 27 28 30
 31]
['Automation Engineer', 'Backend Developer', 'Biostatistician', 'Business Analyst', 'Cloud Engineer', 'Content Strategist', 'Cybersecurity Analyst', 'Data Analyst', 'Data Scientist', 'Deep Learning Engineer', 'Digital Marketer', 'Embedded Systems Engineer', 'Financial Analyst', 'Front-end Developer', 'Full Stack Developer', 'Graphic Designer', 'Machine Learning Engineer', 'Mobile Developer', 'NLP Engineer', 'Project Manager', 'Research Analyst', 'Research Scientist', 'Software Developer', 'UX Designer', 'UX Researcher']


In [15]:
from sklearn.metrics import classification_report
labels = unique_test_classes
target_names = [le_career.classes_[i] for i in labels]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))


                           precision    recall  f1-score   support

      Automation Engineer       0.50      1.00      0.67         1
        Backend Developer       0.50      1.00      0.67         1
          Biostatistician       0.00      0.00      0.00         2
         Business Analyst       1.00      0.50      0.67         2
           Cloud Engineer       0.00      0.00      0.00         1
       Content Strategist       0.00      0.00      0.00         1
    Cybersecurity Analyst       1.00      0.33      0.50         3
             Data Analyst       0.00      0.00      0.00         4
           Data Scientist       1.00      0.50      0.67         2
   Deep Learning Engineer       0.00      0.00      0.00         1
         Digital Marketer       1.00      0.60      0.75         5
Embedded Systems Engineer       0.33      1.00      0.50         1
        Financial Analyst       0.67      1.00      0.80         2
      Front-end Developer       1.00      1.00      1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Make Predictions

In [17]:
new_candidate = {}
new_candidate['Age'] = 25
new_candidate['Education'] = le_edu.transform(['Bachelor\'s'])[0]

# For each skill column, set 1 if the candidate has it, else 0
skills = ['Python', 'Java', 'Graphic Design', ...]  # All skill columns from X
candidate_skills = ['Python', 'Graphic Design']     # Skills for this candidate
for skill in skills:
    new_candidate[skill] = 1 if skill in candidate_skills else 0

# For each interest column, set 1 if present, else 0
interests = ['AI', 'Data Science', ...]             # All interest columns from X
candidate_interests = ['AI']                        # Interests for this candidate
for interest in interests:
    new_candidate[interest] = 1 if interest in candidate_interests else 0


In [19]:
# Get all feature columns from your training set
feature_columns = X.columns.tolist()

# Candidate's actual info
candidate_info = {
    'Age': 25,
    'Education': le_edu.transform(['Bachelor\'s'])[0],
    # List all skills/interests the candidate has
    'skills': ['Python', 'Data Analysis'],
    'interests': ['Technology', 'Data Science']
}

# Build the full feature dictionary
new_candidate = {}
for col in feature_columns:
    if col == 'Age':
        new_candidate[col] = candidate_info['Age']
    elif col == 'Education':
        new_candidate[col] = candidate_info['Education']
    elif col in candidate_info['skills']:
        new_candidate[col] = 1
    elif col in candidate_info['interests']:
        new_candidate[col] = 1
    else:
        # For all other skill/interest columns not present, set to 0
        new_candidate[col] = 0

# Create DataFrame and ensure column order matches
new_df = pd.DataFrame([new_candidate])
new_df = new_df[X.columns]  # This should now work!


In [20]:
print("Expected columns:", X.columns.tolist())
print("Candidate keys:", new_candidate.keys())


Expected columns: ['Age', 'Education', '.NET', 'AI', 'Adobe Creative Suite', 'Adobe Illustrator', 'Adobe Photoshop', 'Adobe XD', 'Agile', 'Algorithms', 'Analytics', 'Android', 'Automation', 'Big Data', 'Business Analysis', 'C#', 'C++', 'CRM', 'CSS', 'Cloud Computing', 'Communication', 'Content Creation', 'Content Strategy', 'Content Writing', 'Copywriting', 'Creativity', 'Cybersecurity', 'Data Analysis', 'Data Mining', 'Data Science', 'Data Structures', 'Data Visualization', 'Data Warehousing', 'Deep Learning', 'DevOps', 'Digital Illustration', 'Digital Marketing', 'ETL', 'Econometrics', 'Embedded Systems', 'Excel', 'Financial Analysis', 'Graphic Design', 'HTML', 'Illustration', 'Interaction Design', 'IoT', 'Java', 'JavaScript', 'Linux', 'Machine Learning', 'Marketing Strategy', 'Microservices', 'NLP', 'Natural Language Processing', 'Negotiation', 'Network Security', 'Node.js', 'Project Management', 'Prototyping', 'Python', 'R', 'React', 'Research', 'Risk Analysis', 'SEO', 'SQL', 'Soci

In [21]:
pred = model.predict(new_df)
print('Recommended Career:', le_career.inverse_transform(pred)[0])


Recommended Career: Data Analyst


Accuracy

In [22]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.475


Precision, Recall, and F1 Score

In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)


Precision: 0.5333333333333333
Recall: 0.475
F1 Score: 0.45041666666666663


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           2       0.50      1.00      0.67         1
           3       0.50      1.00      0.67         1
           4       0.00      0.00      0.00         2
           5       1.00      0.50      0.67         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       1.00      0.33      0.50         3
          10       0.00      0.00      0.00         4
          11       0.00      0.00      0.00         0
          12       1.00      0.50      0.67         2
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          15       1.00      0.60      0.75         5
          16       0.33      1.00      0.50         1
          17       0.67      1.00      0.80         2
          18       1.00      1.00      1.00         1
          19       0.33      1.00      0.50         1
          20       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 3 1 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0