In [7]:
# import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# load the dataset
df = pd.read_csv('../data/train/high_school_career_recommendation_dataset.csv')
df.head()
print("Columns:", df.columns.tolist())
print("Unique Education values:", df['Education'].unique())


Columns: ['CandidateID', 'Age', 'Education', 'GPA', 'Interest', 'Favorite_Subject', 'Extracurriculars', 'Personality_Trait', 'Recommended_Career']
Unique Education values: ['O-Level' 'A-Level']


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Split data into features and target
X = df.drop('Recommended_Career', axis=1)
y = df['Recommended_Career']

# One-hot encoding for categorical features
X = pd.get_dummies(X, columns=['Education', 'Interest', 'Favorite_Subject',
                       'Extracurriculars', 'Personality_Trait'])

# Splitting into train and test
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

#Second split to get validation and test data
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# Scaling features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(label_encoder, '../models/label_encoder.pkl')

print('Training data shape', x_train_scaled.shape)
print('Validation data shape', x_val_scaled.shape)
print('Test data shape', x_test_scaled.shape)

Training data shape (600, 51)
Validation data shape (200, 51)
Test data shape (200, 51)


In [10]:
# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import os

logistic_model = LogisticRegression(
    multi_class='multinomial', #for multiclass classification
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)


logistic_model.fit(x_train_scaled, y_train_encoded)
joblib.dump(logistic_model, '../models/logistic_regression_model.pkl')
joblib.dump(x_train.columns, '../models/scaler.pkl.columns')

y_pred_train = logistic_model.predict(x_train_scaled)
y_pred_test = logistic_model.predict(x_test_scaled)

#Evaluate model
print(classification_report(y_train_encoded, y_pred_train, target_names=label_encoder.classes_))
print(classification_report(y_test_encoded, y_pred_test, target_names=label_encoder.classes_))



                         precision    recall  f1-score   support

             Accountant       1.00      1.00      1.00         6
              Architect       0.84      0.87      0.86        31
                 Artist       0.78      0.70      0.74        20
       Business Manager       1.00      0.97      0.98        31
                   Chef       1.00      1.00      1.00         1
                 Doctor       0.79      0.73      0.76        15
               Engineer       0.82      0.78      0.80        18
Environmental Scientist       0.82      0.86      0.84        21
      Financial Analyst       0.91      0.94      0.93        33
       Graphic Designer       0.90      0.93      0.92        29
             Journalist       0.97      0.92      0.94        36
                 Lawyer       0.96      0.96      0.96        25
   Marketing Specialist       1.00      0.95      0.97        40
               Musician       1.00      1.00      1.00        24
                  Nurse 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
def make_predictions(model_path, X):

    # Load the model
    model = joblib.load(model_path)
    # Make predictions
    predictions = model.predict(X)
    # Convert probabilities to binary labels (0 or 1)

    return predictions


In [12]:
model_path = '../models/logistic_regression_model.pkl'
make_predictions(model_path, x_test_scaled)

array([12, 16,  2,  7, 16, 18, 16, 16, 10, 16, 12, 15, 18,  1,  5, 16, 18,
       18,  3, 18, 18, 18, 13,  3, 17,  1,  8,  3, 18, 17,  9, 15, 18, 16,
       16, 18, 10, 16, 18,  8, 15, 18, 13, 18,  7, 18,  3, 18, 17, 18, 18,
        8, 16, 10, 18, 12,  7, 12, 10, 10, 16,  1, 18,  7,  8,  6,  1, 12,
       18, 10, 18, 10,  5,  2, 16,  8,  3, 18, 16,  7, 18, 13, 15,  9,  8,
       11, 18,  5, 17, 16,  1, 10,  3,  3,  3,  3, 12,  8, 16, 12,  6, 15,
       10,  7,  8,  5, 17, 18,  7,  5,  9, 18, 18,  8,  7, 18, 18, 16,  6,
       18, 16, 12,  8,  3, 18, 17, 12, 16,  7, 18, 10,  8,  3,  3,  1,  9,
       18,  8, 14, 14, 18, 18, 16,  2,  3, 10,  6,  5,  3, 16,  9,  7, 18,
        8, 18, 18,  3, 10, 19, 11, 16, 18, 15, 19, 11, 19,  9,  8, 18, 18,
       15,  7, 16, 10, 13, 17, 19,  8,  9, 18, 18, 13,  8,  6, 10, 16,  0,
        5, 10, 18, 17, 18,  3, 10, 16, 12, 18, 18,  8,  3])