In [23]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [24]:
# Load the dataset
df = pd.read_csv("cs_students.csv")
df.head()

Unnamed: 0,Student ID,Name,Gender,Age,GPA,Major,Interested Domain,Projects,Future Career,Python,SQL,Java
0,1,John Smith,Male,21,3.5,Computer Science,Artificial Intelligence,Chatbot Development,Machine Learning Researcher,Strong,Strong,Weak
1,2,Alice Johnson,Female,20,3.2,Computer Science,Data Science,Data Analytics,Data Scientist,Average,Strong,Weak
2,3,Robert Davis,Male,22,3.8,Computer Science,Software Development,E-commerce Website,Software Engineer,Strong,Strong,Average
3,4,Emily Wilson,Female,21,3.7,Computer Science,Web Development,Full-Stack Web App,Web Developer,Weak,Strong,Strong
4,5,Michael Brown,Male,23,3.4,Computer Science,Cybersecurity,Network Security,Information Security Analyst,Average,Weak,Strong


In [25]:
# Clean the dataset
df = df.drop(columns=["Student ID", "Name"])
df.head()

Unnamed: 0,Gender,Age,GPA,Major,Interested Domain,Projects,Future Career,Python,SQL,Java
0,Male,21,3.5,Computer Science,Artificial Intelligence,Chatbot Development,Machine Learning Researcher,Strong,Strong,Weak
1,Female,20,3.2,Computer Science,Data Science,Data Analytics,Data Scientist,Average,Strong,Weak
2,Male,22,3.8,Computer Science,Software Development,E-commerce Website,Software Engineer,Strong,Strong,Average
3,Female,21,3.7,Computer Science,Web Development,Full-Stack Web App,Web Developer,Weak,Strong,Strong
4,Male,23,3.4,Computer Science,Cybersecurity,Network Security,Information Security Analyst,Average,Weak,Strong


In [26]:
# Encode categorical features
label_encoders = {}
for col in df.drop(columns=["Future Career"]).select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [27]:
# Encode and filter the target variable
original_labels = df["Future Career"].copy()
target_encoder = LabelEncoder()
df["Future Career"] = target_encoder.fit_transform(original_labels)

class_counts = df["Future Career"].value_counts()
valid_classes = class_counts[class_counts > 1].index
df = df[df["Future Career"].isin(valid_classes)]

filtered_labels = original_labels[df.index]
target_encoder = LabelEncoder()
df["Future Career"] = target_encoder.fit_transform(filtered_labels)

In [28]:
# Split features and target
X = df.drop(columns=["Future Career"])
y = df["Future Career"]

In [29]:
# Select the top 7 most important features
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_temp.fit(X, y)
importances = pd.Series(rf_temp.feature_importances_, index=X.columns).sort_values(ascending=False)
top_features = importances.head(7).index.tolist()
print("Top 7 features:", top_features)

Top 7 features: ['Interested Domain', 'Projects', 'GPA', 'Python', 'SQL', 'Age', 'Java']


In [30]:
# Exclude 'Age' to prevent bias in recommendations
top_features.remove('Age')
print("Top features:", top_features)

Top features: ['Interested Domain', 'Projects', 'GPA', 'Python', 'SQL', 'Java']


In [31]:
# Split the dataset into training and testing sets using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    df[top_features], y, test_size=0.2, random_state=42, stratify=y
)

In [32]:
# Handle class imbalance with oversampling
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [33]:
# Train the Random Forest model
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42, 
    class_weight='balanced'
)
model.fit(X_train_resampled, y_train_resampled)

In [34]:
# Evaluate model accuracy
train_acc = accuracy_score(y_train_resampled, model.predict(X_train_resampled))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"\nTrain Accuracy: {train_acc:.2f}")
print(f"Test Accuracy: {test_acc:.2f}")


Train Accuracy: 0.96
Test Accuracy: 0.94


In [35]:
# Model evaluation using the evaluation metrics
y_pred = model.predict(X_test)
used_labels = np.array(sorted(set(y_test))) 
target_names = target_encoder.inverse_transform(used_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=used_labels, target_names=target_names, zero_division=0))


Classification Report:
                              precision    recall  f1-score   support

               AI Researcher       1.00      1.00      1.00         1
   Cloud Solutions Architect       1.00      1.00      1.00         3
                Data Analyst       1.00      1.00      1.00         1
              Data Scientist       1.00      1.00      1.00         2
      Database Administrator       1.00      1.00      1.00         3
              Game Developer       0.00      0.00      0.00         1
         Graphics Programmer       1.00      1.00      1.00         2
Information Security Analyst       1.00      0.75      0.86         4
   Machine Learning Engineer       1.00      1.00      1.00         3
        Mobile App Developer       1.00      1.00      1.00         4
      NLP Research Scientist       1.00      1.00      1.00         2
           Software Engineer       1.00      1.00      1.00         3
               Web Developer       1.00      1.00      1.00      

In [36]:
# Evaluate how the model generalizes to different parts of the dataset using cross-validation accuracy
print("\nCross-Validation Scores (Stratified 5-fold, with oversampling):")
cv_pipeline = Pipeline([
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(cv_pipeline, df[top_features], y, cv=cv)

print("Scores per fold: " + ", ".join(["[{:.4f}]".format(score) for score in cv_scores]))
print("Mean CV Accuracy: {:.2f} ± {:.2f}".format(cv_scores.mean(), cv_scores.std()))


Cross-Validation Scores (Stratified 5-fold, with oversampling):
Scores per fold: [0.8182], [1.0000], [0.9091], [0.9394], [0.8750]
Mean CV Accuracy: 0.91 ± 0.06


In [37]:
# Save the trained Random Forest model and encoders
with open("cprs_model.pkl", "wb") as f:
    pickle.dump(model, f)
    
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

with open("target_encoder.pkl", "wb") as f:
    pickle.dump(target_encoder, f)

In [38]:
# top_features list
with open("top_features.pkl", "wb") as f:
    pickle.dump(top_features, f)

# Save X_train_resampled for LIME
with open("X_train_resampled.pkl", "wb") as f:
    pickle.dump(X_train_resampled, f)