In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib # Used for saving the model

# --- Step 1: Load the Dataset ---
# Read 'dataset.csv' (path is relative to the notebook's working directory) into `df`.
# Later cells take both the feature columns and the 'PrimaryStream' target from this frame.
df = pd.read_csv('dataset.csv')
print("Dataset loaded successfully!")
# Trailing expression: the notebook renders the first five rows as a preview.
df.head(n=5)



Dataset loaded successfully!


Unnamed: 0,Q1_Realistic,Q2_Realistic,Q3_Investigative,Q4_Investigative,Q5_Artistic,Q6_Artistic,Q7_Social,Q8_Social,Q9_Enterprising,Q10_Enterprising,Q11_Conventional,Q12_Conventional,PrimaryStream
0,3,1,1,3,2,3,5,5,3,1,3,2,Arts
1,2,2,2,1,1,2,2,1,3,3,3,3,Commerce
2,3,2,4,5,2,3,3,1,3,1,3,3,Vocational
3,2,1,2,2,2,1,2,1,4,5,3,1,Commerce
4,2,2,2,2,4,4,2,1,3,2,2,2,Arts


In [66]:

from imblearn.over_sampling import SMOTE


# 2. Split the frame into the feature matrix (everything except the target)
#    and the target column itself
X = df.drop(columns='PrimaryStream')
y = df['PrimaryStream']

# 3. Map the string stream labels to integer codes. The fitted encoder is kept
#    so the codes can be translated back to class names later.
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

# 4. Hold out 20% for testing BEFORE any resampling.
#    The test split is never oversampled, so it keeps the original (imbalanced)
#    class proportions; `stratify` preserves those proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.2,
    random_state=42,
)

print("--- Before SMOTE ---")
print("Training set shape:", X_train.shape)
# np.unique(..., return_counts=True) yields each encoded class and its sample count
unique, counts = np.unique(y_train, return_counts=True)
print("Training set distribution:", dict(zip(unique, counts)))


# 5. Oversample with SMOTE on the training split ONLY, so no synthetic samples
#    leak into the evaluation data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\n--- After SMOTE ---")
print("Resampled training set shape:", X_train_resampled.shape)
# Same per-class count, now on the resampled labels
unique, counts = np.unique(y_train_resampled, return_counts=True)
print("Resampled training set distribution:", dict(zip(unique, counts)))




# Ready for training: fit on (X_train_resampled, y_train_resampled) and evaluate on
# (X_test, y_test). Note that no feature scaling is applied anywhere in this cell.

--- Before SMOTE ---
Training set shape: (32000, 12)
Training set distribution: {np.int64(0): np.int64(10793), np.int64(1): np.int64(10609), np.int64(2): np.int64(5261), np.int64(3): np.int64(5337)}

--- After SMOTE ---
Resampled training set shape: (43172, 12)
Resampled training set distribution: {np.int64(0): np.int64(10793), np.int64(1): np.int64(10793), np.int64(2): np.int64(10793), np.int64(3): np.int64(10793)}


In [67]:
# Build an RBF-kernel SVM and fit it on the SMOTE-balanced training split.
# probability=True makes predict_proba available on the fitted model.
svm_model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=40,
)
svm_model.fit(X_train_resampled, y_train_resampled)

# Predict encoded class labels for the held-out test split, which was not resampled
y_pred = svm_model.predict(X_test)
    
    

In [68]:
# Per-class precision / recall / F1 on the held-out test split
print("Classification Report:")
# `classes_` on the fitted LabelEncoder holds the original string labels,
# positioned by their encoded integer value
class_names = label_encoder.classes_

# target_names makes the report rows show the class names instead of the integer codes
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Overall fraction of test samples predicted correctly
print("Accuracy Score:")
print(accuracy_score(y_true=y_test, y_pred=y_pred))

Classification Report:
              precision    recall  f1-score   support

        Arts       0.99      0.98      0.98      2698
    Commerce       0.98      0.98      0.98      2652
     Science       0.83      0.82      0.83      1316
  Vocational       0.81      0.85      0.83      1334

    accuracy                           0.93      8000
   macro avg       0.91      0.91      0.91      8000
weighted avg       0.93      0.93      0.93      8000

Confusion Matrix:
[[2642   25   15   16]
 [  18 2593   17   24]
 [   8    8 1081  219]
 [   5    7  185 1137]]
Accuracy Score:
0.931625


In [70]:
# --- 4. Save the Model and Preprocessors ---
# Persist the fitted classifier together with the label encoder: the encoder is needed
# at inference time to turn the model's integer predictions back into class names.
# NOTE: joblib files are pickle-based, so only load them from trusted sources.
joblib.dump(value=svm_model, filename='svm_model.pkl')
joblib.dump(value=label_encoder, filename='label_encoder.pkl')


['label_encoder.pkl']