In [3]:
import pandas as pd
import numpy as np
import random
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [6]:
def generate_student_data(num_samples=5000):
    data = []
    hobbies = ['Technical', 'Artistic', 'Literary', 'Business']
    activities = ['Robotics', 'Debating', 'Drama', 'Volunteering', 'Entrepreneurship', 'Finance Club']
    
    for _ in range(num_samples):
        logic = random.randint(1, 5)
        creative = random.randint(1, 5)
        leadership = random.randint(1, 5)
        
        math = min(100, max(0, int(random.gauss(60 + (logic * 5), 10))))
        science = min(100, max(0, int(random.gauss(60 + (logic * 5), 10))))
        social = min(100, max(0, int(random.gauss(60 + (creative * 3) + (leadership * 2), 10))))
        english = min(100, max(0, int(random.gauss(60 + (creative * 5), 10))))
     
        if logic >= 4:
            hobby = random.choices(hobbies, weights=[0.6, 0.1, 0.1, 0.2])[0]
        elif creative >= 4:
            hobby = random.choices(hobbies, weights=[0.1, 0.5, 0.3, 0.1])[0]
        elif leadership >= 4:
            hobby = random.choices(hobbies, weights=[0.1, 0.1, 0.2, 0.6])[0]
        else:
            hobby = random.choice(hobbies)
    
        if hobby == 'Technical':
            activity = random.choice(['Robotics', 'Volunteering'])
        elif hobby == 'Business':
            activity = random.choice(['Entrepreneurship', 'Finance Club', 'Volunteering'])
        elif hobby in ['Artistic', 'Literary']:
            activity = random.choice(['Drama', 'Debating', 'Volunteering'])
        else:
            activity = random.choice(activities)

        science_score = (math * 0.4) + (science * 0.4) + (logic * 5)
        if hobby == 'Technical': science_score += 15
        if activity == 'Robotics': science_score += 10
        
        commerce_score = (math * 0.3) + (social * 0.4) + (leadership * 5)
        if hobby == 'Business': commerce_score += 15
        if activity in ['Entrepreneurship', 'Finance Club']: commerce_score += 10
        
        humanities_score = (social * 0.4) + (english * 0.4) + (creative * 5)
        if hobby in ['Artistic', 'Literary']: humanities_score += 15
        if activity in ['Drama', 'Debating']: humanities_score += 10

        if science_score >= commerce_score and science_score >= humanities_score:
            stream = "Science"
        elif commerce_score > science_score and commerce_score >= humanities_score:
            stream = "Commerce"
        else:
            stream = "Humanities"
            
        data.append([math, science, social, english, hobby, activity, logic, creative, leadership, stream])

    columns = ['math_marks', 'science_marks', 'social_marks', 'english_marks', 
               'hobby', 'activity', 'logic_score', 'creative_score', 'leadership_score', 'stream']
    
    return pd.DataFrame(data, columns=columns)

df = generate_student_data(5000)
print(f"Dataset generated with {df.shape[0]} rows.")
df.head()
df.to_csv('stream_dataset.csv', index=False)
print("Dataset saved as stream_dataset.csv")

Dataset generated with 5000 rows.
Dataset saved as stream_dataset.csv


In [None]:
X = df.drop('stream', axis=1)
y = df['stream']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
categorical_features = ['hobby', 'activity']
numeric_features = ['math_marks', 'science_marks', 'social_marks', 'english_marks', 
                    'logic_score', 'creative_score', 'leadership_score']
#Using OnHotEncoder 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

model_pipeline.fit(X_train, y_train)
print("Model training complete")

Model training complete


In [None]:
y_pred = model_pipeline.predict(X_test)

# CHECKING ACCURACY OF THE TRAINED MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 95.30%

Classification Report:
              precision    recall  f1-score   support

    Commerce       0.90      0.98      0.94       197
  Humanities       0.96      0.96      0.96       407
     Science       0.98      0.94      0.96       396

    accuracy                           0.95      1000
   macro avg       0.94      0.96      0.95      1000
weighted avg       0.95      0.95      0.95      1000



In [None]:
# CHECKING WHETHER THE MODEL IS PREDICTING THE STREAM OR NOT
new_student_data = pd.DataFrame({
    'math_marks': [85],
    'science_marks': [90],
    'social_marks': [60],
    'english_marks': [70],
    'hobby': ['Technical'],
    'activity': ['Robotics'],
    'logic_score': [5],
    'creative_score': [3],
    'leadership_score': [2]
})
prediction = model_pipeline.predict(new_student_data)
print(f"Predicted Stream: {prediction[0]}")

Predicted Stream: Science


In [None]:
joblib.dump(model_pipeline, '10_stream_predictor_model.pkl')
print("Model saved as '10_stream_predictor_model.pkl'")

Model saved as '10_stream_predictor_model.pkl'
