In [None]:
import pandas as pd
import numpy as np

# --- Configuration ---
# Number of synthetic respondents to generate.
NUM_SAMPLES = 40000
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(42)

# One-letter personality codes; the order here fixes the order of the
# question columns produced further down.
RIASEC_TYPES = list("RIASEC")

# The stream labels a sample can be assigned to.
STREAMS = ["Science", "Commerce", "Arts", "Vocational"]

# Default stream for each primary personality type.  Two types feed Arts and
# two feed Commerce, while Science and Vocational get one type each.
TYPE_TO_STREAM_MAP = {
    "R": "Vocational",  # Realistic
    "I": "Science",     # Investigative
    "A": "Arts",        # Artistic
    "S": "Arts",        # Social
    "E": "Commerce",    # Enterprising
    "C": "Commerce",    # Conventional
}

# --- Data Generation ---
# Column names for the twelve questionnaire answers: two questions per
# RIASEC type, listed in the same order as RIASEC_TYPES.
question_columns = (
    'Q1_Realistic', 'Q2_Realistic',
    'Q3_Investigative', 'Q4_Investigative',
    'Q5_Artistic', 'Q6_Artistic',
    'Q7_Social', 'Q8_Social',
    'Q9_Enterprising', 'Q10_Enterprising',
    'Q11_Conventional', 'Q12_Conventional',
)

data = []
for _ in range(NUM_SAMPLES):
    # Pick the respondent's dominant type and look up its default stream.
    dominant = np.random.choice(RIASEC_TYPES)
    stream = TYPE_TO_STREAM_MAP[dominant]

    # Label noise: with probability 0.15 an 'I' sample is relabelled
    # Vocational and an 'R' sample is relabelled Science.  The random draw
    # is made for every sample, whatever its type, which keeps the RNG
    # sequence independent of the chosen type.
    relabel = np.random.rand() < 0.15
    if relabel and dominant == 'I':
        stream = 'Vocational'
    elif relabel and dominant == 'R':
        stream = 'Science'

    # Two answers per type: the dominant type scores in 3..5, every other
    # type in 1..3 (randint's upper bound is exclusive).
    answers = []
    for riasec in RIASEC_TYPES:
        low, high = (3, 6) if riasec == dominant else (1, 4)
        answers.append(np.random.randint(low, high))
        answers.append(np.random.randint(low, high))

    # Pair every answer with its column name, then attach the label last so
    # 'PrimaryStream' ends up as the final column.
    record = dict(zip(question_columns, answers))
    record['PrimaryStream'] = stream
    data.append(record)

# --- Save the Final, Imbalanced Dataset ---
# Turn the list of row dicts into a table and write it out without the
# pandas index column.
output_filename = 'dataset.csv'
df = pd.DataFrame(data)
df.to_csv(output_filename, index=False)

# Report where the file went, its shape, and how many rows each stream got.
print("--- Original Imbalanced Dataset Saved ---")
print(f"Dataset saved to '{output_filename}' with shape: {df.shape}")
print()
print("Final class distribution:")
stream_counts = df['PrimaryStream'].value_counts()
print(stream_counts)

--- Original Imbalanced Dataset Saved ---
Dataset saved to 'dataset.csv' with shape: (40000, 13)

Final class distribution:
PrimaryStream
Arts          13491
Commerce      13261
Vocational     6671
Science        6577
Name: count, dtype: int64
