<a href="https://colab.research.google.com/github/theouterlimitz/SDSS_Star_Classification/blob/main/02_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the cleaned, prepared dataset from its pickle file into `df`.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a source you trust.
cleaned_data_path = 'cleaned_sdss_data.pkl'
df = pd.read_pickle(cleaned_data_path)

print("Cleaned dataset loaded successfully!")
# Quick sanity check of what was loaded (columns, dtypes, non-null counts).
df.info()

In [None]:
# Feature engineering: add one "color" column per pair of neighbouring bands
# (u-g, g-r, r-i, i-z), each computed as the difference of the two band
# columns. The columns are added to `df` in place, so the feature matrix built
# from `df` in the data-preparation cell includes them.
print("Performing feature engineering: Creating color features...")
for first_band, second_band in zip('ugri', 'griz'):
    df[f'{first_band}-{second_band}'] = df[first_band] - df[second_band]

print("New color features created successfully.")
print("Updated columns:", df.columns.tolist())

**Prepare Data for Machine Learning**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

print("--- Preparing Data for Machine Learning ---")

# 1. Separate the target column ('class') from the feature columns
#    (every other column in `df`).
y = df['class']
X = df.drop(columns=['class'])

# 2. Encode the class labels as integer codes. The fitted encoder is kept so
#    the codes can be mapped back to class names in reports and plots.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Target labels encoded successfully.")
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print(f"Class mapping: {dict(zip(class_names, class_codes))}")

# 3. Hold out 20% of the rows as a test set. Stratifying on the encoded labels
#    preserves the class proportions in both splits; the fixed random_state
#    makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)
print("\nData split into training and testing sets.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# 4. Standardize the features. The scaler is fitted on the training split only
#    and then applied to both splits, so no test-set statistics are used when
#    computing the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nFeatures scaled successfully.")

**Train and Evaluate Baseline Model (Random Forest)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Training Random Forest Classifier ---")

# 1. Initialize and train the model.
#    random_state pins the forest's randomness; n_jobs=-1 uses all CPU cores.
rfc = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rfc.fit(X_train_scaled, y_train)
print("Model training complete.")

# 2. Predict once on the held-out test set and reuse `y_pred` for every
#    metric and plot below.
print("\n--- Evaluating Model Performance ---")
y_pred = rfc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.4f}")

# 3. Per-class precision / recall / F1, labelled with the original class names.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix built from the predictions computed above (no second
# predict call), consistent with the neural-network and ensemble cells.
print("\nConfusion Matrix:")
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred,
                                        display_labels=label_encoder.classes_,
                                        cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for Random Forest Classifier')
plt.show()

# 4. Analyze feature importance.
#    The forest was trained on the scaled array, whose columns are in the same
#    order as X.columns, so X.columns labels the importances.
print("\n--- Analyzing Feature Importance ---")
importances = rfc.feature_importances_
feature_importance_df = pd.Series(importances, index=X.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as y (with the legend suppressed) gives one palette color per bar.
sns.barplot(x=feature_importance_df.values,
            y=feature_importance_df.index,
            hue=feature_importance_df.index,
            palette='mako',
            legend=False,
            ax=ax)
ax.set_title('Feature Importance from Random Forest', fontsize=16)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

**Building the Neural Network**

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# --- Inputs expected from earlier cells in this notebook ---
# X_train_scaled, X_test_scaled, y_train, y_test
# label_encoder (supplies the class names for the reports and plots)

# ===================================================================
# --- Step 1: Build and Train the Neural Network ---
# ===================================================================
print("--- Building the Neural Network ---")

# Seed Python, NumPy and the Keras backend so weight initialization, dropout
# and batch shuffling are repeatable between runs (the scikit-learn steps
# already pin random_state=42). Some GPU kernels may still be non-deterministic.
keras.utils.set_random_seed(42)

# Size the network from the data rather than hard-coding the dimensions.
n_features = X_train_scaled.shape[1]
n_classes = len(label_encoder.classes_)

# Define the Model Architecture
model = keras.Sequential([
    # Explicit input specification: one value per feature column.
    keras.Input(shape=(n_features,)),

    # Hidden layer 1
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.3),  # Dropout helps prevent overfitting

    # Hidden layer 2
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),

    # Hidden layer 3
    keras.layers.Dense(32, activation='relu'),

    # Output layer: one neuron per class; 'softmax' turns the outputs into a
    # probability for each class.
    keras.layers.Dense(n_classes, activation='softmax')
])

# Compile the Model
# 'sparse_categorical_crossentropy' matches the integer-encoded labels in
# y_train / y_test (no one-hot encoding needed).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("Model built and compiled successfully.")
model.summary()

# Train the Model
print("\n--- Training the Neural Network (this will take a few minutes) ---")
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=20,             # 20 passes through the training data
    batch_size=32,         # Process data in batches of 32
    validation_split=0.2,  # Hold back 20% of the training data for validation
    verbose=1              # Show the progress bar
)

# ===================================================================
# --- Step 2: Visualize Training History ---
# ===================================================================
print("\n--- Visualizing Training History ---")
history_df = pd.DataFrame(history.history)

# Accuracy on the left, loss on the right; each panel overlays the training
# and validation curves so over-/under-fitting is visible at a glance.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

ax_acc.plot(history_df['accuracy'], label='Train Accuracy')
ax_acc.plot(history_df['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('Model Accuracy over Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(loc='lower right')

ax_loss.plot(history_df['loss'], label='Train Loss')
ax_loss.plot(history_df['val_loss'], label='Validation Loss')
ax_loss.set_title('Model Loss over Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(loc='upper right')

fig.tight_layout()
plt.show()

# ===================================================================
# --- Step 3: Evaluate Final Performance on the Unseen Test Set ---
# ===================================================================
print("\n--- Evaluating Final Model Performance on Test Data ---")

# Evaluate the model on the test set to get final loss and accuracy
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Final Test Accuracy: {accuracy:.4f}")

# The model outputs one probability per class; the predicted class is the
# index of the largest probability in each row.
y_pred_probs = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

# Classification Report
print("\nClassification Report (Neural Network):")
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

# Confusion Matrix
print("\nConfusion Matrix (Neural Network):")
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_classes,
                                        display_labels=label_encoder.classes_,
                                        cmap='cividis',
                                        ax=ax)
ax.set_title('Confusion Matrix for Neural Network')
plt.show()

**Advanced Neural Network (Deeper Architecture with Class Weights)**

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.utils import class_weight # Make sure to import this
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# --- Inputs expected from earlier cells in this notebook ---
# X_train_scaled, X_test_scaled, y_train, y_test
# label_encoder (supplies the class names for the report)

# ===================================================================
# --- Build, Train, and Evaluate an Advanced Neural Network ---
# ===================================================================
print("--- Building Advanced Neural Network (V2 with Class Weights) ---")

# Seed Python, NumPy and the Keras backend so weight initialization, dropout
# and batch shuffling are repeatable between runs (the scikit-learn steps
# already pin random_state=42). Some GPU kernels may still be non-deterministic.
keras.utils.set_random_seed(42)

# Size the network from the data rather than hard-coding the dimensions.
n_features = X_train_scaled.shape[1]
n_classes = len(label_encoder.classes_)

# 1. Define the Model Architecture
#    Wider and deeper than the baseline network: four hidden layers
#    (256 -> 128 -> 64 -> 32) with dropout that tapers from 0.4 to 0.2.
model_v2_weighted = keras.Sequential([
    keras.Input(shape=(n_features,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation='relu'),
    # One output neuron per class; softmax yields per-class probabilities.
    keras.layers.Dense(n_classes, activation='softmax')
])

# 2. Compile the Model (same optimizer, loss and metric as the baseline network)
model_v2_weighted.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("Model V2 built and compiled successfully.")
model_v2_weighted.summary()


# 3. Calculate Class Weights to handle imbalance
#    'balanced' gives each class a weight inversely proportional to its
#    frequency in y_train, so rarer classes contribute more to the loss.
print("\nCalculating class weights to handle data imbalance...")
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=train_classes,
    y=y_train
)
# Key the dict by the actual class labels (not by position) so each weight is
# tied to the label it was computed for; plain int/float keep the printout tidy.
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(train_classes, class_weights)
}
print("Calculated Class Weights:", class_weight_dict)


# 4. Train the model, weighting each sample's loss by its class weight
print("\n--- Training Advanced Neural Network with Class Weights ---")
history_v2_weighted = model_v2_weighted.fit(
    X_train_scaled,
    y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weight_dict,
    verbose=1
)


# ===================================================================
# --- Evaluate the Weighted Model ---
# ===================================================================
print("\n--- Evaluating Final Weighted Model Performance on Test Data ---")

loss_v2_w, accuracy_v2_w = model_v2_weighted.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Final Test Accuracy (Weighted Model V2): {accuracy_v2_w:.4f}")

# Predicted class = index of the largest softmax probability in each row.
y_pred_probs_v2_w = model_v2_weighted.predict(X_test_scaled)
y_pred_classes_v2_w = np.argmax(y_pred_probs_v2_w, axis=1)

print("\nClassification Report (Weighted Neural Network V2):")
print(classification_report(y_test, y_pred_classes_v2_w, target_names=label_encoder.classes_))

**Manual Soft-Voting Ensemble (Random Forest + Class-Weighted Neural Network)**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, classification_report

# --- Inputs expected from earlier cells in this notebook ---
# rfc: the trained RandomForestClassifier
# model_v2_weighted: the trained, class-weighted Keras neural network
# X_test_scaled, y_test, label_encoder

# =======================================================================
# --- Final Experiment: Manually Create and Evaluate an Ensemble Model ---
# =======================================================================
print("--- Building Manual Ensemble (Soft Voting) ---")

# 1. Get the predicted class probabilities from both models
print("Getting predictions from Random Forest...")
y_pred_proba_rf = rfc.predict_proba(X_test_scaled)

print("Getting predictions from Neural Network...")
y_pred_proba_nn = model_v2_weighted.predict(X_test_scaled)

# Averaging column-by-column is only meaningful if column k refers to the same
# class in both matrices. The network's columns are the class codes 0..n-1;
# the forest's columns follow rfc.classes_, so verify they line up.
nn_class_codes = np.arange(y_pred_proba_nn.shape[1])
if not np.array_equal(rfc.classes_, nn_class_codes):
    raise ValueError(
        "Random Forest classes do not line up with the neural network outputs; "
        "the probabilities cannot be averaged column-wise."
    )

# 2. Average the probabilities from both models
# This is the core of "soft voting": each model gets an equal say.
print("Averaging model probabilities...")
y_pred_proba_ensemble = (y_pred_proba_rf + y_pred_proba_nn) / 2.0

# 3. The final prediction is the class with the highest average probability
y_pred_ensemble = np.argmax(y_pred_proba_ensemble, axis=1)
print("Final predictions calculated.")


# 4. Evaluate the Manual Ensemble Model
print("\n--- Evaluating Manual Ensemble Model Performance ---")
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"Final Test Accuracy (Manual Ensemble): {accuracy_ensemble:.4f}")

print("\nClassification Report (Manual Ensemble):")
print(classification_report(y_test, y_pred_ensemble, target_names=label_encoder.classes_))

# 5. Generate the Final Confusion Matrix
print("\nConfusion Matrix (Manual Ensemble):")
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_ensemble,
                                        display_labels=label_encoder.classes_,
                                        cmap='plasma',
                                        ax=ax)
ax.set_title('Confusion Matrix for Manual Ensemble Model')
plt.show()