# Importing Libraries and Preparing the Dataset for Employability Prediction

In [None]:
# Essential Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for ML tasks
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer # To handle different column types
from sklearn.pipeline import Pipeline # To chain preprocessing and model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# For Colab specific tasks (like mounting Drive)
from google.colab import drive

# Attach Google Drive to this runtime so the dataset stored there can be read.
# force_remount=True is passed so the mount is redone even if one already exists.
_drive_mount_point = '/content/drive'
print("Mounting Google Drive...")
drive.mount(_drive_mount_point, force_remount=True)
print("Drive mounted successfully.")

# Data Loading and Initial Exploration

In [None]:
# Read the employability CSV from Drive into `df` and preview the first rows.
# Adjust `dataset_path` to match your own Drive layout.
dataset_path = '/content/drive/My Drive/Colab Notebooks/Student-Employability-Dataset.csv'

try:
    df = pd.read_csv(dataset_path)
    print(f"Dataset '{dataset_path}' loaded successfully.")
    print("\nHead of the dataset:")
    # display() renders the frame as a rich table inside a notebook.
    display(df.head())
except FileNotFoundError:
    # Nothing downstream can run without the data, so stop the notebook here.
    missing_file_message = f"Error: Dataset not found at '{dataset_path}'. Please check the file path."
    print(missing_file_message)
    raise SystemExit("Dataset file not found.")
except Exception as load_error:
    # Any other failure (parsing, permissions, ...) is reported and also aborts the run.
    generic_failure_message = f"An error occurred while loading the dataset: {load_error}"
    print(generic_failure_message)
    raise SystemExit("Error during dataset loading.")


In [None]:
# Dimensions and column labels of the freshly loaded frame
initial_shape = df.shape
initial_columns = df.columns.tolist()
print("Initial shape of the dataset:", initial_shape)
print("\nInitial columns of the dataset:\n", initial_columns)

# Dtypes and non-null counts (df.info() prints its own report)
print("\nDataset Info:")
df.info()

# Per-column summary: number of distinct values and number of missing entries
print("\nUnique values and Missing values per column:")
for col_name in df.columns:
    col_values = df[col_name]
    distinct_count = col_values.nunique()
    missing_count = col_values.isnull().sum()
    print(f"- {col_name}: Unique values = {distinct_count}, Missing values = {missing_count}")

#  Data Preprocessing for Employability Prediction

 1.  Dropping irrelevant columns.
 2.  Handling Missing Values (if any).
 3.  Defining features (X) and the target (y).
 4.  Encoding the categorical target variable ('CLASS').
 5.  Identifying numerical and categorical input features.
 6.  Building a preprocessing pipeline that scales numerical features (MLPs are sensitive to feature scaling) and one-hot encodes categorical input features.

In [None]:
# 1. Dropping Irrelevant Columns (as specified by user)
# Only columns that are actually present are dropped, so a missing
# 'Name of Student' column does not raise.
columns_to_drop = ['Name of Student']
actual_columns_to_drop = [name for name in columns_to_drop if name in df.columns]

if not actual_columns_to_drop:
    # Nothing matched: report what was requested and what the frame really contains.
    print(f"Specified column(s) to drop ({columns_to_drop}) not found in the dataset's current columns: {df.columns.tolist()}.")
else:
    df = df.drop(actual_columns_to_drop, axis=1)
    print(f"Dropped columns: {actual_columns_to_drop}")

# Report the resulting layout whether or not anything was removed.
shape_after_drop = df.shape
remaining_columns = df.columns.tolist()
print("Shape after attempting to drop columns:", shape_after_drop)
print("Remaining columns:", remaining_columns)

In [None]:
# 2. Handling Missing Values (Illustrative - Adapt as needed)
# Fills missing feature values (median for numeric columns, mode for object
# columns) and drops rows whose target is missing.
# IMPORTANT: The target column is assumed to be 'CLASS'. If it's different, update it below.
# NOTE(review): the median/mode are computed on the full dataset, i.e. before the
# train/test split performed later in this file, so test rows influence the fill
# values. Move imputation into the preprocessing pipeline if that matters.
TARGET_COLUMN_NAME = 'CLASS' # Or 'Employable', etc. - check your CSV

if TARGET_COLUMN_NAME not in df.columns:
    print(f"Error: Target column '{TARGET_COLUMN_NAME}' not found in the DataFrame after dropping columns.")
    print(f"Available columns are: {df.columns.tolist()}")
    raise SystemExit(f"Target column '{TARGET_COLUMN_NAME}' missing.")

potential_feature_cols = df.drop(columns=[TARGET_COLUMN_NAME]).columns
print(f"\nChecking for missing values in potential feature columns: {potential_feature_cols.tolist()}")
missing_values_in_features = df[potential_feature_cols].isnull().sum().sum()

if missing_values_in_features > 0:
    print(f"Warning: Found {missing_values_in_features} missing values in feature columns.")

    # Numerical features: fill with the column median.
    for col in df.select_dtypes(include=np.number).columns:
        if col == TARGET_COLUMN_NAME or df[col].isnull().sum() == 0:  # Don't impute target here
            continue
        median_value = df[col].median()
        if pd.isna(median_value):
            # A column with no observed values has no median; fillna would be a no-op.
            print(f"Warning: Numerical feature '{col}' has no non-missing values; median imputation skipped.")
            continue
        df[col] = df[col].fillna(median_value)
        print(f"Filled NaNs in numerical feature '{col}' with its median.")

    # Categorical (object) features: fill with the most frequent value.
    for col in df.select_dtypes(include='object').columns:
        if col == TARGET_COLUMN_NAME or df[col].isnull().sum() == 0:  # Don't impute target here
            continue
        mode_values = df[col].mode()
        if mode_values.empty:
            # mode() is empty when every value is missing; indexing it would raise.
            print(f"Warning: Categorical feature '{col}' has no non-missing values; mode imputation skipped.")
            continue
        df[col] = df[col].fillna(mode_values.iloc[0])
        print(f"Filled NaNs in categorical feature '{col}' with its mode.")

    # Columns of other dtypes (or all-missing columns) are not imputed above; make that visible.
    remaining_missing = df[potential_feature_cols].isnull().sum()
    remaining_missing = remaining_missing[remaining_missing > 0]
    if not remaining_missing.empty:
        print(f"Warning: Missing values remain in feature columns after imputation: {remaining_missing.to_dict()}")
else:
    print("No missing values found in potential feature columns (excluding target).")

missing_target_count = df[TARGET_COLUMN_NAME].isnull().sum()
if missing_target_count > 0:
    print(f"Warning: Target column '{TARGET_COLUMN_NAME}' has {missing_target_count} missing values.")
    # Option 1: Drop rows with missing target
    df = df.dropna(subset=[TARGET_COLUMN_NAME])
    print(f"Dropped rows with missing target. New shape: {df.shape}")
    # Option 2: Impute target (less common, depends on problem) - not shown here
    # Option 2: Impute target (less common, depends on problem) - not shown here


In [None]:
# 3. Define features (X) and target (y)
# X is every column except the target; y is the target column itself.
# Ensure TARGET_COLUMN_NAME is correct based on your CSV file.
X = df.drop(columns=[TARGET_COLUMN_NAME])
y = df[TARGET_COLUMN_NAME]

print(f"\nTarget '{TARGET_COLUMN_NAME}' distribution (original):")
print(y.value_counts())
print("\nFeatures (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

# df still has rows but X came out empty, which here means X has no columns left.
# This check already covers the "X has no columns" case, so the separate warning
# that used to follow it could never be reached and has been removed.
if X.empty and not df.empty:
    raise ValueError("Features (X) are empty. This might be due to incorrect column dropping or an empty input file after dropping the target column.")


In [None]:
# 4. Encoding the Target Variable
# The target column (e.g., 'CLASS') is likely categorical ('Employable', 'LessEmployable').
# We need to convert it to numerical form for the MLP.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Store class names for later use in evaluation
encoded_class_names = label_encoder.classes_

print(f"\nEncoded target '{TARGET_COLUMN_NAME}' distribution:")
# Show mapping: LabelEncoder encodes each class as its position in classes_,
# so the enumerate index is the encoded value (no extra transform call needed).
for encoded_value, class_name in enumerate(encoded_class_names):
    print(f"'{class_name}' -> {encoded_value}")
print(pd.Series(y_encoded).value_counts()) # Show distribution of encoded values


In [None]:
# 4. Encoding the Target Variable
# NOTE(review): this cell repeats the target-encoding step that already appears
# earlier in the file. Re-running it refits the encoder on the same `y`;
# consider removing one of the two copies.
# The target column (e.g., 'CLASS') is likely categorical ('Employable', 'LessEmployable').
# We need to convert it to numerical form for the MLP.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Store class names for later use in evaluation
encoded_class_names = label_encoder.classes_

print(f"\nEncoded target '{TARGET_COLUMN_NAME}' distribution:")
# Show mapping: a class's encoded value is its index in classes_, so the
# enumerate index is used directly instead of calling transform per class.
for encoded_value, class_name in enumerate(encoded_class_names):
    print(f"'{class_name}' -> {encoded_value}")
print(pd.Series(y_encoded).value_counts()) # Show distribution of encoded values


In [None]:
# 5. Split the input features by dtype so each group can get its own preprocessing:
#    numeric columns are scaled later, object columns are encoded later.
numeric_view = X.select_dtypes(include=np.number)
object_view = X.select_dtypes(include='object')  # input features only; the target is not part of X
numerical_cols = numeric_view.columns.tolist()
categorical_cols = object_view.columns.tolist()

print("\nNumerical input columns for preprocessing:", numerical_cols)
print("Categorical input columns for preprocessing:", categorical_cols)

# Sanity checks: flag the cases where nothing usable was identified.
found_any_feature_type = bool(numerical_cols or categorical_cols)
if not found_any_feature_type and not X.empty:
    print("Warning: No numerical or categorical input features identified in X. Check dtypes. All columns might be of an unexpected type or already processed.")
elif X.empty and df.shape[0] > 0:
    print("Warning: Features DataFrame X is empty. This could be due to all columns being dropped or used as target.")

# Plot one histogram per numerical feature
if not numerical_cols:
    print("\nNo numerical features to visualize in X.")
else:
    print("\nHistograms of Numerical Features:")
    X[numerical_cols].hist(bins=15, figsize=(15, 10))
    plt.suptitle('Histograms of Student Employability Features', y=1.02)
    plt.tight_layout()
    plt.show()



In [None]:
# 6. Create preprocessing pipeline
# Numerical features will be scaled.
# Categorical input features (if any) will be One-Hot Encoded.
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

transformers_list = []
if numerical_cols:
    transformers_list.append(('num', StandardScaler(), numerical_cols))
if categorical_cols:
    # handle_unknown='ignore' keeps transform() from failing on categories
    # that were not seen during fit.
    transformers_list.append(('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols))

if transformers_list:
    preprocessor = ColumnTransformer(
        transformers=transformers_list,
        remainder='passthrough' # Keeps columns not specified, if any
    )
else:
    print("Warning: No transformers created as no numerical or categorical columns were identified in X for preprocessing.")
    # Identity preprocessor for the edge case where X is meant to be used as-is.
    # FunctionTransformer() with no func is the identity transform; unlike a
    # lambda it keeps the resulting pipeline picklable.
    preprocessor = FunctionTransformer()

# Splitting Data and Building the MLP for Employability Prediction

In [None]:
# Split data into training and testing sets using the ENCODED target
# We split X *before* applying the ColumnTransformer to avoid data leakage.

# Stratify helps maintain class proportions, especially important for smaller or imbalanced datasets.
# Adjust test_fraction as needed.
test_fraction = 0.25  # Common split ratio

# Stratification is only possible when there is more than one class, every class
# has at least two samples, and both resulting partitions can hold one sample per
# class. (The previous check compared len(y_encoded) with the sum of the class
# counts, which are always equal, so stratification was never enabled.)
_, class_counts = np.unique(y_encoded, return_counts=True)
n_samples = len(y_encoded)
n_classes = len(class_counts)
n_test_samples = int(np.ceil(test_fraction * n_samples))
n_train_samples = n_samples - n_test_samples
can_stratify = (
    n_classes > 1
    and class_counts.min() >= 2
    and n_test_samples >= n_classes
    and n_train_samples >= n_classes
)
if not can_stratify:
    print("Note: stratified splitting is not possible for this target distribution; using a plain random split.")

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
    X, y_encoded,
    test_size=test_fraction,
    random_state=42,
    stratify=y_encoded if can_stratify else None
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_test_encoded.shape)

if X_train.empty or X_test.empty:
    print("Warning: Training or testing set is empty. This is likely due to a very small dataset size after splitting.")
    print("MLP training might fail or be meaningless.")


In [None]:
# Assemble the preprocessing + MLP pipeline and fit it on the training split.
# The classifier predicts the encoded class labels.

# Hyperparameters below are example values and should be tuned (e.g., with
# GridSearchCV). For very small datasets, prefer a simpler model or a much smaller MLP.
classifier_settings = dict(
    hidden_layer_sizes=[100],   # one hidden layer with 100 neurons
    activation='relu',
    solver='adam',
    max_iter=1000,
    learning_rate_init=0.001,
    alpha=0.0001,               # L2 regularization strength
    random_state=42,
    early_stopping=True,        # hold out part of the training data and stop when it stops improving
    validation_fraction=0.1,    # share of training data held out for early stopping
    n_iter_no_change=10,        # patience, in iterations, before stopping
    verbose=False,              # set to True to print training progress
)
mlp = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(**classifier_settings)),
])

print("MLP Pipeline created for Employability Prediction.")

# Fit only when there is something to fit on
training_data_available = not X_train.empty and not pd.Series(y_train_encoded).empty
if not training_data_available:
    print("\nSkipping MLP training due to empty training set. This usually happens with extremely small initial datasets.")
else:
    print(f"\nTraining the MLP model to predict '{TARGET_COLUMN_NAME}'...")
    try:
        mlp.fit(X_train, y_train_encoded)
        print("Training finished.")
    except Exception as training_error:
        # Best-effort: report the failure and let the notebook continue.
        print(f"An error occurred during MLP training: {training_error}")
        print("This could be due to issues with preprocessing, data types, or insufficient data.")

# Model Evaluation (Employability Prediction)

In [None]:
# Predict on the test data and report accuracy, per-class metrics and the confusion matrix.
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# hasattr(classifier, 'predict') is true even for an unfitted estimator, so it
# cannot tell whether training succeeded. check_is_fitted raises NotFittedError
# for an unfitted estimator (and TypeError if the step is missing/not an estimator).
try:
    check_is_fitted(mlp.named_steps.get('classifier'))
    model_is_fitted = True
except (NotFittedError, TypeError):
    model_is_fitted = False

if not X_test.empty and len(y_test_encoded) > 0 and model_is_fitted:
    try:
        y_pred_encoded = mlp.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
        print(f"\nOverall Accuracy on Test Set: {accuracy:.4f}")

        # Encoded labels are 0..n_classes-1. Passing them explicitly keeps the report
        # and the confusion matrix aligned with the class names even when a class is
        # absent from this test split.
        class_labels = list(range(len(encoded_class_names)))
        # The report and the heatmap expect string names; the original labels may not be strings.
        class_name_strings = [str(name) for name in encoded_class_names]

        # Classification Report
        print("\nClassification Report:")
        # zero_division=0 reports 0 instead of warning when a class has no predicted samples.
        print(classification_report(
            y_test_encoded, y_pred_encoded,
            labels=class_labels,
            target_names=class_name_strings,
            zero_division=0,
        ))

        # Confusion Matrix
        conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded, labels=class_labels)
        print("\nConfusion Matrix:")
        print(conf_matrix)

        # Optional: Visualize the confusion matrix
        if len(encoded_class_names) > 0:
            plt.figure(figsize=(8, 6))
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_name_strings, yticklabels=class_name_strings)
            plt.title(f'Confusion Matrix for {TARGET_COLUMN_NAME} Prediction')
            plt.xlabel('Predicted Label')
            plt.ylabel('True Label')
            plt.show()
    except Exception as e:
        # Best-effort: report the failure and let the notebook continue.
        print(f"An error occurred during model evaluation: {e}")
else:
    print("\nSkipping model evaluation due to empty test set or untrained/invalid model.")

# Single Sample Prediction (Employability Prediction)

In [None]:
# Predict for a single sample: class probabilities, encoded class and original label.
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# hasattr(classifier, 'predict') is true even for an unfitted estimator, so the
# fitted state is checked explicitly instead. TypeError covers a missing or
# non-estimator 'classifier' step.
try:
    check_is_fitted(mlp.named_steps.get('classifier'))
    model_is_fitted = True
except (NotFittedError, TypeError):
    model_is_fitted = False

if not X_test.empty and model_is_fitted:
    # Take the first sample from the original X_test (before the pipeline's preprocessing).
    # iloc[:1] keeps it as a one-row DataFrame, so the pipeline sees the same columns as in training.
    sample_to_predict_raw = X_test.iloc[:1]
    print("\nRaw Test Sample for Prediction:\n", sample_to_predict_raw)

    try:
        # Predict probability for this sample using the full pipeline
        # Output will be probabilities for [class_0, class_1, ...]
        prob = mlp.predict_proba(sample_to_predict_raw)
        print(f"\nProbability Results for Sample (Classes: {encoded_class_names}):", prob)

        # Encoded class chosen by the model for this sample
        predicted_class_encoded = mlp.predict(sample_to_predict_raw)[0]

        # Map the encoded class back to its original label text
        predicted_label_text = label_encoder.inverse_transform([predicted_class_encoded])[0]

        print("\nPredicted Class (encoded): ", predicted_class_encoded)
        print(f"Predicted Label for '{TARGET_COLUMN_NAME}': ", predicted_label_text)
    except Exception as e:
        # Best-effort: report the failure and let the notebook continue.
        print(f"Error during single sample prediction: {e}")
        print("This might happen if the model was not trained due to insufficient data or preprocessing issues.")

else:
    print("\nSkipping single sample prediction due to empty test set or untrained/invalid model.")