In [3]:
import pandas as pd

# Column names to assign to the CSV, in file order. They are passed to
# read_csv via `names=` together with `header=None`, i.e. the file is read
# as having no header row of its own.
column_names = [
    'First Term Gpa',
    'Second Term Gpa',
    'First Language',
    'Funding',
    'School',
    'Fast Track',
    'Coop',
    'Residency',
    'Gender',
    'Prev Education',
    'Age Group',
    'High School Average Mark',
    'Math Score',
    'English Grade',
    'First Year Persistence'
]

# Relative path to the raw data (resolved against the kernel's working directory).
file_path = '../Data/student_data.csv'

# Load the CSV into `df`. Load failures are reported with a message instead of
# a traceback; note that in that case `df` is NOT defined for later cells.
try:
    df = pd.read_csv(
        file_path,
        header=None,          # No header row in the file
        names=column_names,   # Use the column names defined above
        na_values='?'         # Treat the '?' character as missing (NaN)
    )
    print("DataFrame loaded successfully with correct headers and NaN values!")

    # Show the first 5 rows to verify headers and data
    print("\nFirst 5 rows:")
    print(df.head())

    # Show column names, non-null counts and dtypes.
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print("\nDataFrame Info:")
    df.info()

except FileNotFoundError:
    print(f"\nError: Could not find the file at {file_path}")
    print("Please ensure the file exists and the path is correct.")
except Exception as e:
    print(f"\nAn error occurred while loading the file: {e}")

DataFrame loaded successfully with correct headers and NaN values!

First 5 rows:
   First Term Gpa  Second Term Gpa  First Language  Funding  School  \
0        0.000000         0.000000             1.0        2       6   
1        2.500000         2.000000             3.0        4       6   
2        4.250000         3.923077             1.0        1       6   
3        3.020833         2.321429             3.0        4       6   
4        4.275000         4.326923             1.0        2       6   

   Fast Track  Coop  Residency  Gender  Prev Education  Age Group  \
0           2     1          1       2             1.0        1.0   
1           1     2          2       2             1.0        3.0   
2           2     1          1       1             2.0        3.0   
3           1     2          2       2             2.0        3.0   
4           1     1          1       1             2.0        3.0   

   High School Average Mark  Math Score  English Grade  First Year Persisten

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

def split_data_for_task(predict_persistence, test_size=0.2, random_state=89, df=None):
    """
    Split a DataFrame into training and testing sets for one prediction task.

    The chosen target column is removed from the features (X) and returned
    separately (y). All other columns, including the other task's target,
    stay in X.

    Args:
      predict_persistence (bool): If True, the target is 'First Year Persistence'
                                  and the split is stratified on it. If False,
                                  the target is 'Second Term Gpa' and no
                                  stratification is used.
      test_size (float): Forwarded to train_test_split as the proportion of
                         rows placed in the test split.
      random_state (int): Forwarded to train_test_split so the shuffle is
                          reproducible.
      df (pd.DataFrame, optional): The frame to split. When omitted, the
                                   notebook-level global `df` is used, which
                                   keeps existing three-argument calls working.

    Returns:
      tuple or None: (X_train, X_test, y_train, y_test) on success. None if the
                     target column is missing or train_test_split raised (an
                     error message is printed in both cases).

    Raises:
      NameError: If `df` is not passed and no global `df` exists.
    """
    if df is None:
        # Backward-compatible fallback: earlier versions of this function read
        # the global DataFrame implicitly. Prefer passing `df=` explicitly.
        try:
            df = globals()['df']
        except KeyError:
            raise NameError(
                "name 'df' is not defined; pass df=... or load the data first"
            ) from None

    target_column = 'First Year Persistence' if predict_persistence else 'Second Term Gpa'

    # Single existence check covers both tasks (and the later drop/select).
    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in DataFrame.")
        return None

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Stratify only for the persistence target; the GPA target is continuous.
    stratify_on = y if predict_persistence else None

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify_on
        )
    except Exception as e:
        # Can happen e.g. when stratification fails (NaNs or a too-rare class
        # in the target) or when X / y are unexpectedly empty.
        print(f"Error during train_test_split: {e}")
        return None

    print(f"Data split successfully for target: '{target_column}'")
    print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
    print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")

    return X_train, X_test, y_train, y_test

In [None]:
# Run the splitter for the persistence task; as the last expression in the
# cell, its return value is what the notebook displays.
split_data_for_task(predict_persistence=True, test_size=0.2, random_state=89)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Base column groupings for preprocessing. create_preprocessor() may narrow or
# extend these per task, so treat them as defaults rather than final lists.

# Continuous columns.
BASE_NUMERICAL_FEATURES = [
    'First Term Gpa',
    'Second Term Gpa',           # left out when predicting persistence
    'High School Average Mark',
    'Math Score',
]

# Unordered categorical columns (one-hot encoded downstream).
BASE_CAT_NOMINAL_FEATURES = [
    'First Language',            # original note: had NaN
    'Funding',
    'School',                    # original note: only value 6 observed
    'Residency',
    'Gender',
    'Prev Education',            # original note: had NaN (including original 0.0)
    'Fast Track',
    'Coop',
]

# Ordered categorical columns (ordinal encoded downstream).
BASE_CAT_ORDINAL_FEATURES = [
    'Age Group',                 # original note: had NaN
    'English Grade',             # original note: had NaN
]

# Explicit level ordering for each ordinal column, as floats 1.0 .. N.
AGE_GROUP_ORDER = [float(level) for level in range(1, 11)]       # 1.0 .. 10.0
ENGLISH_GRADE_ORDER = [float(level) for level in range(1, 12)]   # 1.0 .. 11.0

# Same order as BASE_CAT_ORDINAL_FEATURES: one category list per ordinal column.
ORDINAL_CATEGORIES = [AGE_GROUP_ORDER, ENGLISH_GRADE_ORDER]

In [None]:
def create_preprocessor(predict_persistence):
    """
    Build an unfitted scikit-learn ColumnTransformer for one prediction task.

    Column groups come from the module-level BASE_* feature lists:

    * numerical: median imputation (with missing-value indicator columns),
      then standard scaling;
    * nominal categorical: most-frequent imputation, then one-hot encoding
      (categories unseen at fit time are ignored at transform time);
    * ordinal categorical: most-frequent imputation, ordinal encoding using
      ORDINAL_CATEGORIES (unknown values become NaN), then a second
      most-frequent imputation to fill those NaNs.

    Columns not listed in any group are dropped.

    'Second Term Gpa' is never used as an input feature: it is deliberately
    excluded for the persistence task, and it is the target (hence absent
    from X) for the GPA task.

    Args:
        predict_persistence (bool): If True, prepares features for persistence
                                    prediction. If False, prepares features
                                    for Second Term GPA prediction, which
                                    additionally uses 'First Year Persistence'
                                    as a nominal feature.

    Returns:
        ColumnTransformer: The unfitted preprocessing pipeline structure.
    """

    # --- Select column lists for the task ---
    # Copies are used so the module-level base lists are never aliased.
    num_features = [col for col in BASE_NUMERICAL_FEATURES if col != 'Second Term Gpa']
    cat_ord_features = list(BASE_CAT_ORDINAL_FEATURES)
    ord_categories = ORDINAL_CATEGORIES

    if predict_persistence:
        print("Defining preprocessor for Persistence prediction task.")
        cat_nom_features = list(BASE_CAT_NOMINAL_FEATURES)
    else:  # predicting Second Term Gpa
        print("Defining preprocessor for Second Term GPA prediction task.")
        # Whether 'First Year Persistence' is used as a (nominal) feature when
        # predicting GPA. Set to False to leave it out.
        include_persistence_as_feature = True
        if include_persistence_as_feature:
            # dict.fromkeys de-duplicates while keeping a stable order; a set
            # would reorder columns differently from one kernel run to the next.
            cat_nom_features = list(
                dict.fromkeys([*BASE_CAT_NOMINAL_FEATURES, 'First Year Persistence'])
            )
        else:
            cat_nom_features = list(BASE_CAT_NOMINAL_FEATURES)

    # --- Define preprocessing steps ---

    # Numerical features:
    # 1. 'imputer': fills NaN with the median learned during fit;
    #    add_indicator=True appends missing-value indicator columns.
    # 2. 'scaler': standardises the imputed data using statistics learned
    #    during fit.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
        ('scaler', StandardScaler())
    ])

    # Nominal categorical features: impute with the mode, then one-hot encode.
    categorical_transformer_nominal = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Ordinal categorical features: impute with the mode, encode using the
    # predefined level order, then re-impute NaNs that the encoder emits for
    # values outside that predefined order.
    categorical_transformer_ordinal = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(
            categories=ord_categories,
            handle_unknown='use_encoded_value',
            unknown_value=np.nan
        )),
        ('imputer_after_ordinal', SimpleImputer(strategy='most_frequent')),
    ])

    # --- Combine pipelines using ColumnTransformer ---
    print(f"Applying numerical transforms to: {num_features}")
    print(f"Applying nominal categorical transforms to: {cat_nom_features}")
    print(f"Applying ordinal categorical transforms to: {cat_ord_features}")

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat_nom', categorical_transformer_nominal, cat_nom_features),
            ('cat_ord', categorical_transformer_ordinal, cat_ord_features)
        ],
        remainder='drop'  # Drops columns not specified in transformers
    )
    return preprocessor