# Math Learning Outcome Prediction

This notebook predicts math learning outcomes (PostMath scores) using behavioral and learning process variables.

## Prediction Variables:
- **Pre-test score**: Baseline proficiency predictor
- **Error rate**: Total errors / (Total errors + Problems completed), a proxy for errors per attempt (reflects struggle)
- **Average error rate**: Errors made / Problems completed
- **Hint request rate**: Hints requested / Problems completed
- **Total hints requested**: Raw hint usage
- **Average help level**: Mean help level (excluding 0)
- **Skills mastery rate**: Skills mastered / Skills encountered
- **Workspace completion rate**: Graduated workspaces / Total workspaces
- **Average workspace duration**: Mean time spent per workspace

## Evaluation Metrics:
- R-squared (R²)
- Root Mean Square Error (RMSE)
- Mean Absolute Error (MAE)


In [12]:
# Import required libraries
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVR
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are
# hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set style for plots
# 'seaborn-v0_8' is a matplotlib style-sheet name; presumably it needs a
# reasonably recent matplotlib — TODO confirm the minimum supported version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")  # colour palette used by subsequent seaborn plots

# Simple confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")


Libraries imported successfully!


# Training Data Preprocessing

In [13]:
# Read each raw training table and strip exact duplicate rows in one step.
# The original row labels are kept (the index is not reset after de-duplication).
df_main = pd.read_csv("./raw_data/training_set_with_formatted_time.csv").drop_duplicates()
df_ws = pd.read_csv("./raw_data/workspace_summary_train.csv").drop_duplicates()
df_scores = pd.read_csv("./raw_data/student_scores_train.csv").drop_duplicates()

In [14]:
# Remove unnecessary columns
df_main = df_main.drop(
    columns=['CF..Anon.School.Id.', 'CF..Anon.Class.Id.', 'Time', 'formatted_time']
)

# Remove rows containing 'OK_AMBIGUOUS'.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of the unfiltered data (avoids chained-assignment issues).
df_main = df_main[df_main['Outcome'] != 'OK_AMBIGUOUS'].copy()

# Parse timestamps BEFORE sorting: sorting the raw strings would order rows
# lexicographically, which is not guaranteed to be chronological.
# errors='coerce' turns invalid parses into NaT. The deprecated
# `infer_datetime_format` flag is dropped; pandas infers the format itself.
df_main['datetime'] = pd.to_datetime(df_main['datetime'], errors='coerce')

# Chronological order within each student (NaT rows sort last).
df_main = df_main.sort_values(by=['Anon.Student.Id', 'datetime'])

# Generate time steps: 0-based position of each event within its student's
# timeline; ties are broken by row order. Rows with NaT get a NaN time_step.
df_main['time_step'] = df_main.groupby('Anon.Student.Id')['datetime'].rank(method='first') - 1

In [15]:
# Snapshot the cleaned tables before any workspace filtering is applied
# (the "_allws" suffix presumably stands for "all workspaces").
# Create the output folder first so a fresh checkout does not fail with a
# missing-directory error on the first write.
Path('preprocessed_data').mkdir(parents=True, exist_ok=True)

df_main.to_csv('preprocessed_data/df_main_allws.csv', index=False)
df_ws.to_csv('preprocessed_data/df_ws_allws.csv', index=False)

In [16]:
# Workspace identifiers excluded from both the interaction log and the
# workspace summary table.
workspace_ids_to_remove = [
    'worksheet_grapher_a1_lin_mod_mult_rep',
    'equation_line_2',
    'analyzing_models_2step_rationals',
    'multiple_representations_of_linear_functions',
    'worksheet_grapher_a1_slope_intercept_integer',
    'worksheet_grapher_a1_slope_intercept_decimal',
    'connecting_slope_intercept_and_point_slope_forms',
    'equation_line_1',
    'equation_line_3',
    'worksheet_grapher_a1_mod_initial_plus_point',
    'worksheet_grapher_a1_mod_two_points',
    'modeling_linear_equations_in_standard_form',
    'graph_setup_linear_equation-1',
    'graph_setup_linear_equation-2',
    'classifying_relations_and_functions',
    'introduction_to_functions',
    'graphs_of_functions',
    'graphs_of_functions-1',
    'compare_functions_diff_reps_linear_relationships'
]

# Boolean masks marking the rows whose workspace is NOT on the exclusion list.
# The two tables store the workspace id under different column names.
main_rows_to_keep = ~df_main['Level..Workspace.Id.'].isin(workspace_ids_to_remove)
ws_rows_to_keep = ~df_ws['workspace'].isin(workspace_ids_to_remove)

df_main = df_main[main_rows_to_keep]
df_ws = df_ws[ws_rows_to_keep]

In [17]:
# Keep only the student id and the two math scores, then discard students
# without a PostMath value (the prediction target). PreMath may still be missing.
math_score_columns = ["Anon.Student.Id", "PreMath", "PostMath"]
df_cleaned_math = (
    df_scores[math_score_columns]
    .copy()
    .dropna(subset=["PostMath"])
)

In [18]:
# Write every cleaned table to CSV in the preprocessed_data folder
# (row index omitted), in the same order as they were produced.
for cleaned_table, csv_path in (
    (df_main, "preprocessed_data/df_main.csv"),
    (df_ws, "preprocessed_data/df_ws.csv"),
    (df_scores, "preprocessed_data/df_scores.csv"),
    (df_cleaned_math, "preprocessed_data/df_cleaned_math.csv"),
):
    cleaned_table.to_csv(csv_path, index=False)

# Loading Preprocessed Datasets

In [19]:
# Reload the preprocessed tables from disk and report the shape of each one.
def _read_and_report(csv_path, label):
    """Read one CSV, print '<label> shape: (rows, cols)', and return the frame."""
    frame = pd.read_csv(csv_path)
    print(f"{label} shape: {frame.shape}")
    return frame


print("Loading datasets...")

# Target variable table (student id, PreMath, PostMath)
math_scores = _read_and_report('preprocessed_data/df_cleaned_math.csv', "Math scores dataset")

# Per-workspace behavioural summary
workspace_data = _read_and_report('preprocessed_data/df_ws.csv', "Workspace data")

# Interaction-level log, used later for the help-level features
main_data = _read_and_report('preprocessed_data/df_main.csv', "Main interaction data")

print("\nDatasets loaded successfully!")

Loading datasets...
Math scores dataset shape: (488, 3)
Workspace data shape: (5195, 17)
Main interaction data shape: (856606, 13)

Datasets loaded successfully!


In [20]:
# Feature Engineering: Calculate student-level aggregated features
print("Creating behavioral features...")

# Per-student aggregation plan: totals and per-workspace means of the activity
# counters, sum/mean/std of workspace time, and the count of non-null
# progress-status values (used as the number of workspaces).
per_student_aggregations = {
    'problems_completed': ['sum', 'mean'],
    'hint_count': ['sum', 'mean'],
    'error_count': ['sum', 'mean'],
    'skills_encountered': ['sum', 'mean'],
    'skills_mastered': ['sum', 'mean'],
    'workspace_total_time_seconds': ['sum', 'mean', 'std'],
    'workspace_progress_status': 'count'
}
student_features = (
    workspace_data
    .groupby('Anon.Student.Id')
    .agg(per_student_aggregations)
    .round(4)
)

# Collapse the (column, statistic) pairs into flat names such as 'hint_count_sum'
student_features.columns = [
    '_'.join(name_parts).strip() for name_parts in student_features.columns
]

print("Calculating derived behavioral features...")

errors_total = student_features['error_count_sum']
problems_total = student_features['problems_completed_sum']
hints_total = student_features['hint_count_sum']

# Ratios below map NaN (e.g. 0/0) to 0; a non-zero numerator over a zero
# denominator still yields +/-inf at this stage.

# Errors relative to errors + completed problems
student_features['total_error_rate'] = errors_total.div(errors_total + problems_total).fillna(0)

# Errors per completed problem
student_features['avg_error_rate'] = errors_total.div(problems_total).fillna(0)

# Hints per completed problem
student_features['hint_request_rate'] = hints_total.div(problems_total).fillna(0)

# Share of encountered skills that were mastered
student_features['skills_mastery_rate'] = (
    student_features['skills_mastered_sum']
    .div(student_features['skills_encountered_sum'])
    .fillna(0)
)

# Workspace completion: graduated workspaces over all workspace rows per student
workspace_completion = workspace_data.groupby('Anon.Student.Id').agg({
    'workspace_progress_status': lambda statuses: (statuses == 'GRADUATED').sum(),
    'Anon.Student.Id': 'count'
})
workspace_completion.columns = ['graduated_workspaces', 'total_workspaces']
workspace_completion['workspace_completion_rate'] = (
    workspace_completion['graduated_workspaces']
    .div(workspace_completion['total_workspaces'])
)

# Attach only the completion rate to the feature table (aligned on student id)
completion_rate_only = workspace_completion[['workspace_completion_rate']]
student_features = student_features.join(completion_rate_only, how='left')

print(f"Student features shape: {student_features.shape}")
print(f"Features created: {len(student_features.columns)} variables")


Creating behavioral features...
Calculating derived behavioral features...
Student features shape: (557, 19)
Features created: 19 variables


In [21]:
# Calculate average help level from main interaction data
print("Calculating average help level...")

# Only interactions with a help level above 0 contribute; for each student
# compute the mean, standard deviation and number of such interactions.
rows_with_help = main_data[main_data['Help.Level'] > 0]
help_level_data = (
    rows_with_help
    .groupby('Anon.Student.Id')['Help.Level']
    .agg(['mean', 'std', 'count'])
)
help_level_data.columns = ['avg_help_level', 'help_level_std', 'help_requests_count']

# Left join on the student-id index: students with no qualifying rows get NaN
student_features = student_features.join(help_level_data, how='left')

print(f"Help level features added. Shape: {student_features.shape}")

Calculating average help level...
Help level features added. Shape: (557, 22)


In [22]:
# Merge with math scores to create final dataset
print("Creating final dataset...")

# Move Anon.Student.Id out of the index so it can be used as the merge key
student_features_reset = student_features.reset_index()

# Inner join: keep only students present in both the score and feature tables
final_dataset = pd.merge(
    math_scores,
    student_features_reset,
    how='inner',
    on='Anon.Student.Id',
)

print(f"Final dataset shape: {final_dataset.shape}")
print(f"Students with complete data: {len(final_dataset)}")

# Numbered listing of every column except the identifier and the target
print("\nAvailable features:")
non_feature_columns = ('Anon.Student.Id', 'PostMath')
feature_cols = [
    column for column in final_dataset.columns if column not in non_feature_columns
]
for position, feature_name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {feature_name}")

Creating final dataset...
Final dataset shape: (488, 25)
Students with complete data: 488

Available features:
 1. PreMath
 2. problems_completed_sum
 3. problems_completed_mean
 4. hint_count_sum
 5. hint_count_mean
 6. error_count_sum
 7. error_count_mean
 8. skills_encountered_sum
 9. skills_encountered_mean
10. skills_mastered_sum
11. skills_mastered_mean
12. workspace_total_time_seconds_sum
13. workspace_total_time_seconds_mean
14. workspace_total_time_seconds_std
15. workspace_progress_status_count
16. total_error_rate
17. avg_error_rate
18. hint_request_rate
19. skills_mastery_rate
20. workspace_completion_rate
21. avg_help_level
22. help_level_std
23. help_requests_count


In [23]:
# Prepare features and target for modeling
print("Preparing data for modeling...")

# Define feature columns (exclude ID and target)
feature_columns = [col for col in final_dataset.columns
                  if col not in ['Anon.Student.Id', 'PostMath']]

# The rate features are ratios, so a zero denominator can yield +/-inf.
# Turn those into NaN FIRST: medians computed while inf is still present are
# distorted (or infinite), which is what the imputation would otherwise use.
X = final_dataset[feature_columns].replace([np.inf, -np.inf], np.nan)
y = final_dataset['PostMath'].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Number of features: {len(feature_columns)}")

# Check for any remaining missing values (imputation happens after the split)
if X.isnull().sum().sum() > 0:
    print("\nWarning: Missing values detected. Filling with median...")

# Split the data
print("\nSplitting data into train/test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None
)

# Impute with medians learned from the training rows only, so no statistic
# of the held-out rows leaks into the features the model is evaluated on.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)  # keep the full matrix free of NaN/inf as before

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training target mean: {y_train.mean():.4f}")
print(f"Test target mean: {y_test.mean():.4f}")


Preparing data for modeling...
Feature matrix shape: (488, 23)
Target vector shape: (488,)
Number of features: 23


Splitting data into train/test sets...
Training set: 390 samples
Test set: 98 samples
Training target mean: 0.6535
Test target mean: 0.6407


In [24]:
# Define evaluation function
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Score a fitted regressor on the training and test splits.

    Parameters
    ----------
    model : estimator exposing ``predict`` (it is also handed to
        ``cross_val_score``).
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.
    model_name : label stored under the ``'model'`` key of the results.

    Returns
    -------
    (results, y_test_pred) : ``results`` is a dict with the model name, R²,
        RMSE and MAE for both splits, and the mean / standard deviation of a
        5-fold cross-validated R² on the training split; ``y_test_pred`` holds
        the predictions for ``X_test``.
    """

    def _rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    # Predictions for both splits
    train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Point metrics on each split
    results = {
        'model': model_name,
        'train_r2': r2_score(y_train, train_pred),
        'test_r2': r2_score(y_test, y_test_pred),
        'train_rmse': _rmse(y_train, train_pred),
        'test_rmse': _rmse(y_test, y_test_pred),
        'train_mae': mean_absolute_error(y_train, train_pred),
        'test_mae': mean_absolute_error(y_test, y_test_pred),
    }

    # 5-fold cross-validated R² on the training split
    fold_r2 = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    results['cv_r2_mean'] = fold_r2.mean()
    results['cv_r2_std'] = fold_r2.std()

    return results, y_test_pred

print("Evaluation function defined.")


Evaluation function defined.


In [25]:
# Candidate regressors keyed by display name (a single Extra Trees model here)
extra_trees_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
models = {'Extra Trees': ExtraTreesRegressor(**extra_trees_params)}

# Per-model metric dicts and held-out predictions collected during the loop
results_list = []
model_predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, then score on both splits
    model.fit(X_train, y_train)
    results, predictions = evaluate_model(model, X_train, y_train, X_test, y_test, name)

    results_list.append(results)
    model_predictions[name] = predictions

    print(f"Test R²: {results['test_r2']:.4f}, Test RMSE: {results['test_rmse']:.4f}, Test MAE: {results['test_mae']:.4f}")

print("\nAll models trained successfully!")



Training Extra Trees...
Test R²: 0.8588, Test RMSE: 0.1068, Test MAE: 0.0613

All models trained successfully!


In [26]:
# ==============================================================================
# SAVE THE TRAINED MODEL
# ==============================================================================

print("Saving math model...")

# Look the estimator up by name instead of relying on whatever the training
# loop's `model` variable happened to hold last; that would silently save the
# wrong estimator as soon as a second model is added to `models`.
trained_model = models['Extra Trees']

# Save the model with feature columns, so the consumer knows which columns
# (and in which order) the estimator expects.
with open('math_model.pkl', 'wb') as f:
    pickle.dump((trained_model, feature_columns), f)

print("Math model saved as 'math_model.pkl'")
# Report the actual class of the saved object rather than a hard-coded label.
print(f"Model type: {type(trained_model).__name__}")
print(f"Features: {len(feature_columns)}")

Saving math model...
Math model saved as 'math_model.pkl'
Model type: ExtraTreesRegressor
Features: 23


# Run Model on Test Dataset

In [None]:
# Placeholder locations of the cleaned test files — replace before running.
test_main_path = 'path/to/cleaned/test_df_main.csv'        # interaction-level (time series) data
test_ws_path = 'path/to/cleaned/test_ws_dataset.csv'       # workspace summary data
test_math_path = 'path/to/cleaned/test_cleaned_math.csv'   # student math scores

test_main = pd.read_csv(test_main_path)
test_ws = pd.read_csv(test_ws_path)
test_cleaned_math = pd.read_csv(test_math_path)

In [None]:
# Feature Engineering: Calculate student-level aggregated features
# (same recipe as for the training data, applied to the test workspace table)
print("Creating behavioral features...")

# Group workspace data by student
test_student_features = test_ws.groupby('Anon.Student.Id').agg({
    # Basic counts and totals
    'problems_completed': ['sum', 'mean'],
    'hint_count': ['sum', 'mean'],
    'error_count': ['sum', 'mean'],
    'skills_encountered': ['sum', 'mean'],
    'skills_mastered': ['sum', 'mean'],
    'workspace_total_time_seconds': ['sum', 'mean', 'std'],
    'workspace_progress_status': 'count'  # Total workspaces
}).round(4)

# Flatten column names.
# The names must come from the TEST frame's own (column, statistic) pairs.
# Iterating over the training `student_features.columns` (already flat strings,
# with extra derived columns) gives the wrong number of labels and joins the
# characters of each name with underscores.
test_student_features.columns = ['_'.join(col).strip() for col in test_student_features.columns]

# Calculate derived features
print("Calculating derived behavioral features...")

# Error rates (NaN from 0/0 becomes 0; +/-inf is handled at prediction time)
test_student_features['total_error_rate'] = (
    test_student_features['error_count_sum'] /
    (test_student_features['error_count_sum'] + test_student_features['problems_completed_sum'])
).fillna(0)

test_student_features['avg_error_rate'] = (
    test_student_features['error_count_sum'] / test_student_features['problems_completed_sum']
).fillna(0)

# Hint request rates
test_student_features['hint_request_rate'] = (
    test_student_features['hint_count_sum'] / test_student_features['problems_completed_sum']
).fillna(0)

# Skills mastery rate
test_student_features['skills_mastery_rate'] = (
    test_student_features['skills_mastered_sum'] / test_student_features['skills_encountered_sum']
).fillna(0)

# Workspace completion analysis: graduated workspaces over all workspace rows
workspace_completion = test_ws.groupby('Anon.Student.Id').agg({
    'workspace_progress_status': lambda x: (x == 'GRADUATED').sum(),
    'Anon.Student.Id': 'count'
})
workspace_completion.columns = ['graduated_workspaces', 'total_workspaces']
workspace_completion['workspace_completion_rate'] = (
    workspace_completion['graduated_workspaces'] / workspace_completion['total_workspaces']
)

# Merge completion rates
test_student_features = test_student_features.join(workspace_completion[['workspace_completion_rate']], how='left')

print(f"Student features shape: {test_student_features.shape}")
print(f"Features created: {len(test_student_features.columns)} variables")


In [None]:
# Calculate average help level from main interaction data
print("Calculating average help level...")

# Restrict to test interactions with a help level above 0, then summarise the
# help level per student (mean, standard deviation, number of such rows).
test_rows_with_help = test_main[test_main['Help.Level'] > 0]
help_level_data = (
    test_rows_with_help
    .groupby('Anon.Student.Id')['Help.Level']
    .agg(['mean', 'std', 'count'])
)
help_level_data.columns = ['avg_help_level', 'help_level_std', 'help_requests_count']

# Left join on the student-id index: students with no qualifying rows get NaN
test_student_features = test_student_features.join(help_level_data, how='left')

print(f"Help level features added. Shape: {test_student_features.shape}")

In [None]:
# Merge with test math scores to create final test dataset
print("Creating final test dataset...")
test_student_features_reset = test_student_features.reset_index()
# Inner join: students missing from either table receive no prediction.
final_test_dataset = test_cleaned_math.merge(test_student_features_reset, on='Anon.Student.Id', how='inner')
print(f"Final test dataset shape: {final_test_dataset.shape}")

# Prepare features for prediction
print("Preparing test data for prediction...")
# Reuse the `feature_columns` list defined when the model was trained instead
# of re-deriving it from the test file: the model expects exactly the training
# columns in the training order, and the test file may carry extra columns or
# lack the PostMath target.
missing_features = [col for col in feature_columns if col not in final_test_dataset.columns]
if missing_features:
    raise KeyError(f"Test dataset is missing features the model was trained on: {missing_features}")

# Replace +/-inf with NaN first so both kinds of gaps are imputed together.
X_test_new = final_test_dataset[feature_columns].replace([np.inf, -np.inf], np.nan)

# Impute with the TRAINING medians: fill values must come from the data the
# model was fitted on, not from the statistics of the set being predicted.
if X_test_new.isnull().sum().sum() > 0:
    print("Warning: Missing values detected in test data. Filling with median...")
    X_test_new = X_test_new.fillna(X_train.median())

print(f"Test feature matrix shape: {X_test_new.shape}")

# Make predictions using the trained Extra Trees model
print("Making predictions on the test dataset...")
extra_trees_model = models['Extra Trees']
test_predictions = extra_trees_model.predict(X_test_new)

# Save predictions to a CSV file (one row per student in the merged test set)
predictions_df = pd.DataFrame({'Anon.Student.Id': final_test_dataset['Anon.Student.Id'], 'PredictedPostMath': test_predictions})
predictions_df.to_csv('test_predictions.csv', index=False)
print("Predictions saved to test_predictions.csv")