In [None]:
# Import required libraries
import pickle
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/sklearn;
# consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Set style for plots: apply the matplotlib style sheet first, then the
# seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation message so a reader can see the setup cell finished.
print("Libraries imported successfully!")


# Test Data Preprocessing

In [None]:
# REPLACE WITH TEST DATASET FILE PATHS
# Each raw table is read from CSV and stripped of fully duplicated rows in a
# single chained expression (the original row index is kept, not reset).
df_main = pd.read_csv("./raw_data/training_set_with_formatted_time.csv").drop_duplicates()
df_ws = pd.read_csv("./raw_data/workspace_summary_train.csv").drop_duplicates()
df_scores = pd.read_csv("./raw_data/student_scores_train.csv").drop_duplicates()

In [None]:
# Clean the interaction log and derive a per-student time step.

# Remove unnecessary columns (returns a new frame instead of mutating in place).
df_main = df_main.drop(
    columns=['CF..Anon.School.Id.', 'CF..Anon.Class.Id.', 'Time', 'formatted_time']
)

# Remove rows containing 'OK_AMBIGUOUS'. The explicit copy makes the column
# assignments below operate on an independent frame rather than a slice view.
df_main = df_main[df_main['Outcome'] != 'OK_AMBIGUOUS'].copy()

# Parse timestamps BEFORE sorting so rows are ordered chronologically rather
# than lexicographically by their raw string form. Unparseable values become
# NaT. (`infer_datetime_format` is deprecated in pandas 2.x and was dropped.)
df_main['datetime'] = pd.to_datetime(
    df_main['datetime'],
    errors='coerce'       # turns invalid parses into NaT
)

# Order each student's rows by time; NaT values sort last within a student.
df_main = df_main.sort_values(by=['Anon.Student.Id', 'datetime'])

# Generate time steps: 0-based position of each row within its student's
# timeline. Ties are broken by row order; rows with NaT get a NaN time step.
df_main['time_step'] = (
    df_main.groupby('Anon.Student.Id')['datetime'].rank(method='first') - 1
)

In [None]:
# REPLACE WITH DESIRED TEST OUTPUT FILE PATHS
# Snapshot of the cleaned tables taken before any workspace is filtered out.
# The output folder is created first because `to_csv` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path('preprocessed_data').mkdir(parents=True, exist_ok=True)
df_main.to_csv('preprocessed_data/df_main_allws.csv', index=False)
df_ws.to_csv('preprocessed_data/df_ws_allws.csv', index=False)

In [None]:
# Workspaces excluded from the analysis; rows referring to any of these ids
# are dropped from both the interaction log and the workspace summary.
workspace_ids_to_remove = [
    'worksheet_grapher_a1_lin_mod_mult_rep',
    'equation_line_2',
    'analyzing_models_2step_rationals',
    'multiple_representations_of_linear_functions',
    'worksheet_grapher_a1_slope_intercept_integer',
    'worksheet_grapher_a1_slope_intercept_decimal',
    'connecting_slope_intercept_and_point_slope_forms',
    'equation_line_1',
    'equation_line_3',
    'worksheet_grapher_a1_mod_initial_plus_point',
    'worksheet_grapher_a1_mod_two_points',
    'modeling_linear_equations_in_standard_form',
    'graph_setup_linear_equation-1',
    'graph_setup_linear_equation-2',
    'classifying_relations_and_functions',
    'introduction_to_functions',
    'graphs_of_functions',
    'graphs_of_functions-1',
    'compare_functions_diff_reps_linear_relationships',
]

# Boolean masks flag the rows that belong to an excluded workspace.
main_in_removed_ws = df_main['Level..Workspace.Id.'].isin(workspace_ids_to_remove)
ws_in_removed_ws = df_ws['workspace'].isin(workspace_ids_to_remove)

# Keep only the rows that are NOT flagged.
df_main = df_main.loc[~main_in_removed_ws]
df_ws = df_ws.loc[~ws_in_removed_ws]

In [None]:
# Keep the student id plus pre/post math scores, restricted to students whose
# PostMath score is present (rows with a missing PostMath are discarded).
math_score_columns = ["Anon.Student.Id", "PreMath", "PostMath"]
has_post_math = df_scores["PostMath"].notna()
df_cleaned_math = df_scores.loc[has_post_math, math_score_columns].copy()

In [None]:
# REPLACE WITH DESIRED FILE PATHS
# Persist the filtered tables that the loading cells read back in.
# The output folder is created first because `to_csv` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path("preprocessed_data").mkdir(parents=True, exist_ok=True)
df_main.to_csv("preprocessed_data/df_main.csv", index=False)
df_ws.to_csv("preprocessed_data/df_ws.csv", index=False)
df_scores.to_csv("preprocessed_data/df_scores.csv", index=False)
df_cleaned_math.to_csv("preprocessed_data/df_cleaned_math.csv", index=False)

# Loading Preprocessed Test Data

In [None]:
# Load the datasets written by the preprocessing cells.
print("Loading datasets...")

# Load math scores (target variable)
math_scores = pd.read_csv('preprocessed_data/df_cleaned_math.csv')
print(f"Math scores dataset shape: {math_scores.shape}")

# Load workspace behavioral data
workspace_data = pd.read_csv('preprocessed_data/df_ws.csv')
print(f"Workspace data shape: {workspace_data.shape}")

# Load main interaction data for help levels
main_data = pd.read_csv('preprocessed_data/df_main.csv')
print(f"Main interaction data shape: {main_data.shape}")

# A stray trailing character after this call previously made the whole cell
# a SyntaxError; it has been removed.
print("\nDatasets loaded successfully!")

In [None]:
# Feature Engineering: Calculate student-level aggregated features
print("Creating behavioral features...")


def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Element-wise ratio that maps undefined results to 0.

    A plain division yields NaN for 0/0 and +/-inf for a non-zero numerator
    over zero; `fillna(0)` alone only covers the NaN case. Infinite values are
    converted to NaN first so both cases end up as 0.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Group workspace data by student
student_features = workspace_data.groupby('Anon.Student.Id').agg({
    # Basic counts and totals
    'problems_completed': ['sum', 'mean'],
    'hint_count': ['sum', 'mean'],
    'error_count': ['sum', 'mean'],
    'skills_encountered': ['sum', 'mean'],
    'skills_mastered': ['sum', 'mean'],
    'workspace_total_time_seconds': ['sum', 'mean', 'std'],
    'workspace_progress_status': 'count'  # Non-null status entries per student
}).round(4)

# Flatten the (column, statistic) MultiIndex into names such as
# 'error_count_sum'.
student_features.columns = ['_'.join(col).strip() for col in student_features.columns]

# Calculate derived features
print("Calculating derived behavioral features...")

# Error rates
student_features['total_error_rate'] = _safe_ratio(
    student_features['error_count_sum'],
    student_features['error_count_sum'] + student_features['problems_completed_sum'],
)

student_features['avg_error_rate'] = _safe_ratio(
    student_features['error_count_sum'],
    student_features['problems_completed_sum'],
)

# Hint request rates
student_features['hint_request_rate'] = _safe_ratio(
    student_features['hint_count_sum'],
    student_features['problems_completed_sum'],
)

# Skills mastery rate
student_features['skills_mastery_rate'] = _safe_ratio(
    student_features['skills_mastered_sum'],
    student_features['skills_encountered_sum'],
)

# Workspace completion analysis: share of a student's workspace rows whose
# status is 'GRADUATED'. Every group has at least one row, so the denominator
# is never zero here.
workspace_completion = workspace_data.groupby('Anon.Student.Id').agg({
    'workspace_progress_status': lambda x: (x == 'GRADUATED').sum(),
    'Anon.Student.Id': 'count'
})
workspace_completion.columns = ['graduated_workspaces', 'total_workspaces']
workspace_completion['workspace_completion_rate'] = (
    workspace_completion['graduated_workspaces'] / workspace_completion['total_workspaces']
)

# Merge completion rates (both frames are indexed by Anon.Student.Id)
student_features = student_features.join(workspace_completion[['workspace_completion_rate']], how='left')

print(f"Student features shape: {student_features.shape}")
print(f"Features created: {len(student_features.columns)} variables")

In [None]:
# Per-student help-level statistics derived from the interaction log.
print("Calculating average help level...")

# Only interactions with a help level above 0 contribute to the statistics.
help_rows = main_data.loc[main_data['Help.Level'].gt(0)]

# Named aggregation yields the final column names directly: mean, standard
# deviation and number of qualifying rows per student.
help_level_data = help_rows.groupby('Anon.Student.Id')['Help.Level'].agg(
    avg_help_level='mean',
    help_level_std='std',
    help_requests_count='count',
)

# Left join on the student index: students without any qualifying row keep
# NaN in the three new columns.
student_features = student_features.join(help_level_data, how='left')

print(f"Help level features added. Shape: {student_features.shape}")

In [None]:
# Combine the score table with the engineered features, one row per student.
print("Creating final dataset...")

# Move Anon.Student.Id out of the index so it can serve as the merge key.
student_features_reset = student_features.reset_index()

# Inner merge: only students present in both tables are kept.
final_dataset = pd.merge(
    math_scores,
    student_features_reset,
    on='Anon.Student.Id',
    how='inner',
)

print(f"Final dataset shape: {final_dataset.shape}")
print(f"Students with complete data: {len(final_dataset)}")

# List every column except the identifier and the PostMath target.
print("\nAvailable features:")
non_feature_columns = {'Anon.Student.Id', 'PostMath'}
feature_cols = [
    name for name in final_dataset.columns if name not in non_feature_columns
]
for position, name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {name}")

In [None]:
# ==============================================================================
# LOAD TRAINED MODEL AND MAKE PREDICTIONS
# ==============================================================================
# Loads the pickled (model, feature_columns) pair, builds the feature matrix
# from `final_dataset`, predicts PostMath for every student and writes the
# predictions to CSV.

print("Loading trained math model...")

# Load the saved model and feature columns.
# SECURITY: unpickling executes arbitrary code embedded in the file. Only load
# 'math_model.pkl' when it comes from a trusted source.
with open('math_model.pkl', 'rb') as f:
    model, feature_columns = pickle.load(f)

print("✓ Model loaded successfully!")
print(f"✓ Model type: {type(model).__name__}")
print(f"✓ Number of features: {len(feature_columns)}")

# Prepare features for prediction
print("\nPreparing features for prediction...")

# Fail early, with a readable message, when the engineered dataset lacks a
# column the model expects.
missing_features = [col for col in feature_columns if col not in final_dataset.columns]
if missing_features:
    raise KeyError(
        f"final_dataset is missing feature columns required by the model: {missing_features}"
    )

# Nothing to predict if no student survived the merge with the score table.
if final_dataset.empty:
    raise ValueError("final_dataset is empty; there are no students to predict for.")

# Select only the features used during training, in the saved column order.
X_test = final_dataset[feature_columns].copy()

# Handle missing values (same as training): infinities are treated as missing
# and every missing cell is filled with 0.
print("Handling missing values...")
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)  # Fill with 0 for missing values

print(f"✓ Test features shape: {X_test.shape}")
# Total count of cells still missing after the fill (expected to be 0).
print(f"✓ Features with missing values: {X_test.isnull().sum().sum()}")

# Make predictions
print("\nMaking predictions...")
predictions = model.predict(X_test)

print(f"✓ Predictions made for {len(predictions)} students")
print(f"✓ Prediction range: {predictions.min():.2f} to {predictions.max():.2f}")

# Create results dataframe: one row per student, aligned positionally with
# the rows of final_dataset.
results_df = pd.DataFrame({
    'Anon.Student.Id': final_dataset['Anon.Student.Id'],
    'PostMath_Predicted': predictions
})

print(f"\nResults shape: {results_df.shape}")
print("\nFirst 10 predictions:")
print(results_df.head(10))

# Save predictions
output_file = 'math_predictions.csv'
results_df.to_csv(output_file, index=False)
print(f"\n✓ Predictions saved to '{output_file}'")