In [None]:
import pickle
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning imports - CLASSIFICATION ONLY
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# Configuration: reset matplotlib to its default style, then set figure size and font size.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Banner describing what this notebook does.
banner_rule = "=" * 60
print("METACOGNITION PREDICTION MODEL")
print(banner_rule)
print("Predicting Post3Meta scores using behavioral features")
print("Target: Post3Meta (Metacognition Assessment)")
print("Features: Learning behavior patterns and help-seeking")
print(banner_rule)

# Test Data Preprocessing

In [None]:
# REPLACE WITH TEST DATASET FILE PATHS
# Read the three raw inputs and discard exact duplicate rows as each one is loaded.
df_main = pd.read_csv("./raw_data/training_set_with_formatted_time.csv").drop_duplicates()
df_ws = pd.read_csv("./raw_data/workspace_summary_train.csv").drop_duplicates()
df_scores = pd.read_csv("./raw_data/student_scores_train.csv").drop_duplicates()

In [None]:
# Remove unnecessary columns
df_main = df_main.drop(columns=['CF..Anon.School.Id.', 'CF..Anon.Class.Id.', 'Time', 'formatted_time'])

# Remove rows containing 'OK_AMBIGUOUS'. The explicit .copy() makes the result an
# independent frame, so the column assignments below never write into a slice.
df_main = df_main[df_main['Outcome'] != 'OK_AMBIGUOUS'].copy()

# Parse the timestamps BEFORE sorting: sorting the raw strings orders rows
# lexicographically, which only matches chronological order for some formats.
# Unparseable values become NaT (errors='coerce'). The deprecated
# `infer_datetime_format` flag is dropped.
df_main['datetime'] = pd.to_datetime(df_main['datetime'], errors='coerce')

# Chronological order within each student.
df_main = df_main.sort_values(by=['Anon.Student.Id', 'datetime'])

# Generate time steps: 0-based position of each interaction within its student's
# timeline. method='first' breaks timestamp ties by row order; rows whose
# timestamp is NaT get a NaN time_step.
df_main['time_step'] = df_main.groupby('Anon.Student.Id')['datetime'].rank(method='first') - 1

In [None]:
# REPLACE WITH TEST DESIRED FILE PATHS
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout would otherwise fail here).
Path('preprocessed_data').mkdir(parents=True, exist_ok=True)

# Snapshot of the interaction and workspace data covering ALL workspaces.
df_main.to_csv('preprocessed_data/df_main_allws.csv', index=False)
df_ws.to_csv('preprocessed_data/df_ws_allws.csv', index=False)

In [None]:
# Workspace ids that are excluded from both the interaction log and the workspace summary.
# NOTE(review): the data-loading cell further down reads the '*_allws' CSVs that were
# written before this filter runs, so this exclusion only affects df_main.csv / df_ws.csv
# -- confirm that is intended.
workspace_ids_to_remove = [
    'worksheet_grapher_a1_lin_mod_mult_rep',
    'equation_line_2',
    'analyzing_models_2step_rationals',
    'multiple_representations_of_linear_functions',
    'worksheet_grapher_a1_slope_intercept_integer',
    'worksheet_grapher_a1_slope_intercept_decimal',
    'connecting_slope_intercept_and_point_slope_forms',
    'equation_line_1',
    'equation_line_3',
    'worksheet_grapher_a1_mod_initial_plus_point',
    'worksheet_grapher_a1_mod_two_points',
    'modeling_linear_equations_in_standard_form',
    'graph_setup_linear_equation-1',
    'graph_setup_linear_equation-2',
    'classifying_relations_and_functions',
    'introduction_to_functions',
    'graphs_of_functions',
    'graphs_of_functions-1',
    'compare_functions_diff_reps_linear_relationships'
]

# Boolean masks marking the rows that belong to an excluded workspace.
main_in_removed = df_main['Level..Workspace.Id.'].isin(workspace_ids_to_remove)
ws_in_removed = df_ws['workspace'].isin(workspace_ids_to_remove)

# Keep only the rows outside the excluded workspaces.
df_main = df_main.loc[~main_in_removed]
df_ws = df_ws.loc[~ws_in_removed]

In [None]:
# REPLACE WITH TEST DESIRED FILE PATHS AND NAMES
# Metacognition score columns that get median-imputed.
meta_score_cols = ["PreMeta", "Post1Meta", "Post2Meta"]

imputer = SimpleImputer(strategy='median')

# Work on a copy holding only the student id and the three score columns,
# then overwrite the scores with their median-imputed values.
df_cleaned_meta = df_scores[["Anon.Student.Id"] + meta_score_cols].copy()
imputed_scores = imputer.fit_transform(df_cleaned_meta[meta_score_cols])
df_cleaned_meta[meta_score_cols] = imputed_scores

# Create average of Post1Meta and Post2Meta (row-wise mean of the imputed values)
df_cleaned_meta['Post1Post2_Avg'] = df_cleaned_meta[['Post1Meta', 'Post2Meta']].mean(axis=1)

In [None]:
# Write all processed frames as CSV into the preprocessed_data folder.
# DataFrame.to_csv does not create missing directories, so make sure the folder exists first.
Path('preprocessed_data').mkdir(parents=True, exist_ok=True)

df_main.to_csv("preprocessed_data/df_main.csv", index=False)
df_ws.to_csv("preprocessed_data/df_ws.csv", index=False)
df_scores.to_csv("preprocessed_data/df_scores.csv", index=False)
df_cleaned_meta.to_csv("preprocessed_data/df_cleaned_meta.csv", index=False)

# Loading Preprocessed Test Data

In [None]:
# ==============================================================================
# DATA LOADING AND VALIDATION
# ==============================================================================

print("\nLOADING AND VALIDATING DATA")
print("=" * 50)

# Read back the CSVs written by the preprocessing cells.
df_main = pd.read_csv('preprocessed_data/df_main_allws.csv')
df_ws = pd.read_csv('preprocessed_data/df_ws_allws.csv')
df_cleaned_meta = pd.read_csv('preprocessed_data/df_cleaned_meta.csv')

print(f"✓ Main interaction data: {len(df_main):,} records")
print(f"✓ Workspace data: {len(df_ws):,} workspace sessions")
print(f"✓ Metacognition assessments: {len(df_cleaned_meta):,} students")

print("\nVALIDATING METACOGNITION DATA:")
print("-" * 40)
print("Meta columns:", df_cleaned_meta.columns.tolist())

# Report on the Post1Post2_Avg column, or list the candidate columns when it is absent.
if 'Post1Post2_Avg' in df_cleaned_meta.columns:
    post_avg = df_cleaned_meta['Post1Post2_Avg']
    print(f"✓ Post1Post2_Avg found: {post_avg.notna().sum()} non-null values")
    print(f"  Range: {post_avg.min():.2f} to {post_avg.max():.2f}")
    print(f"  Post1Meta mean: {df_cleaned_meta['Post1Meta'].mean():.2f}")
    print(f"  Post2Meta mean: {df_cleaned_meta['Post2Meta'].mean():.2f}")
else:
    print("ERROR: Post1Post2_Avg column not found!")
    meta_like_cols = [col for col in df_cleaned_meta.columns if 'Meta' in col or 'Avg' in col]
    print("Available Meta columns:", meta_like_cols)

In [None]:
# ==============================================================================
# TARGET VARIABLE CREATION
# ==============================================================================

print("\nCREATING TARGET VARIABLE")
print("=" * 50)

# Keep only students that have both a PreMeta score and a Post1Post2_Avg score.
metacognition_data = df_cleaned_meta.dropna(subset=['PreMeta', 'Post1Post2_Avg']).copy()
print(f"Students with Post1Post2_Avg scores: {len(metacognition_data)}")

# Continuous target: a copy of Post1Post2_Avg (the categorical version is derived from it below).
metacognition_data['metacognition_score'] = metacognition_data['Post1Post2_Avg']

# Create categorical version (Low, Medium, High)
def categorize_metacognition(score):
    """Bucket a metacognition score into 'Low', 'Medium' or 'High'.

    Scores up to and including 4.0 are 'Low', scores up to and including 5.5
    are 'Medium', and everything else falls through to 'High'.
    """
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((4.0, 'Low'), (5.5, 'Medium')):
        if score <= upper_bound:
            return label
    return 'High'

# Derive the categorical target from the continuous score.
metacognition_data['metacognition_category'] = metacognition_data['metacognition_score'].apply(categorize_metacognition)

# Show distribution
score_series = metacognition_data['metacognition_score']
print("\nMetacognition score statistics:")
print(f"  Mean: {score_series.mean():.2f}")
print(f"  Std: {score_series.std():.2f}")
print(f"  Range: {score_series.min():.2f} - {score_series.max():.2f}")

# Count and share of students per category.
category_counts = metacognition_data['metacognition_category'].value_counts()
n_students = len(metacognition_data)
print("\nMetacognition category distribution:")
for category, count in category_counts.items():
    percentage = (count / n_students) * 100
    print(f"  • {category}: {count} students ({percentage:.1f}%)")

print("\nTarget variable created successfully!")

In [None]:
# ==============================================================================
# FEATURE ENGINEERING - METACOGNITION-FOCUSED FEATURES
# ==============================================================================

print("\nENGINEERING METACOGNITION-FOCUSED FEATURES")
print("=" * 50)

# Students that have a target, and the interaction rows that belong to them.
target_students = metacognition_data['Anon.Student.Id'].unique()
is_target_student = df_main['Anon.Student.Id'].isin(target_students)
df_filtered = df_main.loc[is_target_student].copy()

print(f"Filtering data for {len(target_students)} students with metacognition scores")
print(f"Behavioral data: {len(df_filtered):,} interactions")

# One feature dict per student is collected here.
features_list = []

# Build one feature dict per target student from that student's interaction rows
# and append it to features_list. Students with no interaction rows are skipped.
#
# NOTE(review): 'time_step' is produced in preprocessing as a 0-based per-student
# rank of the interaction timestamp, so every "time" feature below is measured in
# interaction steps rather than seconds -- confirm this is intended, in particular
# for the "/ 60" normalisation further down.
# NOTE(review): if 'Problem.Name' contains NaN, unique() yields it but the ==
# filter matches no rows, so such a "problem" contributes accuracy 0 and 0 hints
# in section 3 -- confirm Problem.Name is never missing.
for student_id in target_students:
    student_data = df_filtered[df_filtered['Anon.Student.Id'] == student_id].copy()

    # No interactions for this student: no feature row is produced.
    if len(student_data) == 0:
        continue

    features = {'Anon.Student.Id': student_id}

    # 1. Average Help Level (exclude 0 if 0 = no hint)
    # Mean of the strictly positive Help.Level values; 0 when there are none
    # or when the column is absent.
    if 'Help.Level' in student_data.columns:
        help_levels = student_data['Help.Level'].dropna()
        help_levels_nonzero = help_levels[help_levels > 0]  # Exclude 0 if 0 = no hint
        features['avg_help_level'] = help_levels_nonzero.mean() if len(help_levels_nonzero) > 0 else 0
    else:
        features['avg_help_level'] = 0

    # 2. Hint Usage Pattern (% of problems with Hint Requested followed by OK/WRONG)
    # Fraction of distinct problems in which a row whose Action contains "hint" is
    # immediately followed by a row whose Outcome is OK/CORRECT/WRONG/ERROR.
    hint_pattern_count = 0
    total_problems = student_data['Problem.Name'].nunique()

    for problem in student_data['Problem.Name'].unique():
        problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')

        # Look for hint request followed by attempt
        for i in range(len(problem_data) - 1):
            current_action = str(problem_data.iloc[i]['Action']).lower()
            next_outcome = str(problem_data.iloc[i + 1]['Outcome']).upper()

            if 'hint' in current_action and next_outcome in ['OK', 'CORRECT', 'WRONG', 'ERROR']:
                hint_pattern_count += 1
                break  # count each problem at most once

    features['hint_usage_pattern'] = hint_pattern_count / total_problems if total_problems > 0 else 0

    # 3. Optimal Help Seeking Behavior (time spent + accuracy + hint usage)
    # Per-problem span of time_step, share of OK/CORRECT rows, and hint-row count.
    problem_times = []
    problem_accuracy = []
    problem_hint_usage = []

    for problem in student_data['Problem.Name'].unique():
        problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')

        # Span is only recorded for problems with more than one row.
        if len(problem_data) > 1:
            time_spent = problem_data['time_step'].max() - problem_data['time_step'].min()
            problem_times.append(time_spent)

        # Accuracy for this problem
        correct_attempts = (problem_data['Outcome'].isin(['OK', 'CORRECT'])).sum()
        total_attempts = len(problem_data)
        accuracy = correct_attempts / total_attempts if total_attempts > 0 else 0
        problem_accuracy.append(accuracy)

        # Hint usage for this problem
        hint_requests = problem_data['Action'].str.contains('hint', case=False, na=False).sum()
        problem_hint_usage.append(hint_requests)

    features['avg_time_per_problem'] = np.mean(problem_times) if problem_times else 0
    features['avg_problem_accuracy'] = np.mean(problem_accuracy) if problem_accuracy else 0
    features['avg_hints_per_problem'] = np.mean(problem_hint_usage) if problem_hint_usage else 0

    # Combined optimal help seeking score
    # Higher time + higher accuracy + moderate hint usage = better metacognition
    # Each component is capped to [0, 1] (accuracy already is) and the three are averaged.
    time_score = min(features['avg_time_per_problem'] / 60, 1)  # saturates at 60 time_step units
    accuracy_val = features['avg_problem_accuracy']
    hint_score = 1 - min(features['avg_hints_per_problem'] / 3, 1)  # Penalize excessive hints (0 at >= 3 per problem)

    features['optimal_help_seeking'] = (time_score + accuracy_val + hint_score) / 3

    # 4. Length of time spent on Concept Building
    # Approximate concept building as time spent on problems with multiple attempts
    concept_building_time = 0
    for problem in student_data['Problem.Name'].unique():
        problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')

        if len(problem_data) > 2:  # Multiple attempts suggest concept building (3+ rows)
            time_spent = problem_data['time_step'].max() - problem_data['time_step'].min()
            concept_building_time += time_spent

    features['concept_building_time'] = concept_building_time

    # Additional metacognition-related features
    # 5. Self-regulation indicators
    features['total_problems_attempted'] = total_problems
    features['avg_attempts_per_problem'] = len(student_data) / total_problems if total_problems > 0 else 0
    features['error_recovery_rate'] = 0  # used as a counter first, converted to a rate below

    # Error recovery: correct answer after error
    # A problem counts once if an ERROR/WRONG row is immediately followed by an OK/CORRECT row.
    for problem in student_data['Problem.Name'].unique():
        problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')

        for i in range(len(problem_data) - 1):
            if problem_data.iloc[i]['Outcome'] in ['ERROR', 'WRONG']:
                if problem_data.iloc[i + 1]['Outcome'] in ['OK', 'CORRECT']:
                    features['error_recovery_rate'] += 1
                    break

    # Convert the count of recovered problems into a fraction of distinct problems.
    features['error_recovery_rate'] = features['error_recovery_rate'] / total_problems if total_problems > 0 else 0

    # 6. Persistence indicators
    features['session_persistence'] = student_data['Session.Id'].nunique()  # Number of different sessions
    features['total_interaction_time'] = student_data['time_step'].max() - student_data['time_step'].min() if len(student_data) > 1 else 0

    # 7. Strategic help-seeking (help level progression)
    # Per problem: share of non-null Help.Level values that are higher than the
    # previous one; averaged over problems with at least two such values.
    if 'Help.Level' in student_data.columns:
        help_progression = []
        for problem in student_data['Problem.Name'].unique():
            problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')
            help_levels = problem_data['Help.Level'].dropna()

            if len(help_levels) > 1:
                # Check if help levels increase (strategic escalation)
                increasing = (help_levels.diff() > 0).sum()
                help_progression.append(increasing / len(help_levels))

        features['strategic_help_progression'] = np.mean(help_progression) if help_progression else 0
    else:
        features['strategic_help_progression'] = 0

    features_list.append(features)

# Assemble the per-student feature dicts into one frame (one row per student).
features_df = pd.DataFrame(features_list)
print(f"\nMetacognition features engineered for {features_df.shape[0]} students")
print(f"Total features: {features_df.shape[1] - 1}")  # id column is not a feature

print("\nMETACOGNITION-FOCUSED FEATURES CREATED:")
print("-" * 50)
metacog_features = [
    'avg_help_level', 'hint_usage_pattern', 'optimal_help_seeking',
    'concept_building_time', 'avg_time_per_problem', 'avg_problem_accuracy',
    'error_recovery_rate', 'strategic_help_progression'
]

# Print the mean of each headline feature that is actually present in the frame.
present_features = [name for name in metacog_features if name in features_df.columns]
for feat in present_features:
    mean_val = features_df[feat].mean()
    print(f"  ✓ {feat}: mean = {mean_val:.3f}")

# Display feature statistics
print("\nFeature summary:")
print(features_df.describe().round(3))

In [None]:
# ==============================================================================
# DATA MERGING AND PREPARATION
# ==============================================================================

print("\nMERGING FEATURES WITH TARGET VARIABLE")
print("=" * 50)

# Merge features with metacognition scores (including Post1Post2_Avg as a feature).
# Inner join: only students present in both frames are kept.
df = features_df.merge(
    metacognition_data[['Anon.Student.Id', 'metacognition_score', 'metacognition_category', 'Post1Post2_Avg']],
    on='Anon.Student.Id',
    how='inner',
)

print(f"Final dataset: {len(df)} students")
print(f"Features: {len(df.columns) - 4}")  # Excluding ID, target, and Post1Post2_Avg columns

# Report missing values
print(f"\nMissing values per column:")
missing_counts = df.isnull().sum()
for col, count in missing_counts.items():
    if count > 0:
        print(f"  {col}: {count}")

# Fill missing values with the median for numerical feature columns.
# The student id is excluded as well as the target columns: if the id happens to
# be numeric it must not be imputed or cast to float.
non_feature_cols = {'Anon.Student.Id', 'metacognition_score', 'Post1Post2_Avg'}
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]

# SimpleImputer discards columns that contain no observed value at fit time, which
# would make the assignment below fail on a shape mismatch. Such columns are left
# untouched here (they stay NaN) and only columns with at least one value are imputed.
imputable_cols = [col for col in numerical_cols if df[col].notna().any()]

imputer = SimpleImputer(strategy='median')
if imputable_cols:
    df[imputable_cols] = imputer.fit_transform(df[imputable_cols])

print(f"\nData preparation complete!")
print(f"Shape: {df.shape}")

In [None]:
# ==============================================================================
# QWK METRIC DEFINITION
# ==============================================================================

def qwk(y_true, y_pred):
    """Return Cohen's kappa between y_true and y_pred with quadratic weights (QWK)."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def qwk_scorer(estimator, X, y):
    """Scorer with the (estimator, X, y) signature: QWK of the estimator's predictions on X against y."""
    predicted = estimator.predict(X)
    return qwk(y, predicted)

print("QWK metric defined for ordinal classification evaluation")

In [None]:
# ==============================================================================
# LOAD TRAINED MODEL AND MAKE PREDICTIONS
# ==============================================================================
print("Loading trained meta model...")

# Load the saved model, label encoder, and feature columns.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load 'meta_model.pkl' when it comes from a trusted source.
with open('meta_model.pkl', 'rb') as f:
    model, le, feature_cols = pickle.load(f)

print(f"✓ Model loaded successfully!")
print(f"✓ Model type: {type(model).__name__}")
print(f"✓ Number of features: {len(feature_cols)}")
print(f"✓ Label encoder loaded: {type(le).__name__}")

# Prepare features for prediction
print("\nPreparing features for prediction...")

# Fail early, naming the columns, if the engineered data lacks a feature the model expects.
missing_features = [col for col in feature_cols if col not in df.columns]
if missing_features:
    raise KeyError(f"Test data is missing features the model was trained on: {missing_features}")

# Select only the features used during training, in the training column order.
X_test = df[feature_cols].copy()

# Handle missing values: infinities are treated as missing, then everything missing becomes 0.
print("Handling missing values...")
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)  # Fill with 0 for missing values

print(f"✓ Test features shape: {X_test.shape}")
print(f"✓ Features with missing values: {X_test.isnull().sum().sum()}")

# Make predictions
print("\nMaking predictions...")
predictions = model.predict(X_test)

# Convert predictions back to original labels using the label encoder (when it supports it).
if hasattr(le, 'inverse_transform'):
    predictions_original = le.inverse_transform(predictions)
else:
    predictions_original = predictions
predictions_original = np.asarray(predictions_original)

print(f"✓ Predictions made for {len(predictions)} students")
# The decoded labels may be numeric scores or string categories. A numeric range
# (and the ':.2f' format) only makes sense for the former, so string labels get a
# per-class count instead of raising a formatting error.
if np.issubdtype(predictions_original.dtype, np.number):
    print(f"✓ Prediction range: {predictions_original.min():.2f} to {predictions_original.max():.2f}")
else:
    predicted_labels, label_counts = np.unique(predictions_original, return_counts=True)
    print(f"✓ Predicted classes: {dict(zip(predicted_labels.tolist(), label_counts.tolist()))}")

# Create results dataframe (one row per student, aligned positionally with df)
results_df = pd.DataFrame({
    'Anon.Student.Id': df['Anon.Student.Id'],
    'Post3Meta_Predicted': predictions_original
})

print(f"\nResults shape: {results_df.shape}")
print("\nFirst 10 predictions:")
print(results_df.head(10))

# Save predictions
output_file = 'meta_predictions.csv'
results_df.to_csv(output_file, index=False)
print(f"\n✓ Predictions saved to '{output_file}'")