In [None]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Machine Learning imports - CLASSIFICATION ONLY
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

# Suppress every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Plot configuration: default style, 12x8 figures, 10pt font.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Banner describing what this notebook predicts and from which inputs.
banner_rule = "=" * 60
for banner_line in (
    "MOTIVATION PREDICTION MODEL",
    banner_rule,
    "Predicting Motivation scores using behavioral features",
    "Target: Motivation (Average of PostSE and PostMAP)",
    "Features: MAP-specific and SE-specific behavioral patterns",
    banner_rule,
):
    print(banner_line)

# Test Data Preprocessing

In [None]:
# REPLACE WITH TEST DATASET FILE PATHS
# Each raw table is read from CSV and exact duplicate rows are dropped
# immediately, so every later cell works on de-duplicated frames.
df_main = pd.read_csv("./raw_data/training_set_with_formatted_time.csv").drop_duplicates()
df_ws = pd.read_csv("./raw_data/workspace_summary_train.csv").drop_duplicates()
df_scores = pd.read_csv("./raw_data/student_scores_train.csv").drop_duplicates()

In [None]:
# Clean the interaction log and derive a per-student step index.

# Remove columns that are not used by any later cell.
df_main = df_main.drop(
    columns=['CF..Anon.School.Id.', 'CF..Anon.Class.Id.', 'Time', 'formatted_time']
)

# Remove rows containing 'OK_AMBIGUOUS'. The explicit copy makes the result an
# independent frame, so the column assignments below do not operate on a view
# of the unfiltered data (a chained-assignment warning the global warnings
# filter would otherwise hide).
df_main = df_main[df_main['Outcome'] != 'OK_AMBIGUOUS'].copy()

# Parse timestamps BEFORE sorting: sorting the raw strings orders rows
# lexicographically, which is only chronological for ISO-style timestamps.
# `infer_datetime_format` is intentionally not passed; it is deprecated in
# pandas 2.x, where format inference is the default.
df_main['datetime'] = pd.to_datetime(
    df_main['datetime'],
    errors='coerce'       # turns invalid parses into NaT
)

# Chronological order within each student; NaT timestamps sort last.
# NOTE(review): if the training pipeline sorted on the unparsed strings and
# the timestamps are not ISO-formatted, row order may differ from training --
# confirm before comparing features across runs.
df_main = df_main.sort_values(by=['Anon.Student.Id', 'datetime'])

# Generate time steps: 0-based rank of each row's timestamp within its student
# (ties broken by row order; rows with NaT get NaN).
df_main['time_step'] = df_main.groupby('Anon.Student.Id')['datetime'].rank(method='first') - 1

In [None]:
# REPLACE WITH THE DESIRED OUTPUT FILE PATHS FOR THE TEST DATA
# Snapshot of the interaction and workspace tables taken BEFORE the
# workspace filter in the next cell (hence the "_allws" suffix).
allws_outputs = (
    (df_main, 'preprocessed_data/df_main_allws.csv'),
    (df_ws, 'preprocessed_data/df_ws_allws.csv'),
)

for frame, out_path in allws_outputs:
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before writing.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    frame.to_csv(out_path, index=False)

In [None]:
# Workspace identifiers to exclude from both the interaction log and the
# workspace summary table.
workspace_ids_to_remove = [
    'worksheet_grapher_a1_lin_mod_mult_rep',
    'equation_line_2',
    'analyzing_models_2step_rationals',
    'multiple_representations_of_linear_functions',
    'worksheet_grapher_a1_slope_intercept_integer',
    'worksheet_grapher_a1_slope_intercept_decimal',
    'connecting_slope_intercept_and_point_slope_forms',
    'equation_line_1',
    'equation_line_3',
    'worksheet_grapher_a1_mod_initial_plus_point',
    'worksheet_grapher_a1_mod_two_points',
    'modeling_linear_equations_in_standard_form',
    'graph_setup_linear_equation-1',
    'graph_setup_linear_equation-2',
    'classifying_relations_and_functions',
    'introduction_to_functions',
    'graphs_of_functions',
    'graphs_of_functions-1',
    'compare_functions_diff_reps_linear_relationships'
]

# Boolean masks marking the rows that belong to a workspace we keep.
keep_main_rows = ~df_main['Level..Workspace.Id.'].isin(workspace_ids_to_remove)
keep_ws_rows = ~df_ws['workspace'].isin(workspace_ids_to_remove)

df_main = df_main[keep_main_rows]
df_ws = df_ws[keep_ws_rows]

In [None]:
# Build one cleaned score table per survey scale (MAP and SE).
# Missing Pre/Post1/Post2 values are replaced by the column median, and a
# synthetic Post3 score is defined as the mean of Post1 and Post2.
imputer = SimpleImputer(strategy='median')


def _build_cleaned_scores(scale):
    """Return the imputed score table for one scale suffix ('MAP' or 'SE').

    The shared module-level `imputer` is re-fitted on the three score columns
    of the requested scale each time this is called.
    """
    score_cols = [f"Pre{scale}", f"Post1{scale}", f"Post2{scale}"]
    cleaned = df_scores[["Anon.Student.Id"] + score_cols].copy()
    cleaned[score_cols] = imputer.fit_transform(cleaned[score_cols])
    # Post3 is the average of the two (imputed) post-test scores.
    cleaned[f"Post3{scale}"] = (cleaned[f"Post1{scale}"] + cleaned[f"Post2{scale}"]) / 2
    return cleaned


# MAP first, then SE -- the same order in which the imputer is re-fitted.
df_cleaned_map = _build_cleaned_scores("MAP")
df_cleaned_se = _build_cleaned_scores("SE")

In [None]:
# REPLACE WITH DESIRED OUTPUT DIRECTORY
# Persist every preprocessed table as CSV without the index column.
preprocessed_outputs = (
    (df_main, "preprocessed_data/df_main.csv"),
    (df_ws, "preprocessed_data/df_ws.csv"),
    (df_scores, "preprocessed_data/df_scores.csv"),
    (df_cleaned_map, "preprocessed_data/df_cleaned_map.csv"),
    (df_cleaned_se, "preprocessed_data/df_cleaned_se.csv"),
)

for frame, csv_path in preprocessed_outputs:
    frame.to_csv(csv_path, index=False)

# Loading Preprocessed Test Data

In [None]:
# ==============================================================================
# DATA LOADING AND VALIDATION
# ==============================================================================
# Reloads the saved CSVs and checks that the Post3 target columns exist.

print("\nLOADING AND VALIDATING DATA")
print("=" * 50)

# Load all data files (the "_allws" interaction/workspace snapshots plus the
# cleaned MAP and SE score tables).
df_main = pd.read_csv('preprocessed_data/df_main_allws.csv')
df_ws = pd.read_csv('preprocessed_data/df_ws_allws.csv')
df_cleaned_map = pd.read_csv('preprocessed_data/df_cleaned_map.csv')
df_cleaned_se = pd.read_csv('preprocessed_data/df_cleaned_se.csv')

print(f"✓ Main interaction data: {len(df_main):,} records")
print(f"✓ Workspace data: {len(df_ws):,} workspace sessions")
print(f"✓ MAP assessments: {len(df_cleaned_map):,} students")
print(f"✓ SE assessments: {len(df_cleaned_se):,} students")

print("\nVALIDATING MOTIVATION DATA:")
print("-" * 40)
print("MAP columns:", df_cleaned_map.columns.tolist())
print("SE columns:", df_cleaned_se.columns.tolist())

# Check for Post3SE and Post3MAP specifically
required_targets = (
    ('Post3MAP', df_cleaned_map),
    ('Post3SE', df_cleaned_se),
)
missing_cols = [
    column for column, frame in required_targets if column not in frame.columns
]

if not missing_cols:
    # Both targets are present: report coverage and value range for each.
    post3_map = df_cleaned_map['Post3MAP']
    print("Post3MAP found in MAP data")
    non_null_count_map = post3_map.notna().sum()
    col_range_map = f"{post3_map.min():.2f} to {post3_map.max():.2f}"
    print(f"  - Non-null values: {non_null_count_map}, Range: {col_range_map}")

    post3_se = df_cleaned_se['Post3SE']
    print("Post3SE found in SE data")
    non_null_count_se = post3_se.notna().sum()
    col_range_se = f"{post3_se.min():.2f} to {post3_se.max():.2f}"
    print(f"  - Non-null values: {non_null_count_se}, Range: {col_range_se}")
else:
    # Only reported, not raised: execution continues past this cell.
    print(f"ERROR: Missing columns: {missing_cols}")
    print("Available MAP columns:", df_cleaned_map.columns.tolist())
    print("Available SE columns:", df_cleaned_se.columns.tolist())

In [None]:
# ==============================================================================
# TARGET VARIABLE CREATION - MOTIVATION SCORES
# ==============================================================================

print("\nCREATING MOTIVATION TARGET VARIABLE")
print("=" * 50)

# Inner-join the SE and MAP score tables so only students present in both
# remain.
df_combined = pd.merge(df_cleaned_se, df_cleaned_map, on='Anon.Student.Id', how='inner')

# Keep only students for whom both Post3 scores are available.
motivation_data = df_combined.dropna(subset=['Post3SE', 'Post3MAP']).copy()

print(f"Students with both PostSE and PostMAP scores: {len(motivation_data)}")

# Motivation score = arithmetic mean of the two Post3 scores.
post3_se_scores = motivation_data['Post3SE']
post3_map_scores = motivation_data['Post3MAP']
motivation_data['motivation_score'] = (post3_se_scores + post3_map_scores) / 2

# Create categorical version (Low, Medium, High)
def categorize_motivation(score, low_max=4.0, medium_max=5.5):
    """Bucket a numeric motivation score into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    score : float
        Motivation score to classify.
    low_max : float, default 4.0
        Inclusive upper bound of the 'Low' bucket.
    medium_max : float, default 5.5
        Inclusive upper bound of the 'Medium' bucket.

    Returns
    -------
    str or float
        'Low' if score <= low_max, 'Medium' if score <= medium_max, otherwise
        'High'. A missing score (None / NaN) yields NaN instead of a label:
        every comparison with NaN is False, so it would otherwise fall
        through to 'High'.
    """
    if pd.isna(score):
        return float('nan')
    if score <= low_max:
        return 'Low'
    if score <= medium_max:
        return 'Medium'
    return 'High'

# Attach the Low/Medium/High label to every student.
motivation_data['motivation_category'] = motivation_data['motivation_score'].apply(categorize_motivation)

# Show distribution of the continuous score.
score_series = motivation_data['motivation_score']
print("\nMotivation score statistics:")
print(f"  Mean: {score_series.mean():.2f}")
print(f"  Std: {score_series.std():.2f}")
print(f"  Range: {score_series.min():.2f} - {score_series.max():.2f}")

# Summary of the two components that are averaged into the score.
print("\nIndividual component statistics:")
se_component = motivation_data['Post3SE']
map_component = motivation_data['Post3MAP']
print(f"  Post3SE - Mean: {se_component.mean():.2f}, Std: {se_component.std():.2f}")
print(f"  Post3MAP - Mean: {map_component.mean():.2f}, Std: {map_component.std():.2f}")

# Students per category, with each category's share of all students.
category_counts = motivation_data['motivation_category'].value_counts()
print("\nMotivation category distribution:")
for category, count in category_counts.items():
    percentage = (count / len(motivation_data)) * 100
    print(f"  • {category}: {count} students ({percentage:.1f}%)")

print("\nTarget variable created successfully!")


In [None]:
# ==============================================================================
# FEATURE ENGINEERING - MAP & SE FOCUSED FEATURES
# ==============================================================================

print("\nENGINEERING MAP & SE FOCUSED FEATURES")
print("=" * 50)

# Restrict both behavioural tables to students that have a motivation score.
target_students = motivation_data['Anon.Student.Id'].unique()

has_target_main = df_main['Anon.Student.Id'].isin(target_students)
has_target_ws = df_ws['Anon.Student.Id'].isin(target_students)
df_filtered = df_main.loc[has_target_main].copy()
df_ws_filtered = df_ws.loc[has_target_ws].copy()

print(f"Filtering data for {len(target_students)} students with motivation scores")
print(f"Behavioral data: {len(df_filtered):,} interactions")
print(f"Workspace data: {len(df_ws_filtered):,} workspace sessions")

# One feature dict per student is appended here by the loop that follows.
features_list = []

# Build one feature dictionary per target student and append it to
# `features_list`. Students with no interaction rows are skipped entirely.
# NOTE(review): the saved model is presumably trained on features computed
# exactly this way -- confirm before changing any formula below.
for student_id in target_students:
    # Per-student slices of the interaction log and the workspace summary.
    student_data = df_filtered[df_filtered['Anon.Student.Id'] == student_id].copy()
    student_ws = df_ws_filtered[df_ws_filtered['Anon.Student.Id'] == student_id].copy()
    
    # No interactions -> no feature row for this student.
    if len(student_data) == 0:
        continue
    
    features = {'Anon.Student.Id': student_id}
    
    # ==========================================================================
    # MAP-RELATED FEATURES (Mastery-Oriented Behavior)
    # ==========================================================================
    
    # 1. Sessions per Student, balanced by average problems solved per session
    # Counts are distinct Session.Id / Problem.Name values for this student.
    unique_sessions = student_data['Session.Id'].nunique()
    total_problems = student_data['Problem.Name'].nunique()
    avg_problems_per_session = total_problems / unique_sessions if unique_sessions > 0 else 0
    # NOTE(review): sessions * (problems / sessions / 10) reduces algebraically
    # to total_problems / 10 whenever unique_sessions > 0, so this feature is
    # a rescaled copy of `problems_attempted`.
    features['sessions_balanced_by_problems'] = unique_sessions * (avg_problems_per_session / 10)  # Divide by 10 to rescale
    
    # 2. Number of Problems Attempted (distinct problem names)
    features['problems_attempted'] = total_problems
    
    # 3. Percentage of Graduated Workspaces
    # Sum of the 'Graduate' column divided by the number of workspace rows;
    # 0 when the student has no workspace rows or the column is absent.
    if len(student_ws) > 0:
        graduated_ws = student_ws['Graduate'].sum() if 'Graduate' in student_ws.columns else 0
        total_ws = len(student_ws)
        features['graduated_workspaces_pct'] = graduated_ws / total_ws if total_ws > 0 else 0
    else:
        features['graduated_workspaces_pct'] = 0
    
    # 4. Hint Usage Pattern (strategic use of hints)
    # Share of hint rows whose NEXT row within the same problem has a
    # successful outcome. The last row of each problem has no successor and
    # is therefore never counted as a hint instance.
    hint_followed_by_success = 0
    total_hint_instances = 0
    
    for problem in student_data['Problem.Name'].unique():
        # Rows of this problem ordered by the per-student step index.
        problem_data = student_data[student_data['Problem.Name'] == problem].sort_values('time_step')
        
        for i in range(len(problem_data) - 1):
            # Case-insensitive matching: Action is lower-cased, Outcome upper-cased.
            current_action = str(problem_data.iloc[i]['Action']).lower()
            next_outcome = str(problem_data.iloc[i + 1]['Outcome']).upper()
            
            if 'hint' in current_action:
                total_hint_instances += 1
                if next_outcome in ['OK', 'CORRECT']:
                    hint_followed_by_success += 1
    
    # 0 when the student never produced a countable hint row.
    features['hint_usage_pattern'] = hint_followed_by_success / total_hint_instances if total_hint_instances > 0 else 0
    
    # 5. Change in Help Level Over Time (learning progression)
    if 'Help.Level' in student_data.columns:
        help_levels = student_data['Help.Level'].dropna()
        if len(help_levels) > 1:
            # Linear trend of Help.Level against its position (0, 1, 2, ...)
            # among the non-null values, in the frame's current row order.
            # A negative slope means decreasing help needs over time.
            x = np.arange(len(help_levels))
            slope = np.polyfit(x, help_levels, 1)[0]
            # Sign is flipped, so a POSITIVE feature value means less help over time.
            features['help_level_change'] = -slope
        else:
            features['help_level_change'] = 0
    else:
        features['help_level_change'] = 0
    
    # 6. Step Completion Rate
    # Fraction of rows whose Outcome is exactly 'OK' or 'CORRECT'
    # (case-sensitive here, unlike feature 4).
    completed_steps = student_data['Outcome'].isin(['OK', 'CORRECT']).sum()
    total_steps = len(student_data)
    features['step_completion_rate'] = completed_steps / total_steps if total_steps > 0 else 0
    
    # ==========================================================================
    # SE-RELATED FEATURES (Self-Efficacy)
    # ==========================================================================
    
    # 7. Ratio of Attempts to Hint Requests
    # Attempt rows: Action contains 'attempt' or 'submit'; hint rows: Action
    # contains 'hint' (both case-insensitive, missing Action counts as no match).
    attempt_actions = student_data['Action'].str.contains('attempt|submit', case=False, na=False).sum()
    hint_actions = student_data['Action'].str.contains('hint', case=False, na=False).sum()
    # With zero hint rows the raw attempt count is used instead of a ratio.
    features['attempt_to_hint_ratio'] = attempt_actions / hint_actions if hint_actions > 0 else attempt_actions
    
    # 8. Quick Return After Error (resilience)
    # Share of error rows (Outcome exactly 'ERROR' or 'WRONG') whose following
    # row is fewer than 30 `time_step` units later. The student's last row is
    # never examined.
    quick_returns = 0
    error_instances = 0
    
    for i in range(len(student_data) - 1):
        if student_data.iloc[i]['Outcome'] in ['ERROR', 'WRONG']:
            error_instances += 1
            time_diff = student_data.iloc[i + 1]['time_step'] - student_data.iloc[i]['time_step']
            # NOTE(review): `time_step` is created in the preprocessing cell as
            # a per-student rank of `datetime` minus 1, i.e. a step index, not
            # seconds. The threshold therefore compares rank units -- confirm
            # whether elapsed seconds were intended.
            if time_diff < 30:
                quick_returns += 1
    
    # Defaults to 1 when the student has no error rows.
    features['quick_return_rate'] = quick_returns / error_instances if error_instances > 0 else 1
    
    # 9. Average Help Level (lower = more self-efficacy)
    # Mean over strictly positive Help.Level values only; 0 when there are none.
    if 'Help.Level' in student_data.columns:
        help_levels = student_data['Help.Level'].dropna()
        help_levels_nonzero = help_levels[help_levels > 0]
        features['avg_help_level'] = help_levels_nonzero.mean() if len(help_levels_nonzero) > 0 else 0
    else:
        features['avg_help_level'] = 0
    
    # 10. Proportion of Low-Level Hints (SE indicator)
    # Share of hint rows whose Help.Level is 0 or 1.
    if 'Help.Level' in student_data.columns:
        hint_requests = student_data[student_data['Action'].str.contains('hint', case=False, na=False)]
        if len(hint_requests) > 0:
            low_level_hints = hint_requests['Help.Level'].isin([0, 1]).sum()
            features['low_level_hints_prop'] = low_level_hints / len(hint_requests)
        else:
            features['low_level_hints_prop'] = 0
    else:
        features['low_level_hints_prop'] = 0
    
    # 11. Proportion of High-Level Hints (lower SE indicator)
    # Share of hint rows whose Help.Level is 4 or 5 (same hint-row selection
    # as feature 10, recomputed here).
    if 'Help.Level' in student_data.columns:
        hint_requests = student_data[student_data['Action'].str.contains('hint', case=False, na=False)]
        if len(hint_requests) > 0:
            high_level_hints = hint_requests['Help.Level'].isin([4, 5]).sum()
            features['high_level_hints_prop'] = high_level_hints / len(hint_requests)
        else:
            features['high_level_hints_prop'] = 0
    else:
        features['high_level_hints_prop'] = 0
    
    # ==========================================================================
    # ADDITIONAL MOTIVATION-RELATED FEATURES
    # ==========================================================================
    
    # 12. Session Persistence (number of distinct sessions, from feature 1)
    features['session_persistence'] = unique_sessions
    
    # 13. Total Interaction Time
    # Span between the smallest and largest `time_step` for this student.
    # NOTE(review): as with feature 8, `time_step` is a rank index, so this is
    # a step-count span rather than elapsed time -- confirm intent.
    if len(student_data) > 1:
        features['total_interaction_time'] = student_data['time_step'].max() - student_data['time_step'].min()
    else:
        features['total_interaction_time'] = 0
    
    # 14. Problem-solving Efficiency
    # 1 / (1 + span-per-problem / 60): equals 1 when the span is 0 and
    # decreases as the span per problem grows.
    avg_time_per_problem = features['total_interaction_time'] / total_problems if total_problems > 0 else 0
    features['problem_solving_efficiency'] = 1 / (1 + avg_time_per_problem / 60)  # Division by 60 presumably targets minutes -- see unit note above
    
    features_list.append(features)

# Assemble the per-student feature dicts into a single table.
features_df = pd.DataFrame(features_list)
print(f"\nMotivation features engineered for {len(features_df)} students")
print(f"Total features: {len(features_df.columns) - 1}")

# Feature names grouped by the construct they are meant to capture.
map_features = [
    'sessions_balanced_by_problems', 'problems_attempted', 'graduated_workspaces_pct',
    'hint_usage_pattern', 'help_level_change', 'step_completion_rate'
]
se_features = [
    'attempt_to_hint_ratio', 'quick_return_rate', 'avg_help_level',
    'low_level_hints_prop', 'high_level_hints_prop'
]

# Print the mean of every listed feature that exists in the table, one
# section per group (MAP first, then SE).
feature_groups = (
    ("\nMAP-FOCUSED FEATURES:", map_features),
    ("\nSE-FOCUSED FEATURES:", se_features),
)
for group_heading, group_features in feature_groups:
    print(group_heading)
    print("-" * 30)
    for feat in group_features:
        if feat not in features_df.columns:
            continue
        print(f"  ✓ {feat}: mean = {features_df[feat].mean():.3f}")

# Display feature statistics
print("\nFeature summary:")
print(features_df.describe().round(3))


In [None]:
# ==============================================================================
# DATA MERGING AND PREPARATION
# ==============================================================================
# Joins the engineered features with the motivation targets and median-imputes
# the numeric feature columns. Target columns are left untouched.

print("\nMERGING FEATURES WITH TARGET VARIABLE")
print("=" * 50)

# Columns that describe the target rather than student behaviour.
target_cols = ['motivation_score', 'motivation_category', 'Post3SE', 'Post3MAP']

# Merge features with motivation scores
df = features_df.merge(
    motivation_data[['Anon.Student.Id'] + target_cols],
    on='Anon.Student.Id', how='inner'
)

print(f"Final dataset: {len(df)} students")
print(f"Features: {len(df.columns) - 5}")  # Excluding ID and target columns

# Report missing values before imputation
print("\nMissing values per column:")
missing_counts = df.isnull().sum()
for col, count in missing_counts.items():
    if count > 0:
        print(f"  {col}: {count}")

# Fill missing values with median for numerical columns.
# The exclusion list uses the real column names (Post3SE / Post3MAP). It
# previously named 'PostSE' / 'PostMAP', which do not exist in `df`, so the
# target components were fed through the imputer together with the features.
excluded_from_imputation = {'motivation_score', 'Post3SE', 'Post3MAP'}
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in excluded_from_imputation
]

# Kept for backward compatibility; nothing in the visible notebook reads them.
postse_median = df['Post3SE'].median()
postmap_median = df['Post3MAP'].median()

imputer = SimpleImputer(strategy='median')
if numerical_cols:
    # Guard: fitting the imputer on zero columns raises.
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

print("\nData preparation complete!")
print(f"Shape: {df.shape}")


In [None]:
# ==============================================================================
# QWK METRIC DEFINITION
# ==============================================================================

def qwk(y_true, y_pred, labels=None):
    """Quadratic Weighted Kappa - primary metric for ordinal classification.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    labels : list, optional
        Class labels in ORDINAL order, forwarded to `cohen_kappa_score`.
        The quadratic weights depend on each label's position, so string
        classes should be passed explicitly (e.g. ['Low', 'Medium', 'High']);
        with the default (None) scikit-learn orders the observed labels
        itself, which for strings is alphabetical.

    Returns
    -------
    float
        Cohen's kappa computed with quadratic weights.
    """
    return cohen_kappa_score(y_true, y_pred, labels=labels, weights='quadratic')

def qwk_scorer(estimator, X, y):
    """Custom scorer for cross-validation.

    Follows the `scorer(estimator, X, y)` calling convention: predicts on `X`
    with the fitted estimator and returns the QWK against `y`, using the
    default label ordering.
    """
    return qwk(y, estimator.predict(X))

print("QWK metric defined for ordinal classification evaluation")


In [None]:
# ==============================================================================
# LOAD TRAINED MODEL AND MAKE PREDICTIONS
# ==============================================================================
# Uses `pickle`, `pd` and `np` from the import cell at the top of the notebook
# and the prepared `df` from the data-merging cell.

print("Loading trained motivation model...")

# Load the saved model and feature columns.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load 'motivation_model.pkl' if it comes from a trusted source.
with open('motivation_model.pkl', 'rb') as f:
    model, feature_cols = pickle.load(f)

print("✓ Model loaded successfully!")
print(f"✓ Model type: {type(model).__name__}")
print(f"✓ Number of features: {len(feature_cols)}")

# Prepare features for prediction
print("\nPreparing features for prediction...")

# Fail early, listing every feature the model expects that was not engineered.
missing_features = [col for col in feature_cols if col not in df.columns]
if missing_features:
    raise KeyError(
        f"Prepared dataset is missing features required by the model: {missing_features}"
    )

# Select only the features used during training (excluding target variables)
X_test = df[feature_cols].copy()

# Handle missing values: infinities become NaN, then every NaN becomes 0.
# NOTE(review): presumably mirrors the training-time handling -- confirm
# against the training notebook.
print("Handling missing values...")
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)  # Fill with 0 for missing values

print(f"✓ Test features shape: {X_test.shape}")
print(f"✓ Features with missing values: {X_test.isnull().sum().sum()}")

# Make predictions
print("\nMaking predictions...")
predictions = model.predict(X_test)

print(f"✓ Predictions made for {len(predictions)} students")

# A numeric range only makes sense for numeric outputs. Class-label outputs
# (e.g. 'Low' / 'Medium' / 'High') cannot be formatted with ':.2f', so the
# label distribution is reported for them instead.
predictions_array = np.asarray(predictions)
if np.issubdtype(predictions_array.dtype, np.number):
    print(f"✓ Prediction range: {predictions_array.min():.2f} to {predictions_array.max():.2f}")
else:
    class_distribution = pd.Series(predictions_array).value_counts().to_dict()
    print(f"✓ Predicted class distribution: {class_distribution}")

# Create results dataframe (one row per student, same order as `df`)
results_df = pd.DataFrame({
    'Anon.Student.Id': df['Anon.Student.Id'],
    'PostMotivation3_Predicted': predictions
})

print(f"\nResults shape: {results_df.shape}")
print("\nFirst 10 predictions:")
print(results_df.head(10))

# Save predictions
output_file = 'motivation_predictions.csv'
results_df.to_csv(output_file, index=False)
print(f"\n✓ Predictions saved to '{output_file}'")