In [1]:
from pyspark.sql import SparkSession

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Initialize Spark session
spark = (SparkSession.builder.appName("DataProcessingApp")
         .config("spark.executor.memory", "12g")
         .config("spark.driver.memory", "12g")
         .getOrCreate())
spark.sparkContext.setLogLevel("DEBUG")

problem_df = spark.read.csv('data/2019-2020_school_year/pdets_no_null.csv', header=True, inferSchema=True)

In [2]:
from pyspark.sql.functions import split, col

# Read the CSV file into a Spark DataFrame
feedback_df = spark.read.csv('data/2019-2020_school_year/final_matrix.csv', header=True, inferSchema=True)

# Select columns that start with a digit
category_cols = [col for col in feedback_df.columns if col[0].isdigit()]

# Melt the DataFrame
feedback_long = feedback_df.selectExpr("student_id", "stack(" + str(len(category_cols)) + ", " + 
                                       ", ".join([f"'{col}', {col}" for col in category_cols]) + 
                                       ") as (category, performance)")

# Split the category column into grade, domain, subdomain
feedback_long = feedback_long.withColumn('grade', split(col('category'), '_').getItem(0).cast('int'))
feedback_long = feedback_long.withColumn('domain', split(col('category'), '_').getItem(1))
feedback_long = feedback_long.withColumn('subdomain', split(col('category'), '_').getItem(2))

# Drop the original category column
feedback_df = feedback_long.drop('category')

In [3]:
feedback_df.sort('student_id').show(5)

+----------+------------------+-----+------+---------+
|student_id|       performance|grade|domain|subdomain|
+----------+------------------+-----+------+---------+
|        16|0.8717950612926478|    1|     G|        A|
|        16|0.8660582558606779|    1|    OA|        A|
|        16|0.8717950612926478|    1|    MD|        A|
|        16|0.8748839089842363|    1|   NBT|        A|
|        16|0.8748839089842363|    1|   NBT|        B|
+----------+------------------+-----+------+---------+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import col, expr

# Calculate Q1, Q3, and IQR
quantiles = problem_df.approxQuantile("mean_time_on_task", [0.25, 0.75], 0.05)
Q1, Q3 = quantiles[0], quantiles[1]
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers (if needed)
filtered_df = problem_df.filter((col("mean_time_on_task") >= lower_bound) & (col("mean_time_on_task") <= upper_bound))

In [5]:
from pyspark.sql.functions import col, when

# Clip the values
problem_df = problem_df.withColumn("mean_time_on_task",
                   when(problem_df.mean_time_on_task < lower_bound, lower_bound)
                   .when(col("mean_time_on_task") > upper_bound, upper_bound)
                   .otherwise(col("mean_time_on_task")))

# Normalize between 0 and 1
min_val = problem_df.agg({"mean_time_on_task": "min"}).collect()[0][0]
max_val = problem_df.agg({"mean_time_on_task": "max"}).collect()[0][0]
problem_df = problem_df.withColumn("mean_time_on_task",
                   (col("mean_time_on_task") - min_val) / (max_val - min_val))

In [6]:
from pyspark.sql.functions import mean, col

difficulty_features = ['mean_correct', 'mean_time_on_task']
for column in difficulty_features:
    avg = problem_df.select(mean(col(column)).alias('mean')).first()['mean']
    problem_df = problem_df.fillna({column: avg})

problem_df.show(5)

+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+
|problem_id|        problem_type|student_answer_count|       mean_correct|  mean_time_on_task|grade|domain|subdomain|
+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+
|     13623|     Multiple Choice|                  27| 0.8888888888888888| 0.3730991995021376|    7|     G|        B|
|     20683|     Multiple Choice|                  22| 0.8181818181818182| 0.3730991995021376|    8|    NS|        A|
|     23271|     Multiple Choice|                  19|0.15789473684210525| 0.3730991995021376|    7|    RP|        A|
|     47084|Algebraic Expression|                   5|                0.4|                1.0|    6|    SP|        B|
|     54190|Algebraic Expression|                  55| 0.9444444444444444|0.06929991797386947|    7|    NS|        A|
+----------+--------------------+--------------------+--

In [7]:
from pyspark.sql.functions import isnan, when, count
problem_df.select([count(when(col(c).isNull(), c)).alias(c) for c in problem_df.columns]).show()

+----------+------------+--------------------+------------+-----------------+-----+------+---------+
|problem_id|problem_type|student_answer_count|mean_correct|mean_time_on_task|grade|domain|subdomain|
+----------+------------+--------------------+------------+-----------------+-----+------+---------+
|         0|           0|                   0|           0|                0|76652| 76652|    76652|
+----------+------------+--------------------+------------+-----------------+-----+------+---------+



In [8]:
problem_df = problem_df.dropna()

In [9]:
from pyspark.ml.feature import StringIndexer

# Create indexers for categorical features
indexers = {
    'grade': StringIndexer(inputCol='grade', outputCol='grade_idx', handleInvalid='keep'),
    'domain': StringIndexer(inputCol='domain', outputCol='domain_idx', handleInvalid='keep'),
    'subdomain': StringIndexer(inputCol='subdomain', outputCol='subdomain_idx', handleInvalid='keep')
}

In [10]:
# Fit and transform indexers on problem data
problems_indexed = problem_df
feedback_indexed = feedback_df

for indexer in indexers.values():
    indexer_model = indexer.fit(problem_df.select(indexer.getInputCol()).union(feedback_df.select(indexer.getInputCol())))
    problems_indexed = indexer_model.transform(problems_indexed)
    feedback_indexed = indexer_model.transform(feedback_indexed)

In [11]:
problem_df.describe().show()

+-------+-----------------+--------------------+--------------------+------------------+--------------------+------------------+------+---------+
|summary|       problem_id|        problem_type|student_answer_count|      mean_correct|   mean_time_on_task|             grade|domain|subdomain|
+-------+-----------------+--------------------+--------------------+------------------+--------------------+------------------+------+---------+
|  count|            64843|               64843|               64843|             64843|               64843|             64843| 64843|    64843|
|   mean|885215.8859553074|                NULL|  149.84889656555063|0.6418865939250317| 0.38576570288812234| 5.967427217754504|  NULL|     NULL|
| stddev|584964.4632676493|                NULL|  272.67700972866464|0.2367674371140991| 0.27091740906924844|1.8471927482757218|  NULL|     NULL|
|    min|                1|Algebraic Expression|                   1|               0.0|3.065406030874406E-6|               

In [12]:
from pyspark.ml.feature import VectorAssembler

# Calculate problem difficulty
difficulty_features = ['mean_correct', 'mean_time_on_task']
# Create difficulty vector
difficulty_assembler = VectorAssembler(
    inputCols=difficulty_features,
    outputCol="difficulty_features"
)
problems_with_features = difficulty_assembler.transform(problems_indexed)
problems_with_features.show(5)

+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+---------+----------+-------------+--------------------+
|problem_id|        problem_type|student_answer_count|       mean_correct|  mean_time_on_task|grade|domain|subdomain|grade_idx|domain_idx|subdomain_idx| difficulty_features|
+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+---------+----------+-------------+--------------------+
|     13623|     Multiple Choice|                  27| 0.8888888888888888| 0.3730991995021376|    7|     G|        B|      7.0|       2.0|          1.0|[0.88888888888888...|
|     20683|     Multiple Choice|                  22| 0.8181818181818182| 0.3730991995021376|    8|    NS|        A|      4.0|       7.0|          0.0|[0.81818181818181...|
|     23271|     Multiple Choice|                  19|0.15789473684210525| 0.3730991995021376|    7|    RP|        A|      7.0|   

In [13]:
from pyspark.ml.clustering import KMeans

# Cluster problems by difficulty
kmeans = KMeans(k=3, featuresCol="difficulty_features")
difficulty_model = kmeans.fit(problems_with_features)
problems_with_difficulty = difficulty_model.transform(problems_with_features)

# Add difficulty labels
problems_final = problems_with_difficulty.withColumn(
    'difficulty_label',
    when(col('prediction') == 0, 'Easy')
    .when(col('prediction') == 1, 'Medium')
    .otherwise('Hard')
)

In [15]:
# Prepare feature vector for Factorization Machine
feature_cols = ['grade_idx', 'domain_idx', 'subdomain_idx']
feature_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Transform feedback data with feature vector
feedback_features = feature_assembler.transform(feedback_indexed)

In [16]:
from pyspark.ml.regression import FMRegressor

# Train Factorization Machine model
fm = FMRegressor(
    featuresCol="features",
    labelCol="performance",
    factorSize=8,  # Size of the latent factors
    maxIter=100,
    stepSize=0.1
)

fm_model = fm.fit(feedback_features)

In [18]:
# Function to get recommendations for a student
def get_recommendations(student_id, problems_df, fm_model, feature_assembler, n_recommendations=5):
    """
    Get personalized problem recommendations for a student
    """
    # Create all possible combinations for the student
    unique_combinations = problems_df.select('grade_idx', 'domain_idx', 'subdomain_idx', 
                                           'grade', 'domain', 'subdomain', 
                                           'difficulty_label').distinct()
    
    # Prepare features for prediction
    prediction_features = feature_assembler.transform(unique_combinations)
    
    # Make predictions
    predictions = fm_model.transform(prediction_features)
    
    # Sort by prediction (ascending) and difficulty
    recommendations = predictions.orderBy(
        'prediction',  # Focus on areas needing improvement
        'difficulty_label'  # Start with easier problems
    ).limit(n_recommendations)
    
    return recommendations

In [19]:
# Get recommendations for student
recommendations = get_recommendations(39285, problems_final, fm_model, feature_assembler)
recommendations.show()

+---------+----------+-------------+-----+------+---------+----------------+---------------+-------------------+
|grade_idx|domain_idx|subdomain_idx|grade|domain|subdomain|difficulty_label|       features|         prediction|
+---------+----------+-------------+-----+------+---------+----------------+---------------+-------------------+
|     12.0|      19.0|          1.0|  HSG|   GPE|        B|            Easy|[12.0,19.0,1.0]|-0.3939001027243008|
|     12.0|      19.0|          1.0|  HSG|   GPE|        B|            Hard|[12.0,19.0,1.0]|-0.3939001027243008|
|     12.0|      19.0|          1.0|  HSG|   GPE|        B|          Medium|[12.0,19.0,1.0]|-0.3939001027243008|
|     13.0|      18.0|          1.0|  HSS|    CP|        B|            Easy|[13.0,18.0,1.0]|-0.3617500704726811|
|     13.0|      18.0|          1.0|  HSS|    CP|        B|            Hard|[13.0,18.0,1.0]|-0.3617500704726811|
+---------+----------+-------------+-----+------+---------+----------------+---------------+----

In [31]:
# Function to generate learning path
def generate_learning_path(student_id, grade, domain, subdomain, problems_df, fm_model, feature_assembler):
    """
    Generate a progressive learning path for specific grade/domain/subdomain
    """
    # Get relevant problems
    category_problems = problems_df.filter(
        (col('grade') == grade) & 
        (col('domain') == domain) & 
        (col('subdomain') == subdomain)
    )
    
    # Get predicted performance
    features = feature_assembler.transform(
        category_problems.select('grade_idx', 'domain_idx', 'subdomain_idx')
    )
    predictions = fm_model.transform(features)
    
    # Join predictions with problems
    learning_path = predictions.join(category_problems)
    
    # Sort by difficulty
    learning_path = learning_path.orderBy('difficulty_label')
    
    return learning_path

In [32]:
# Generate learning path for a specific category
learning_path = generate_learning_path(39285, 7, 'RP', 'A', problems_final, fm_model, feature_assembler)
learning_path.sort(col('difficulty_label').desc()).show(20)

+---------+----------+-------------+-------------+-----------------+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+---------+----------+-------------+--------------------+----------+----------------+
|grade_idx|domain_idx|subdomain_idx|     features|       prediction|problem_id|        problem_type|student_answer_count|       mean_correct|  mean_time_on_task|grade|domain|subdomain|grade_idx|domain_idx|subdomain_idx| difficulty_features|prediction|difficulty_label|
+---------+----------+-------------+-------------+-----------------+----------+--------------------+--------------------+-------------------+-------------------+-----+------+---------+---------+----------+-------------+--------------------+----------+----------------+
|      7.0|       9.0|          0.0|[7.0,9.0,0.0]|0.464795070598812|   1506753|              Number|                 314| 0.3258064516129032| 0.3290909560129651|    7|    RP|        A|      7.0