# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the Run Cell button in the gutter.
To debug a cell, press Alt+Shift+Enter, or click the Debug Cell button in the gutter.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/jupyter-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

import os
import sys
# Point both the PySpark workers and the driver at the interpreter running this kernel.
for interpreter_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[interpreter_var] = sys.executable

# Initialize Spark session (12g of memory each for the executor and the driver)
session_builder = SparkSession.builder.appName("DataProcessingApp")
session_builder = session_builder.config("spark.executor.memory", "12g")
session_builder = session_builder.config("spark.driver.memory", "12g")
spark = session_builder.getOrCreate()

# NOTE(review): DEBUG is the most verbose commonly used log level and will
# flood the cell output; confirm it is still wanted.
spark.sparkContext.setLogLevel("DEBUG")

# Load the problem details CSV (first row is the header, column types are
# inferred) and discard the 'content_source' and 'tutoring_types' columns
pdets_path = 'data/2019-2020_school_year/pdets.csv'
df = (
    spark.read.csv(pdets_path, header=True, inferSchema=True)
    .drop('content_source', 'tutoring_types')
)

# Print summary statistics of the remaining columns
summary_stats = df.describe()
summary_stats.show()

In [None]:
from pyspark.sql.types import IntegerType

# Keep only the rows that have both a 'problem_id' and a 'skills' value
has_problem_id = col("problem_id").isNotNull()
has_skills = col("skills").isNotNull()
df = df.filter(has_problem_id & has_skills)

# Store 'problem_id' as an integer column
problem_id_as_int = col("problem_id").cast(IntegerType())
df = df.withColumn("problem_id", problem_id_as_int)

# Display the first 10 rows
df.show(10)

In [None]:
from pyspark.sql.functions import isnan, when, count
# Count the null cells of every column: `when` produces a non-null marker only
# for null cells, and `count` tallies the non-null markers.
null_count_exprs = []
for column_name in df.columns:
    null_marker = when(col(column_name).isNull(), column_name)
    null_count_exprs.append(count(null_marker).alias(column_name))
df.select(null_count_exprs).show()

In [None]:
from ast import literal_eval
from pyspark.sql.functions import col, lit, when, explode, split
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Replace missing 'skills' entries with the string form of an empty list
skills_or_empty = F.coalesce(col("skills"), lit("[]"))
df = df.withColumn("skills", skills_or_empty)

# Print the current number of rows
row_total = df.count()
print(row_total, '....')

# Define a UDF to split skills and replicate rows
def process_skills(skills):
    """Split every skill code of a stringified list into (grade, domain, subdomain).

    Args:
        skills: String holding a Python list literal of dot-separated skill
            codes, e.g. "['6.NS.1']". May also be None or '[]'.

    Returns:
        A list with one 3-tuple per skill code. Codes with fewer than three
        dot-separated parts are padded with None; parts beyond the third are
        ignored. When there are no skills at all, a single
        (None, None, None) placeholder is returned so that exploding the
        result still yields one row instead of dropping it.

    Raises:
        ValueError, SyntaxError: If `skills` is not a valid Python literal
            (propagated from `literal_eval`).
    """
    no_skill = [(None, None, None)]
    if skills is None or skills == '[]':
        return no_skill

    parsed = literal_eval(skills)
    if not parsed:
        # e.g. "[ ]": an empty list would make explode() drop the row.
        return no_skill

    rows = []
    for skill in parsed:
        # Split once, keep at most three parts, and pad short codes with None.
        parts = skill.split('.')[:3]
        parts += [None] * (3 - len(parts))
        rows.append(tuple(parts))
    return rows

# Wrap `process_skills` as a Spark UDF returning an array of string arrays
skills_return_type = ArrayType(ArrayType(StringType()))
process_skills_udf = F.udf(process_skills, skills_return_type)

# Attach the parsed skills, then emit one row per entry of the parsed array.
# NOTE(review): `df` is overwritten with the exploded frame, so re-running
# this cell explodes the already exploded rows again.
parsed_skills = process_skills_udf(col("skills"))
df = (
    df.withColumn("skills_array", parsed_skills)
    .withColumn("skills_exploded", explode(col("skills_array")))
)

# Columns copied over unchanged
passthrough_columns = [
    "problem_id",
    "skills",
    "problem_type",
    "student_answer_count",
    "mean_correct",
    "mean_time_on_task",
]

# Positions 0, 1 and 2 of each exploded skill become named columns
skill_part_names = ("grade", "domain", "subdomain")
skill_part_columns = [
    col("skills_exploded").getItem(position).alias(part_name)
    for position, part_name in enumerate(skill_part_names)
]

new_df = df.select(*passthrough_columns, *skill_part_columns)

# Display the first rows of the result
new_df.show()

In [None]:
# Number of rows currently held in `df`; the value is shown as the cell output.
df.count()

One problem can belong to at most 4 unique classes.

In [None]:
# Load the problem logs CSV (header row present, column types inferred)
plogs_path = 'data/2019-2020_school_year/plogs.csv'
plogs = spark.read.csv(plogs_path, header=True, inferSchema=True)

# Print the number of raw log rows
print(plogs.count())

# Keep only the unique (assignment_id, problem_id) pairs and preview them
plogs = plogs.select('assignment_id', 'problem_id').dropDuplicates()
plogs.show(10)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

print('Debugging start')

skill_columns = ['grade', 'domain', 'subdomain']
mode_columns = [f'mode_{name}' for name in skill_columns]


def _most_common_per_problem(labels, value_col):
    """Return one row per `problem_id` with the most frequent non-null `value_col`.

    Args:
        labels: Spark DataFrame with a `problem_id` column and `value_col`.
        value_col: Name of the column whose mode is wanted.

    Returns:
        Spark DataFrame with columns `problem_id` and `mode_<value_col>`.
        Ties are broken by taking the smallest value so the result is
        deterministic. Problems with no non-null value are absent.
    """
    by_frequency = Window.partitionBy('problem_id').orderBy(
        F.col('count').desc(), F.col(value_col).asc()
    )
    return (
        labels.where(F.col(value_col).isNotNull())
        .groupBy('problem_id', value_col)
        .count()
        .withColumn('frequency_rank', F.row_number().over(by_frequency))
        .where(F.col('frequency_rank') == 1)
        .select('problem_id', F.col(value_col).alias(f'mode_{value_col}'))
    )


# Drop mode columns left behind by an earlier run of this cell so that it can
# be re-executed without producing duplicate column names (no-op on first run).
labelled_df = new_df.drop(*mode_columns)
new_df_no_skills = labelled_df.where(F.col('subdomain').isNull())

# Step 1: Find all `assignment_id`s that contain a problem without skills.
# The problem being imputed is kept under `target_problem_id` so it cannot
# clash with the `problem_id` of the other problems joined in below.
plogs_problem = (
    plogs.join(new_df_no_skills.select('problem_id').distinct(), on='problem_id', how='inner')
    .select(F.col('problem_id').alias('target_problem_id'), 'assignment_id')
)

# Step 2: For each such assignment, collect every problem it contains together
# with the skill labels those problems already have in `new_df`.
plogs_assignment_related = (
    plogs_problem.join(plogs, on='assignment_id', how='inner')
    .join(labelled_df.select('problem_id', *skill_columns), on='problem_id', how='inner')
    .select(F.col('target_problem_id').alias('problem_id'), *skill_columns)
)

# Step 3: Calculate the most common `grade`, `domain`, and `subdomain` among
# the related problems of each `problem_id` (a true mode based on counts).
grade_mode, domain_mode, subdomain_mode = (
    _most_common_per_problem(plogs_assignment_related, name) for name in skill_columns
)
assignment_problems = (
    grade_mode.join(domain_mode, on='problem_id', how='outer')
    .join(subdomain_mode, on='problem_id', how='outer')
)

# Step 4: Join the modes back and fill only the labels that are missing.
# Columns are referenced by name; after the join each name is unique.
new_df = labelled_df.join(assignment_problems, on='problem_id', how='left')
for name in skill_columns:
    new_df = new_df.withColumn(name, F.coalesce(F.col(name), F.col(f'mode_{name}')))
new_df.show(10)

In [None]:
# Row-by-row imputation of grade/domain/subdomain for problems whose 'skills'
# is '[]', written against the pandas DataFrame API (`iterrows`, boolean-mask
# indexing, `pd.merge`, `.loc`).
# NOTE(review): in the cells visible above, `new_df` and `plogs` are created
# through Spark (`df.select`, `spark.read.csv`) and `pd` is never imported, so
# this cell looks like a leftover pandas version of the imputation - confirm
# whether it should be removed or converted.
assignment_problems = None
for (idx, row) in new_df.iterrows():
    # Only rows without any skill are imputed.
    if row['skills'] == '[]':
        # Log rows of the current problem, and the assignments they belong to.
        plogs_idx = plogs[plogs['problem_id'] == row['problem_id']]
        related_assignments = plogs_idx['assignment_id'].unique()
        plogs_assignment = plogs[plogs['assignment_id'].isin(related_assignments)]
        # All log rows of every assignment that contains the current problem,
        # left-merged with `new_df` on 'problem_id' so that each log row
        # carries the columns of its problem.
        assignment_problems = pd.merge(plogs_assignment, new_df, 'left', on='problem_id')

        try:
            # mode() may return an empty result, so check before taking the
            # first element; fall back to None when there is no mode.
            impute_grade = assignment_problems['grade'].mode()
            impute_domain = assignment_problems['domain'].mode()
            impute_subdomain = assignment_problems['subdomain'].mode()

            if not impute_grade.empty:
                impute_grade = impute_grade[0]
            else:
                impute_grade = None  # or some other default value

            if not impute_domain.empty:
                impute_domain = impute_domain[0]
            else:
                impute_domain = None

            if not impute_subdomain.empty:
                impute_subdomain = impute_subdomain[0]
            else:
                impute_subdomain = None

        except Exception as e:
            # Any failure is reported and the row is left unchanged.
            print(f"Error processing row {row['problem_id']}: {e}")
            continue

        # NOTE(review): `row_to_update` is assigned but not used afterwards.
        row_to_update = new_df[new_df['problem_id'] == row['problem_id']]
        # Write the imputed values to every row sharing this problem_id.
        new_df.loc[new_df['problem_id'] == row['problem_id'], ['grade', 'domain', 'subdomain']] = [impute_grade, impute_domain, impute_subdomain]

# Display up to the first 10000 rows as the cell output.
new_df.head(10000)