### Imports, Config and Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests

In [None]:
%reload_ext autoreload
%autoreload 2

import sys

# Add the parent directory to the module search path so the project imports
# below can be resolved (presumably `shared` lives there — confirm)
sys.path.append('../')

from shared.progress import ProgressEstimator
from shared.progsnap import ProgSnap2Dataset, PS2, EventType
from shared.database import CSVDataProvider, SQLiteDataProvider
from shared.data import SQLiteLogger

In [None]:

from configs import config_PCRS, config_iSnap, config_CWO

# Pre-declare the names the selected config is expected to supply, so static
# analysers (e.g. Pylance) see them; the real values are injected below.
submit_columns = None
test_problem_id = None
problem_id_column = None
code_column = None
data_folder = None
database = None

# Choose the config you want to use (config_PCRS, config_iSnap or config_CWO).
# globals() is used instead of locals(): at notebook/module top level they are
# the same dict, but writing through locals() is not something to rely on.
globals().update(config_PCRS)

# Set the problem_id to something specific, or use the default test problem
problem_id = test_problem_id



In [None]:
# Load the CSV-backed dataset and show the distinct problem IDs it contains,
# in case you want to switch problem_id to a different one
csv_provider = CSVDataProvider(data_folder)
dataset = ProgSnap2Dataset(csv_provider)
main_table = dataset.get_main_table()
main_table[problem_id_column].unique()

### Build the Models and Save to the DB

Load a ProgSnap2 dataset for one problem, build the progress and classifier models, and save them to the server's SQLite database for use later.

In [None]:
from shared.preprocess import SimpleAIFBuilder

# Column names come from the selected config
column_settings = {
    "code_column": code_column,
    "problem_id_column": problem_id_column,
}

# Create a builder for the selected problem and build it from the dataset
builder = SimpleAIFBuilder(problem_id, **column_settings)
builder.build(dataset)

In [None]:
from shared.data import SQLiteLogger

# Open a logger on the configured `database` and create its tables
logger = SQLiteLogger(database)
logger.create_tables()


In [None]:
# Retrieve the two trained models from the builder
progress_model = builder.get_trained_progress_model()
classifier = builder.get_trained_classifier()

# Number of distinct training entries selected by the y_train mask
correct_train_rows = builder.X_train[builder.y_train]
correct_count = int(correct_train_rows.unique().size)

# Store the models and the count for this problem via the logger
logger.set_models(problem_id, progress_model, classifier, correct_count)

### Further explore the model's outputs

In [None]:
# The builder's mean score for this problem (presumably the average
# correctness across attempts — TODO confirm against SimpleAIFBuilder)
builder.mean_scores[problem_id]

In [None]:
# Show the first 50 n-gram feature names for this problem
feature_names = builder.get_feature_names()
feature_names[:50]

In [None]:
# How the classifier performs on its own training data:
# print the text report, then display the confusion matrix
training_report, training_cm = builder.get_training_report()
print(training_report)
training_cm

In [None]:
# How the classifier performs under cross-validation:
# print the text report, then display the confusion matrix
cv_report, cv_cm = builder.get_cv_report()
print(cv_report)
cv_cm

In [None]:
# Print the first correct submission for this problem as an example
correct_examples = builder.get_correct_submissions()
print(correct_examples[0])

In [None]:
# For each feature, compute the fraction of correct submissions in which it
# appears at least once, then plot the distribution of those fractions
correct_vectors = builder.get_vectorized_submissions()[builder.y_train]
feature_present = correct_vectors.toarray() > 0
sns.kdeplot(feature_present.mean(axis=0))

In [None]:
# Print the starter code for this problem
starter_source = builder.get_starter_code()
print(starter_source)

In [None]:
# Histogram of the progress model's output over every training submission,
# regardless of correctness
progress_scores = builder.get_trained_progress_model().predict_proba(builder.X_train)
sns.histplot(progress_scores)

### Quickly add the Dataset to the SQLite Database
This prepopulates the server's SQLite database with all student data for the current problem, in case we want to update the models dynamically as the server is running.

In [None]:
# Start from empty tables before loading the dataset into the database.
# NOTE(review): "Models" is cleared as well, which would discard anything
# stored by an earlier logger.set_models(...) call — confirm this is intended.
for table_name in ("MainTable", "CodeStates", "Models"):
    logger.clear_table(table_name)

In [None]:
# Keep only the submissions that belong to the selected problem
all_submissions = SimpleAIFBuilder.get_submissions_table(dataset)
is_current_problem = all_submissions[problem_id_column] == problem_id
problem_submissions = all_submissions[is_current_problem]

# Join each submission with its code state, then expose the code under the
# "CodeState" column name
code_table = dataset.get_code_states_table()
merged_submissions = pd.merge(problem_submissions, code_table, on=PS2.CodeStateID)
submissions_table = merged_submissions.rename(columns={code_column: "CodeState"})

We can add the data to the SQLite database directly using the logger.

In [None]:
# Event types that are written to the database; every other row is skipped
logged_event_types = ("Submit", "FileEdit", "Run.Program")
# Columns removed from each row before it is handed to the logger
dropped_columns = ("CodeStateID", "Order")

for _, row in submissions_table.iterrows():
    event_type = row[PS2.EventType]
    # Filter first so skipped rows do no dict work at all
    if event_type not in logged_event_types:
        continue
    row_dict = row.to_dict()
    for column in dropped_columns:
        # pop() with a default tolerates tables that lack the column,
        # where `del` would raise KeyError
        row_dict.pop(column, None)
    logger.log_event(event_type, row_dict)

### Test populating a Dataset using the SimpleAIF Server

We can also add the data to the SQLite database using the server endpoint (if it's running) to test that functionality. This code is redundant with the above.

In [None]:
# Maximum number of events to post; lower it (e.g. to 1) for a quick smoke test
limit = 100000

server_url = "http://127.0.0.1:5000"
# Seconds to wait for the server before giving up on a request, so a stalled
# server cannot hang the notebook indefinitely
request_timeout_s = 60
# Event types the server is sent; every other row is printed and skipped
posted_event_types = ("Submit", "FileEdit", "Run.Program")
# Columns that are not sent to the server
dropped_columns = ("CodeStateID", "Order", "ParentEventID")

for _, row in submissions_table.iterrows():
    event_type = row[PS2.EventType]
    if event_type not in posted_event_types:
        print(event_type)
        continue

    # Build the JSON payload: drop the unwanted columns (tolerating ones that
    # are absent) and remove NaN/None values
    payload = {
        key: value
        for key, value in row.to_dict().items()
        if key not in dropped_columns and not pd.isnull(value)
    }
    payload["ShouldLog"] = True

    response = requests.post(
        f"{server_url}/{event_type}/",
        json=payload,
        timeout=request_timeout_s,
    )
    # Surface server-side failures instead of silently ignoring them
    if not response.ok:
        print(f"POST {event_type} failed with status {response.status_code}")

    # Only rows that were actually posted count towards the limit
    limit -= 1
    if limit == 0:
        break

We can also test uploading starter code via the server (if it's running).

In [None]:
# Maximum number of rows from the "Problem" link table to post
limit = 10000

# The endpoint is the same for every row, so build it once
# (a plain string: there is nothing to interpolate)
starter_code_url = "http://127.0.0.1:5000/X-SetStarterCode/"
# Seconds to wait for the server before giving up on a request
request_timeout_s = 60

starter_code = dataset.load_link_table("Problem")
for _, row in starter_code.iterrows():
    response = requests.post(
        starter_code_url,
        json=row.to_dict(),
        timeout=request_timeout_s,
    )
    # Surface server-side failures instead of silently ignoring them
    if not response.ok:
        print(f"POST X-SetStarterCode failed with status {response.status_code}")

    limit -= 1
    if limit == 0:
        break

### Test building the models directly from the SQLite database
If the server updates dynamically, it will log new submissions to the SQLite database and regularly rebuild the models. This code tests that functionality.

In [None]:
# Read the dataset from the SQLite database rather than from the CSV files
sqlite_provider = SQLiteDataProvider(database)
dataset_sql = ProgSnap2Dataset(sqlite_provider)

# Build a second builder from the SQLite-backed dataset; the problem ID is
# passed as a string here
sql_column_settings = {
    "code_column": code_column,
    "problem_id_column": problem_id_column,
}
builder_sql = SimpleAIFBuilder(str(problem_id), **sql_column_settings)
builder_sql.build(dataset_sql)

In [None]:
# Cross-validation performance of the classifier built from the SQLite data:
# print the text report, then display the confusion matrix
sql_cv_report, sql_cv_cm = builder_sql.get_cv_report()
print(sql_cv_report)
sql_cv_cm

In [None]:
# Display the submissions table derived from the SQLite-backed dataset
SimpleAIFBuilder.get_submissions_table(dataset_sql)

In [None]:
# Ad-hoc inspection: compare the CodeStateID column of the SQLite-backed
# code-states table against the value 2
dataset_sql.get_code_states_table()["CodeStateID"] == 2

In [None]:
# Display the mean scores held by the builder that was built from SQLite
builder_sql.mean_scores

In [None]:
# Display the starter code reported by the CSV-backed builder.
# NOTE(review): this uses `builder`, not `builder_sql` — confirm which one
# is meant in this SQLite-testing section.
builder.get_starter_code()