In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Enable the autoreload extension; mode 2 reloads modules before each cell
# runs, so edits to the project's .py files are picked up without a restart.
%reload_ext autoreload
%autoreload 2

import sys
 
# Add the sibling ../shared directory to the import path. The project modules
# imported below are presumably located there — confirm against the repo layout.
# NOTE(review): the path is relative to the kernel's working directory.
sys.path.append('../shared')

from progress import ProgressEstimator
from progsnap import ProgSnap2Dataset
from progsnap import PS2
from progsnap import EventType
from database import CSVDataProvider

In [None]:
# --- Notebook configuration -------------------------------------------------

# Problem to build a model for, and the dataset columns that identify the
# problem and hold the submitted code.
problem_id = 8
problem_id_column = PS2.ProblemID
code_column = PS2.Code

# Event types of interest (not referenced by the other cells visible here).
submit_columns = [
    EventType.Submit,
    EventType.RunProgram,
    'Project.Submit',
]

# Where the input dataset is read from and where pickled artifacts are written.
data_folder = "data/pcrs-f18/"
data_out_folder = "../server/data/PCRS/"

In [None]:
from preprocess import SimpleAIFBuilder

# Load the dataset from the CSV files under data_folder.
provider = CSVDataProvider(data_folder)
dataset = ProgSnap2Dataset(provider)

# Configure a builder for the selected problem and run it on the dataset.
builder = SimpleAIFBuilder(
    problem_id,
    code_column=code_column,
    problem_id_column=problem_id_column,
)
builder.build(dataset)

In [None]:
# Persist the trained classifier for this problem into the output folder.
# The path is assembled with os.path.join rather than string concatenation so
# it is correct whether or not data_out_folder ends with a path separator.
model_path = os.path.join(data_out_folder, f'model-{problem_id}.pkl')

with open(model_path, 'wb') as f:
    pickle.dump(builder.get_trained_classifier(), f)

In [None]:
# Fetch the trained progress model and persist it next to the classifier.
# `estimator` is kept as a notebook-level name because later cells use it.
estimator = builder.get_trained_progress_model()

# os.path.join avoids the doubled separator the previous f-string produced
# ("<folder>//progress-N.pkl") when data_out_folder already ends with "/".
progress_path = os.path.join(data_out_folder, f'progress-{problem_id}.pkl')

with open(progress_path, 'wb') as f:
    pickle.dump(estimator, f)

In [None]:
# Display the index of builder.mean_scores (presumably the labels the mean
# scores are keyed by — confirm in SimpleAIFBuilder).
builder.mean_scores.index

In [None]:
# Peek at the first 50 feature names reported by the builder.
builder.get_feature_names()[:50]

In [None]:
# Training report: print the textual report and show the second returned
# value (cm — presumably a confusion matrix; confirm) as the cell output.
training_results = builder.get_training_report()
report, cm = training_results
print(report)
cm

In [None]:
# Cross-validation report: print the textual report and show the second
# returned value (cm — presumably a confusion matrix; confirm) as the output.
# Note this rebinds the notebook-level names `report` and `cm`.
cv_results = builder.get_cv_report()
report, cm = cv_results
print(report)
cm

In [None]:
# Print the first entry of the builder's correct submissions as a sample.
first_correct_submission = builder.get_correct_submissions()[0]
print(first_correct_submission)

In [None]:
# Among the submissions selected by builder.y_train (the correct ones), compute
# for each feature the fraction of submissions in which it appears at least
# once, then plot the density of those per-feature fractions.
correct_vectors = builder.get_vectorized_submissions()[builder.y_train]
feature_present = correct_vectors.toarray() > 0
presence_rate = feature_present.mean(axis=0)
sns.kdeplot(presence_rate)

In [None]:
# Display the starter code returned by the builder (presumably the template
# students begin from for this problem — confirm in SimpleAIFBuilder).
builder.get_starter_code()

In [None]:
# Histogram of the progress model's predictions over every training
# submission, whether or not the submission is correct.
progress_scores = estimator.predict_proba(builder.X_train)
sns.histplot(progress_scores)