In [None]:
import pandas as pd
import numpy as np

In [None]:
%reload_ext autoreload
%autoreload 2

import sys
 
# Add the sibling ../server directory to the module search path.
# NOTE(review): presumably this is where the `progress` and `progsnap`
# modules imported below live — confirm against the repository layout.
sys.path.append('../server')

import progress
from progsnap import ProgSnap2Dataset
from progsnap import PS2
from progsnap import EventType

In [None]:
# --- Run configuration -------------------------------------------------------

# Folder the dataset is loaded from, and folder the pickled models are written to.
data_folder = "data/cwo-f19/"
data_out_folder = "../server/data/CWO/"

# Column names used when filtering and joining the tables.
problem_id_column = PS2.ProblemID
code_column = PS2.Code

# The single problem the models in this notebook are trained for.
problem_id = 13

# Event types kept as "submissions" when filtering the main table.
# Despite the name, these are values of the event-type column, not column names.
submit_columns = [
    EventType.Submit,
    EventType.RunProgram,
    'Project.Submit',
]


In [None]:
# Open the dataset stored under `data_folder` and display its main table.
data = ProgSnap2Dataset(data_folder)
main_table = data.get_main_table()
main_table

In [None]:
# Code-states table; it is joined to the submissions on PS2.CodeStateID
# when the modelling frame is built.
code_states = data.get_code_states_table()
code_states

In [None]:
# Keep only the events whose type is one of the configured submission types.
# NOTE(review): the original author remarked that this filter should perhaps
# be based on Submit events, but judged the current list acceptable — confirm.
is_submission = main_table[PS2.EventType].isin(submit_columns)
submissions = main_table.loc[is_submission]

# Mean score per problem over those events.
submissions.groupby(problem_id_column)["Score"].mean()

In [None]:
# Restrict the submissions to the configured problem.
on_target_problem = submissions[problem_id_column].eq(problem_id)
assignment_submissions = submissions.loc[on_target_problem]

In [None]:
# Attach the source code to each submission (default inner join on the
# code-state id) and keep only the columns needed for modelling.
kept_columns = [problem_id_column, PS2.Score, code_column]
assignment_code = assignment_submissions.merge(code_states, on=PS2.CodeStateID)[kept_columns]

In [None]:
# Build the modelling frame without mutating `assignment_code`:
#   Code    - the source text, copied from `code_column`
#   Correct - True when the score is exactly 1
# Rows with no source text are dropped.
df = (
    assignment_code
    .assign(
        Code=lambda frame: frame[code_column],
        Correct=lambda frame: frame["Score"] == 1,
    )
    .loc[lambda frame: frame["Code"].notna()]
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# Features are the raw source text; the label is whether the submission was correct.
X = df.Code
y = df.Correct

# Hold out 30% of the submissions for evaluation; the seed keeps the split reproducible.
X_train_code, X_test_code, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Token pattern, tried left to right at each position:
#   [\w]+   a run of word characters
#   [^\s]   any single non-whitespace character
#   [ ]{4}  exactly four spaces (presumably one indentation level — confirm)
# It is a raw string so that \w and \s reach the regex engine verbatim; in a
# normal string literal they are invalid escape sequences and Python warns.
code_token_pattern = r"[\w]+|[^\s]|[ ]{4}"

# Alternative weighting kept from the original notebook:
# vectorizer = TfidfVectorizer(lowercase=False, token_pattern=code_token_pattern)
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=code_token_pattern,
    ngram_range=(1, 3),
)

# Learn the vocabulary from the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(X_train_code)
X_test = vectorizer.transform(X_test_code)

X_train.shape

In [None]:
# Peek at the first 50 entries of the vocabulary learned by the vectorizer.
feature_names = vectorizer.get_feature_names_out()
feature_names[:50]

In [None]:
# Fraction of all submissions labelled correct (class balance before resampling).
y.mean()

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly oversample the training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Fraction of correct labels after resampling.
y_resampled.mean()

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, cv

# Classifier used for this run: XGBoost with default settings, fitted on the
# oversampled training data.
xgb_model = XGBClassifier()
clf = xgb_model.fit(X_resampled, y_resampled)

# Other candidate models left by the author (each fitted on the same
# X_resampled / y_resampled):
#   AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
#                      n_estimators=20, random_state=0)
#   SVC()
#   Pipeline([
#       ('scale', StandardScaler(with_mean=False)),
#       ('logistic', LogisticRegressionCV(cv=5, random_state=1234, max_iter=1000))
#   ])


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Performance on the original training split (i.e. without the oversampled rows).
pred_train = clf.predict(X_train)
print(classification_report(y_true=y_train, y_pred=pred_train))

confusion_matrix(y_true=y_train, y_pred=pred_train)

In [None]:
# Performance on the held-out test split.
pred_test = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred_test))

confusion_matrix(y_true=y_test, y_pred=pred_test)

In [None]:
import os
# Create the output folder (and any missing parents). `exist_ok=True` makes
# this a no-op when the folder is already there and avoids the race between a
# separate existence check and the creation call.
os.makedirs(data_out_folder, exist_ok=True)

In [None]:
import pickle
from sklearn.pipeline import Pipeline

# Bundle the fitted vectorizer with the classifier so that the pickled object
# accepts raw source text directly.
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', clf)])

model_path = f'{data_out_folder}model-{problem_id}.pkl'
with open(model_path, 'wb') as out_file:
    pickle.dump(pipe, out_file)

In [None]:
# Show the source text of the first correct submission in the training split.
first_correct_code = X_train_code[y_train].iloc[0]
print(first_correct_code)

In [None]:
import seaborn as sns
# Among the correct training submissions, compute for every feature the
# proportion of submissions in which it appears at least once, and plot the
# distribution of those proportions.
feature_present = X_train[y_train].toarray() > 0
sns.kdeplot(feature_present.mean(axis=0))

# We want to see a bimodal distribution here, with most features either
# appearing << 0.5 or ~1.0. If not, we may need to adjust the parameter on the
# ProgressEstimator.

In [None]:
import progress


# Fit the progress estimator on the feature rows of correct training submissions only.
correct_train_features = X_train[y_train]
estimator = progress.ProgressEstimator().fit(correct_train_features)

# Plot the estimated progress of all test submissions, regardless of their
# correctness. We expect to see most near 1, and none near 0.
test_progress = estimator.predict_proba(X_test)
sns.histplot(test_progress)


In [None]:
# Bundle the fitted vectorizer with the progress estimator so that the pickled
# object accepts raw source text directly.
pipe = Pipeline([('vectorizer', vectorizer), ('classifier', estimator)])

# Join with os.path.join rather than a hand-written "/": `data_out_folder`
# already ends with a separator, so concatenating another one produced
# ".../CWO//progress-<id>.pkl". The join is correct with or without the
# trailing slash.
progress_path = os.path.join(data_out_folder, f'progress-{problem_id}.pkl')
with open(progress_path, 'wb') as f:
    pickle.dump(pipe, f)

In [None]:
# Code-state ids of every event (not only submissions) recorded for the problem.
on_problem = main_table[problem_id_column] == problem_id
unique_csids = main_table.loc[on_problem, PS2.CodeStateID].unique()

# Source text of those code states. Missing code is dropped, as it is for the
# training frame, because the vectorizer rejects NaN documents.
has_known_id = code_states[PS2.CodeStateID].isin(unique_csids)
all_code = code_states.loc[has_known_id, code_column].dropna()

# Spot-check one arbitrary code state (position 9).
print(all_code.iloc[9])

In [None]:
# Vectorize every code state in `all_code` with the already-fitted vectorizer.
# NOTE(review): an earlier variant of this cell decoded `all_edits.Contents`
# and applied `strip_comments` first; neither name is defined in the cells
# shown here, so that preprocessing is not applied.
X_edits = vectorizer.transform(all_code)

In [None]:
# Display the vectorized code states (summary repr of the transformed matrix).
X_edits

In [None]:
# Print the fitted estimator's min_score and max_score, one per line.
print(estimator.min_score, estimator.max_score, sep="\n")

In [None]:
# Distribution of estimated progress over every code state of the problem.
edits_progress = estimator.predict_proba(X_edits)
sns.histplot(edits_progress)