In [None]:
import pandas as pd
import sqlite3
import numpy as np

In [None]:
# assignmentID = 100
assignmentID = 139

# Select every "Feedback" intervention row of this assignment and join it to the
# CodeState file it points at, so that Contents carries the code that was submitted.
query = f"""
select SubjectID, AssignmentID, CodeStateID, InterventionType, InterventionMessage, Score, Contents from (
select SubjectID, AssignmentID, CodeStateID, InterventionType, InterventionMessage, Score, CodeStateSection from MainTable where AssignmentId = "{assignmentID}" and InterventionCategory = "Feedback"
) as main JOIN CodeState where main.CodeStateID = CodeState.ID and main.CodeStateSection = CodeState.Filename
"""

# Release the database handle as soon as the frame is loaded, even if the query fails.
con = sqlite3.connect("data/progsnap2_21_consenting_no_demographics.db")
try:
    df = pd.read_sql_query(query, con)
finally:
    con.close()

# A row is labelled correct when its InterventionType is exactly "complete|Complete".
df["Correct"] = df.InterventionType == "complete|Complete"

df

In [None]:
# Share of feedback rows labelled correct (mean of the boolean Correct column).
df.Correct.mean()

In [None]:
# A Score column exists, but it is usually filled in only for the final submitted code,
# i.e. after the student has already received lots of automated feedback from the system.
# Deriving Correct from the feedback type (earlier cell) is less PS2-y, but it gives us
# more midway attempts that are incorrect.
# Not necessarily an issue, since Snap has the same issue and works mostly fine.
df["Score"].astype(float, errors='ignore').describe()

In [None]:
import sys, token, tokenize, io

# Credit: https://gist.github.com/BroHui/aca2b8e6e6bdf3cb4af4b246c9837fa3
def strip_comments(source):
    """Replace comments and docstrings in Python source with fixed placeholders.

    The source is tokenized and re-assembled token by token, padding with spaces so
    that each token keeps its original column. While re-assembling:

    * a STRING token that directly follows an INDENT or NEWLINE token (or is the
      very first token) is treated as a docstring and emitted as ``#--``;
    * a COMMENT token is emitted as ``##`` followed by a newline;
    * every other token is emitted unchanged.

    Args:
        source: Python source text, or None.

    Returns:
        The rewritten source; ``""`` when ``source`` is None or only whitespace;
        the original ``source`` unchanged when tokenizing fails part-way.
    """
    if source is None or len(source.strip()) == 0:
        return ""

    # Starting as INDENT makes a leading string literal count as a docstring.
    prev_toktype = token.INDENT
    last_lineno = -1
    last_col = 0

    # Collect fragments and join once instead of growing a string in the loop.
    pieces = []

    tokgen = tokenize.generate_tokens(io.StringIO(source).readline)
    try:
        for toktype, ttext, (slineno, scol), (elineno, ecol), _ltext in tokgen:
            # A token starting on a later line resets the column cursor.
            if slineno > last_lineno:
                last_col = 0
            # Re-create the horizontal gap between the previous token and this one.
            if scol > last_col:
                pieces.append(" " * (scol - last_col))
            if toktype == token.STRING and prev_toktype in (token.INDENT, token.NEWLINE):
                # Docstring
                pieces.append("#--")
            elif toktype == tokenize.COMMENT:
                # Comment
                pieces.append("##\n")
            else:
                pieces.append(ttext)
            prev_toktype = toktype
            last_col = ecol
            last_lineno = elineno
    except Exception:
        # Parse failure ==> return original. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate.
        return source

    return "".join(pieces)
        
# Smoke test: decode the first loaded submission from UTF-8 bytes and show its stripped form.
print(strip_comments(df.Contents[0].decode('UTF-8')))

In [None]:
# Decode each submission from UTF-8 bytes, then blank out its comments/docstrings.
df["Code"] = df.Contents.str.decode('UTF-8').map(strip_comments)
print(df["Code"][1])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

X = df.Code
y = df.Correct

# 70/30 split with a fixed seed so the split is repeatable.
X_train_code, X_test_code, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# A token is a run of word characters, any single non-whitespace character, or a run
# of exactly four spaces. The pattern is a raw string so `\w` / `\s` reach the regex
# engine without going through (invalid) string-escape processing; the pattern text
# itself is unchanged.
#vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r"[\w]+|[^\s]|[ ]{4}")
vectorizer = CountVectorizer(lowercase=False, token_pattern=r"[\w]+|[^\s]|[ ]{4}", ngram_range=(1,3))

# Fit the vocabulary on the training split only, then encode both splits with it.
vectorizer.fit(X_train_code)
X_train = vectorizer.transform(X_train_code)
X_test = vectorizer.transform(X_test_code)

X_train.shape

In [None]:
# Vocabulary learned by the vectorizer; peek at the first 50 entries.
feature_names = vectorizer.get_feature_names_out()
feature_names[:50]

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly oversample the training split (fixed seed) before fitting a classifier.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)

# Mean of the resampled labels, i.e. the share of True rows after resampling.
np.mean(y_resampled)

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, cv

# Alternative models, left commented out:
# clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                        # n_estimators=20, random_state=0).fit(X_resampled, y_resampled)

# clf = SVC().fit(X_resampled, y_resampled)

# Active model: XGBoost with default settings, trained on the oversampled split.
clf = XGBClassifier()
clf = clf.fit(X_resampled, y_resampled)

# clf = Pipeline([
#     ('scale', StandardScaler(with_mean=False)),
#     ('logistic', LogisticRegressionCV(cv=5, random_state=1234, max_iter=1000))
# ]).fit(X_resampled, y_resampled)


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Training performance, scored on the original (not oversampled) training split.
pred_train = clf.predict(X_train)

print(classification_report(y_true=y_train, y_pred=pred_train))

confusion_matrix(y_true=y_train, y_pred=pred_train)

In [None]:
# Same report as for the training split, now on the held-out 30%.
pred_test = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=pred_test))

confusion_matrix(y_true=y_test, y_pred=pred_test)

In [None]:
import pickle
from sklearn.pipeline import Pipeline

# Bundle the fitted vectorizer with the classifier so the pickle accepts raw code strings.
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', clf)])

with open(f'../server/data/BlockPy/model-{assignmentID}.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [None]:
# First training submission that is labelled correct.
print(X_train_code[y_train].iloc[0])

In [None]:
def is_odd(a_number: int) -> bool:
    """Return True when ``a_number`` leaves a remainder of 1 on division by 2."""
    remainder = a_number % 2
    return remainder == 1

def maximum_odd(odd: list) -> int:
    """Return the largest odd value in ``odd``.

    The previous version seeded its running maximum with ``odd[0]`` without checking
    that it was odd, so an even first element larger than every odd value was
    returned (e.g. ``[90, 1, 3]`` gave 90). Only odd values are candidates now, and
    the builtin ``max`` is no longer shadowed.

    Args:
        odd: list of integers to search.

    Returns:
        The largest element for which ``is_odd`` is true. If no element is odd the
        first element is returned, as before.
        NOTE(review): the intended result for a list without odd values is not
        specified here — confirm.

    Raises:
        IndexError: if ``odd`` is empty (unchanged from the original).
    """
    largest = None
    for num in odd:
        if is_odd(num) and (largest is None or num > largest):
            largest = num
    if largest is None:
        # No odd value seen: keep the historical fallback to the first element.
        return odd[0]
    return largest

# Spot-check on a mixed list; the largest odd value in it is 9.
maximum_odd([1, 2, 7, 8, 9, 3, 4, 5, 6])



In [None]:
# Sample submission as raw text. It already contains the `#--` marker that
# strip_comments emits for docstrings, so it is passed to the pipeline as-is.
code = """def is_odd(a_number: int) -> bool:
    #--
    return a_number % 2 == 1
assert_equal(is_odd(31), True)
assert_equal(is_odd(22), False)
assert_equal(is_odd(4312), False)

def maximum_odd(numbers: [int]) -> int: 
    max_odd = 0
    for number in numbers: 
        if number > max_odd and is_odd(number): 
            max_odd = number
    return max_odd
assert_equal(maximum_odd([2, 3, 1, 43, 1, 0]), 43)
assert_equal(maximum_odd([3, 3, 1, 46, 90, 0]), 3)
assert_equal(maximum_odd([2, 18, 90, 2, 40, 67]), 67)
"""

# Class probabilities for the sample from whichever pipeline `pipe` currently names.
pipe.predict_proba([code])

In [None]:
# import seaborn as sns

# probs = np.mean(np.array([est.predict_proba(X_test)[:,1] for est in clf.estimators_]), axis = 0)
# sns.histplot(probs)

In [None]:
def unweighted_prediction(clf, X):
    """Min-max scaled average of the ensemble members' predictions, per row.

    Each member of ``clf.estimators_`` predicts on ``X``; the predictions are
    averaged with equal weight and rescaled so the smallest average maps to 0 and
    the largest to 1.

    Args:
        clf: fitted ensemble exposing ``estimators_``, each with ``predict``.
        X: feature matrix accepted by every member's ``predict``.

    Returns:
        numpy array with one value in [0, 1] per row of ``X``. When every row has
        the same average (zero range) an array of zeros is returned instead of
        dividing by zero.
    """
    votes = np.array([est.predict(X) for est in clf.estimators_])
    probs = np.mean(votes, axis=0)
    pmin = np.min(probs)
    span = np.max(probs) - pmin
    if span == 0:
        # Degenerate range: the original produced NaN (0/0) here.
        return np.zeros_like(probs, dtype=float)
    return (probs - pmin) / span

# sns.histplot(unweighted_prediction(clf, X_test))

In [None]:
# `sns` is used from this cell onward, but its only import in the notebook is
# commented out, so a fresh kernel raised NameError here.
import seaborn as sns

# For each n-gram feature: the fraction of correct training submissions that contain
# it at least once; plot the density of those fractions.
sns.kdeplot((X_train[y_train].toarray() > 0).mean(axis=0))

In [None]:
# Re-import edited modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

import sys
 
# Make ../server importable so the local `progress` module can be loaded.
sys.path.append('../server')

import progress

# Fit the progress estimator on the feature rows of correct training submissions only.
estimator = progress.ProgressEstimator().fit(X_train[y_train])

# Distribution of the estimator's output on the held-out split.
sns.histplot(estimator.predict_proba(X_test))


In [None]:
# Same bundling as for the correctness model, but with the progress estimator as the
# final step. Note that this rebinds `pipe`.
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', estimator)])

with open(f'../server/data/BlockPy/progress-{assignmentID}.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [None]:
# Every File.Edit event of this assignment, joined to the CodeState file it points at.
query = f"""
select SubjectID, AssignmentID, CodeStateID, Contents from (
select SubjectID, AssignmentID, CodeStateID, CodeStateSection from MainTable where AssignmentId = "{assignmentID}" and EventType="File.Edit"
) as main JOIN CodeState where main.CodeStateID = CodeState.ID and main.CodeStateSection = CodeState.Filename
"""

# Release the database handle as soon as the frame is loaded, even if the query fails.
con = sqlite3.connect("data/progsnap2_21_consenting_no_demographics.db")
try:
    all_edits = pd.read_sql_query(query, con)
finally:
    con.close()

all_edits

In [None]:
# Apply the same preprocessing as for the training data, then encode with the
# already-fitted vocabulary (no refit).
X_edits_code = all_edits.Contents.str.decode('UTF-8').map(strip_comments)
X_edits = vectorizer.transform(X_edits_code)

In [None]:
# Display the encoded edit snapshots produced by vectorizer.transform above.
X_edits

In [None]:
# Score bounds stored on the fitted progress estimator, one per line.
print(estimator.min_score, estimator.max_score, sep="\n")

In [None]:
# Distribution of the progress estimator's output over every File.Edit snapshot.
sns.histplot(estimator.predict_proba(X_edits))

In [None]:
# NOTE(review): `progress_score` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it was superseded by
# `estimator.predict_proba` — confirm before swapping it in.
sample = pd.DataFrame(dict(
    Code=X_test_code,
    Progress_score=progress_score(X_test),
    Correctness_score=clf.predict_proba(X_test)[:, 1],
    Correct=y_test,
))
sample.to_csv(f'data/out/p{assignmentID}.csv', index=False)
sample

In [None]:
# Same export as for the test split, but over all edit snapshots (no Correct label).
# NOTE(review): `progress_score` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it was superseded by
# `estimator.predict_proba` — confirm before swapping it in.
sample = pd.DataFrame(dict(
    Code=X_edits_code,
    Progress_score=progress_score(X_edits),
    Correctness_score=clf.predict_proba(X_edits)[:, 1],
))
sample.to_csv(f'data/out/p{assignmentID}-edits.csv', index=False)
sample

In [None]:
from sklearn.tree import plot_tree

def print_rule(clf, index):
    """Plot the tree of the ensemble member at position ``index``.

    Args:
        clf: fitted ensemble exposing ``estimators_``.
        index: position of the member to draw.

    An earlier text-based variant (printing the root feature name and threshold of
    each member) was commented out here in favour of drawing the tree.
    """
    plot_tree(clf.estimators_[index])
    
# Draw ensemble member 8.
# NOTE(review): `clf` is assigned an XGBClassifier in this notebook; confirm it exposes
# `estimators_` — this looks like a leftover from the commented-out AdaBoost experiment.
print_rule(clf, 8)

In [None]:
# Name of the feature used at node 0 of each ensemble member's tree.
# NOTE(review): relies on `clf.estimators_` holding trees with a `tree_` attribute;
# confirm this holds for the classifier currently assigned to `clf`.
[feature_names[est.tree_.feature[0]] for est in clf.estimators_]