In [9]:
# For imports
import pandas as pd
from collections import defaultdict
import statistics
import json

In [10]:
def get_fractional_values_in_range(end):
    '''
    Builds the table of fractions j/i for every denominator i in [1, end].

    Row i-1 of the returned list holds [1/i, 2/i, ..., i/i], so the table has
    `end` rows and row i-1 has exactly i entries. An `end` below 1 yields an
    empty table.
    '''
    return [
        [numerator / denominator for numerator in range(1, denominator + 1)]
        for denominator in range(1, end + 1)
    ]

def check_if_present(all_store_values, value, q=None):
    '''
    Searches for the integers p, q such that p/q = value.

    all_store_values: table in which row q-1 holds [1/q, 2/q, ..., q/q]
    value: the score to express as a fraction
    q: optional fixed denominator (1-based)

    Returns the zero-based pair (q-1, p-1).

    * q is None: the table is scanned row by row and the first entry within
      1e-6 of `value` is returned, i.e. the match with the smallest q.
      Returns (-1, -1) when nothing matches (this includes value == 0, since
      the table starts at 1/q).
    * q is given: `value` is snapped to the closest p/q with p in [0, q].
      p = 0 is reported as (q-1, -1), so that `p-1 + 1` gives 0 in the caller
      and a score of 0 is no longer turned into 1/q. Ties go to the smaller
      p. Returns (-1, -1) when every candidate is 1 or more away from `value`.

    Raises IndexError when q is given but is not in [1, len(all_store_values)].
    '''
    if q is None:
        # Exact lookup (up to a small tolerance) over the whole table.
        for i, store_values in enumerate(all_store_values):
            for j, store_value in enumerate(store_values):
                if abs(store_value - value) < 0.000001:
                    return i, j
        return -1, -1

    # A non-positive q would silently wrap around to another row through
    # negative indexing, so reject it explicitly.
    if not 1 <= q <= len(all_store_values):
        raise IndexError('q must be in [1, {:d}], got {!r}'.format(len(all_store_values), q))

    best_pair = -1, -1
    least_diff = 1
    # The table has no 0/q entry, so consider p = 0 explicitly first.
    if abs(value) < least_diff:
        best_pair = q-1, -1
        least_diff = abs(value)
    # Strict comparison: on a tie the earlier (smaller) p is kept.
    for j, store_value in enumerate(all_store_values[q-1]):
        diff = abs(store_value - value)
        if diff < least_diff:
            best_pair = q-1, j
            least_diff = diff
    return best_pair

In [11]:
# Lookup table of every fraction p/q with q up to 30, reused by the cells below
all_store_values = get_fractional_values_in_range(end=30)
# Example: recover p and q from a rounded score (indices returned are zero-based)
i, j = check_if_present(all_store_values, value=0.384615)
print(j + 1, i + 1)

5 13


# Sanity Check with the Main Table Data

In [12]:
# Load the main table from the data release and preview its first rows
main_table_path = '../../S19_All_Release_2_10_22/Data/MainTable.csv'
main_df = pd.read_csv(main_table_path)
main_df.head()

Unnamed: 0,Order,SubjectID,ToolInstances,ServerTimestamp,ServerTimezone,CourseID,CourseSectionID,AssignmentID,ProblemID,CodeStateID,IsEventOrderingConsistent,EventType,Score,Compile.Result,CompileMessageType,CompileMessageData,EventID,ParentEventID,SourceLocation
0,119630,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:03,UTC,CS 1,1,439.0,1,efdf3fae54cdc0a09fb81fcfa365c5f843e837b0,True,Run.Program,1.0,,,,1-69176,,
1,119631,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:03,UTC,CS 1,1,439.0,1,efdf3fae54cdc0a09fb81fcfa365c5f843e837b0,True,Compile,,Success,,,1-69177,1-69176,
2,134330,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:33,0,CS 1,1,439.0,3,79b2bcc8f502e1f7d2d4e83d1894964684c89b85,True,Run.Program,1.0,,,,3-68089,,
3,134331,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:33,0,CS 1,1,439.0,3,79b2bcc8f502e1f7d2d4e83d1894964684c89b85,True,Compile,,Success,,,3-68090,3-68089,
4,65666,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:09:54,UTC,CS 1,1,439.0,5,2697d299e7c5992f557ce38d8a04345112a11af8,True,Run.Program,0.75,,,,5-35681,,


In [13]:
# Keep only the partial scores: strictly below 1 and different from 0.
# Rows with a missing Score fail the `< 1` comparison and are dropped as well.
score_column = main_df['Score']
main_df = main_df[(score_column < 1) & (score_column != 0)]
main_df.head()

Unnamed: 0,Order,SubjectID,ToolInstances,ServerTimestamp,ServerTimezone,CourseID,CourseSectionID,AssignmentID,ProblemID,CodeStateID,IsEventOrderingConsistent,EventType,Score,Compile.Result,CompileMessageType,CompileMessageData,EventID,ParentEventID,SourceLocation
4,65666,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:09:54,UTC,CS 1,1,439.0,5,2697d299e7c5992f557ce38d8a04345112a11af8,True,Run.Program,0.75,,,,5-35681,,
37,41623,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T04:41:16,UTC,CS 1,1,439.0,232,74ced382f91992f142909208af8fea04c73b13e6,True,Run.Program,0.857143,,,,232-51530,,
39,41625,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T04:42:19,UTC,CS 1,1,439.0,232,043137af52f2eb9ff6dcd90f2ab5d8003d16a987,True,Run.Program,0.5,,,,232-51532,,
47,41633,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T04:44:19,UTC,CS 1,1,439.0,232,be36bb42dd7cb33a283a23a880f099c381134bc5,True,Run.Program,0.357143,,,,232-51540,,
49,41635,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T04:44:51,UTC,CS 1,1,439.0,232,70eb6c5805a4b42e94e41c79318a972f00935e95,True,Run.Program,0.357143,,,,232-51542,,


In [14]:
# Sanity check: every distinct partial score should be expressible as p/q
# with q within the range covered by all_store_values.
potential_scores = main_df['Score'].unique()
ctr = 0
for score in potential_scores:
    i, j = check_if_present(all_store_values, score)
    if i == -1:
        # no fraction in the table is close enough to this score
        print(score)
        ctr += 1
# The template has to be formatted explicitly; passing the counts as extra
# print() arguments would show the raw '{:d}/{:d}' placeholders.
print('Not matched: {:d}/{:d}'.format(ctr, len(potential_scores)))

Not matched: {:d}/{:d} 0 203


# Analyzing Group-wise Statistics

In [15]:
def get_best_q(unique_q):
    '''
    Returns the largest q value having the most number of divisors within unique_q.

    unique_q: non-empty list of distinct positive integers. It is sorted in
        place (ascending), so the caller's list is sorted after the call.

    Only the smaller entries of unique_q are counted as divisors of a
    candidate. When several candidates share the highest count, the largest
    one is returned. Every candidate is examined: stopping at the first one
    that does not improve the count would let a couple of large entries
    without divisors hide a smaller entry with more of them.

    Raises IndexError if unique_q is empty.
    '''
    # sort unique_q in ascending order
    unique_q.sort()
    max_divisors = 0
    best_q = unique_q[-1]
    # Walk from the largest candidate down; the strict comparison below keeps
    # the largest candidate among those with the same number of divisors.
    for j in range(len(unique_q) - 1, -1, -1):
        candidate = unique_q[j]
        num_divisors = sum(1 for smaller in unique_q[:j] if candidate % smaller == 0)
        if num_divisors > max_divisors:
            max_divisors = num_divisors
            best_q = candidate
    return best_q

In [16]:
# reload main_df
main_df = pd.read_csv('../../S19_All_Release_2_10_22/Data/MainTable.csv')
# keep only the rows that carry a score
main_df = main_df[main_df['Score'].notna()]

# gather the scores observed for every (AssignmentID, ProblemID) group
group_id_score = defaultdict(list)
for _, row in main_df.iterrows():
    group_key = (int(row['AssignmentID']), row['ProblemID'])
    group_id_score[group_key].append(row['Score'])

# keep each distinct score only once per group
for group_key in group_id_score:
    group_id_score[group_key] = set(group_id_score[group_key])

# estimate the number of test cases for every group
group_wise_test_cases = defaultdict(dict)
for group_key, group_scores in group_id_score.items():
    all_q, all_p = [], []
    for group_score in group_scores:
        # check_if_present returns zero-based (q-1, p-1), or (-1, -1) when
        # the score matches no fraction in the table
        q_index, p_index = check_if_present(all_store_values, group_score)
        if q_index != -1:
            all_q.append(q_index + 1)  # total number of test cases
            all_p.append(p_index + 1)  # number of test cases passed
    unique_q = list(set(all_q))
    unique_p = list(set(all_p))
    # get_best_q also sorts unique_q in place before it is stored below
    best_q = get_best_q(unique_q)
    group_stats = group_wise_test_cases[str(group_key)]
    group_stats['Best q'] = best_q
    group_stats['Unique q'] = unique_q
    group_stats['Unique p'] = unique_p

In [17]:
# Persist the per-group estimates, then summarise the chosen q values
with open('q_statistics.json', 'w') as f:
    json.dump(group_wise_test_cases, f, indent=6)

# 'Best q' of every group, collected once and reused for all summaries
best_q_values = [v['Best q'] for v in group_wise_test_cases.values()]

print('Statistics on the number of test cases')
print('Length: ', len(group_wise_test_cases))
print('Maximum: ', max(best_q_values))
print('Minimum: ', min(best_q_values))
print('Mean: ', statistics.mean(best_q_values))

Statistics on the number of test cases
Length:  50
Maximum:  28
Minimum:  6
Mean:  15.78


In [18]:
# Sanity check on the chosen q: a group is flagged when at least one of its
# observed q values does not divide the selected 'Best q'
problem_groups = []
for group, v in group_wise_test_cases.items():
    unique_q, best_q = v['Unique q'], v['Best q']
    if any(best_q % candidate != 0 for candidate in unique_q):
        problem_groups.append((group, unique_q, best_q))
print('Problem groups: ', problem_groups)
print('Number of problem groups: ', len(problem_groups))

Problem groups:  [('(439, 12)', [1, 2, 4, 5, 10, 11, 20], 20), ('(439, 232)', [1, 2, 5, 7, 14], 14), ('(487, 24)', [1, 2, 3, 11, 22], 22), ('(492, 34)', [1, 2, 7, 13, 14], 14), ('(492, 38)', [1, 3, 5, 7, 11, 12, 15], 15), ('(492, 39)', [1, 6, 9, 18, 19], 18), ('(492, 128)', [1, 2, 3, 6, 8, 9, 11, 13, 16, 18], 18), ('(494, 43)', [1, 2, 4, 7, 8], 8), ('(494, 49)', [1, 12, 17], 17), ('(494, 107)', [1, 10, 11], 11), ('(494, 108)', [1, 9, 19], 19)]
Number of problem groups:  11


# Collect Working Dataset

* Consider all data except syntax errors 
* Replace each score with the reconstructed ratio p/q
* Add new columns `p` and `q` holding the numerator and denominator of that ratio

In [19]:
# reload main_df, dropping every row whose Score is missing
main_df = pd.read_csv('../../S19_All_Release_2_10_22/Data/MainTable.csv').dropna(subset=['Score'])
print('Length of main_df: ', len(main_df))
main_df.head()

Length of main_df:  69627


Unnamed: 0,Order,SubjectID,ToolInstances,ServerTimestamp,ServerTimezone,CourseID,CourseSectionID,AssignmentID,ProblemID,CodeStateID,IsEventOrderingConsistent,EventType,Score,Compile.Result,CompileMessageType,CompileMessageData,EventID,ParentEventID,SourceLocation
0,119630,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:03,UTC,CS 1,1,439.0,1,efdf3fae54cdc0a09fb81fcfa365c5f843e837b0,True,Run.Program,1.0,,,,1-69176,,
2,134330,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:13:33,0,CS 1,1,439.0,3,79b2bcc8f502e1f7d2d4e83d1894964684c89b85,True,Run.Program,1.0,,,,3-68089,,
4,65666,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:09:54,UTC,CS 1,1,439.0,5,2697d299e7c5992f557ce38d8a04345112a11af8,True,Run.Program,0.75,,,,5-35681,,
6,65668,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:10:31,UTC,CS 1,1,439.0,5,c86e57300992f1b39c415a0917a34740fee933e0,True,Run.Program,1.0,,,,5-35683,,
8,37219,00c54f9462673d4c09d2a88121860841,Java 8; CodeWorkout,2019-02-24T05:14:37,0,CS 1,1,439.0,12,6e21cdd2b4a63610bd2a3e882ee772968ef82b3c,True,Run.Program,1.0,,,,12-35708,,


In [20]:
# Get p, q for each data point
all_p, all_q, all_scores = [], [], []
# store meta-statistics, keyed by the "(AssignmentID, ProblemID)" group string
unique_p_values = defaultdict(list)
q_values = dict()
num_examples = defaultdict(int)
for _, row in main_df.iterrows():
    group = str((int(row['AssignmentID']), row['ProblemID']))
    best_q = group_wise_test_cases[group]['Best q']
    # with a fixed q the score is snapped to the closest fraction of that q
    raw_q, raw_p = check_if_present(all_store_values, row['Score'], best_q)
    # sanity check: the lookup must have stayed on the requested q
    assert raw_q+1 == best_q
    # check_if_present works with zero-based indices
    p_value, q_value = raw_p + 1, raw_q + 1
    all_p.append(p_value)
    all_q.append(q_value)
    # Updated score
    all_scores.append(p_value / q_value)
    # store meta-statistics
    unique_p_values[group].append(p_value)
    q_values[group] = q_value
    num_examples[group] += 1

# store only the unique p values of each group, in ascending order
for group, p_values in unique_p_values.items():
    unique_p_values[group] = sorted(set(p_values))

# Save Main_df
# assign() builds a new frame instead of writing columns into the filtered
# frame, which avoids pandas' SettingWithCopyWarning. 'Score' is replaced in
# place and 'p' / 'q' are appended as the last columns.
main_df = main_df.assign(p=all_p, q=all_q, Score=all_scores)
# Remove columns - ParentEventID and SourceLocation
main_df = main_df.drop(columns=['ParentEventID', 'SourceLocation'])
# Save to csv
main_df.to_csv('working_data.csv', index=False)

# Save meta-statistics into csv file using pandas.
# Every column is looked up by group key, so the rows stay aligned without
# relying on the three dictionaries sharing the same insertion order.
groups = list(unique_p_values.keys())
meta_statistics = pd.DataFrame({
    'Group': groups,
    'Unique p': [unique_p_values[g] for g in groups],
    'q': [q_values[g] for g in groups],
    'Number of examples': [num_examples[g] for g in groups],
})
meta_statistics.to_csv('meta_statistics.csv', index=False)


In [21]:
# Implement Choose function
def choose(n, k):
    '''
    Returns the binomial coefficient n choose k.

    n, k: integers.

    Returns 1 when k == 0, and 0 when k is negative or larger than n.
    Otherwise the coefficient is computed with the multiplicative formula,
    which needs at most min(k, n-k) steps; the plain Pascal recursion takes
    exponential time and never terminates for a negative k.
    '''
    if k == 0:
        return 1
    if k < 0 or k > n:
        return 0
    # C(n, k) == C(n, n-k): use the smaller of the two to shorten the loop
    k = min(k, n - k)
    result = 1
    for i in range(1, k + 1):
        # after this step result == C(n-k+i, i), so the division is exact
        result = result * (n - k + i) // i
    return result

# Sanity check on choose function: evaluate C(3q, q) for q = 10
q = 10
choose(n=3 * q, k=q)

30045015