In [1]:
import pandas as pd # type: ignore

def load_CSEDM_data(filename: str) -> pd.DataFrame:
    """
    Load the CSEDM dataset from a pickle file.

    Args:
        filename (str): The path to the pickle file.

    Returns:
        pd.DataFrame: The loaded CSEDM dataset.
    """
    CSEDMfile = pd.read_pickle(filename)
    return CSEDMfile

filename = '../../CSEDM_dataset/largedataset.pkl'
CSEDMfile = load_CSEDM_data(filename)


In [2]:
CSEDMfile.head()

Unnamed: 0,SubjectID,AssignmentID,ProblemID,CodeStateID,Score_x,Code,Code-ast,code-astnn,code-embedding,Score_y,embedding,astnn,prompt,prompt-embedding,input
0,04c32d4d95425f73b3a1d6502aed4d48,439.0,1,4531059d41ba170482b4e43d4d94d857c0e45dbb,1.0,"public int sortaSum(int a, int b)\r\n{\r\n ...","MethodDeclaration(annotations=[], body=[IfStat...",MethodDeclaration Modifier public BasicType in...,"[[28, [26, [27]], [3, [4]], [248], [13, [3, [4...",2,"[0.53723663, 0.85902363, 0.7639073, 0.29109484...","[[MethodDeclaration, [Modifier, ['public']], [...",Write a function in Java that implements the f...,"[0.007758622, -0.02058616, 0.051114812, 0.0035...","[tensor(0.0078), tensor(-0.0206), tensor(0.051..."
1,04c32d4d95425f73b3a1d6502aed4d48,439.0,3,69089e4182ecddd4b48c39c86c8ae2edb337b07c,0.8125,"public boolean in1To10(int n, boolean outsideM...","MethodDeclaration(annotations=[], body=[IfStat...",MethodDeclaration Modifier public BasicType bo...,"[[28, [26, [27]], [3, [41]], [211], [13, [3, [...",1,"[0.4605768, 0.6272869, 0.73869926, 0.20308065,...","[[MethodDeclaration, [Modifier, ['public']], [...",Write a function in Java that implements the f...,"[0.0004377277, -0.025701463, 0.05109195, 0.003...","[tensor(0.0004), tensor(-0.0257), tensor(0.051..."
2,04c32d4d95425f73b3a1d6502aed4d48,439.0,3,d565ccacd2e63b9414077ff2b4888622e37b80c6,1.0,"public boolean in1To10(int n, boolean outsideM...","MethodDeclaration(annotations=[], body=[IfStat...",MethodDeclaration Modifier public BasicType bo...,"[[28, [26, [27]], [3, [41]], [211], [13, [3, [...",2,"[0.56166804, 0.39122367, 0.7709216, 0.18991716...","[[MethodDeclaration, [Modifier, ['public']], [...",Write a function in Java that implements the f...,"[0.0004377277, -0.025701463, 0.05109195, 0.003...","[tensor(0.0004), tensor(-0.0257), tensor(0.051..."
3,04c32d4d95425f73b3a1d6502aed4d48,439.0,5,807be3e1de41cfb8d37b0c172c0764707114b054,0.875,"public boolean answerCell(boolean isMorning, b...","MethodDeclaration(annotations=[], body=[IfStat...",MethodDeclaration Modifier public BasicType bo...,"[[28, [26, [27]], [3, [41]], [204], [13, [3, [...",1,"[0.3725777, 0.19787003, -0.035944805, 0.091237...","[[MethodDeclaration, [Modifier, ['public']], [...",Write a function in Java that implements the f...,"[0.019149723, -0.0023263248, 0.052457552, 0.01...","[tensor(0.0191), tensor(-0.0023), tensor(0.052..."
4,04c32d4d95425f73b3a1d6502aed4d48,439.0,5,dc0bbdcc7e469bd78f5a2d3ad5b0de6c8e831f8c,1.0,"public boolean answerCell(boolean isMorning, b...","MethodDeclaration(annotations=[], body=[IfStat...",MethodDeclaration Modifier public BasicType bo...,"[[28, [26, [27]], [3, [41]], [204], [13, [3, [...",2,"[0.5158825, 0.3725396, 0.5538663, 0.008917546,...","[[MethodDeclaration, [Modifier, ['public']], [...",Write a function in Java that implements the f...,"[0.019149723, -0.0023263248, 0.052457552, 0.01...","[tensor(0.0191), tensor(-0.0023), tensor(0.052..."


In [5]:
#create a function to read all the files which name start with slurm from '../../CSEDM_dataset/test_case_stat/'. Each of the file has the following format:
#slurm-<jobid>.out
#The file contains the following information:
#Ignore the first line
#the second line contains the problem id in the format: Target Problem <problem_id>
# each of the following line contains the following information:
# pid, sid, matches, cid, score, score_calc, okay_count, total_sub
# where pid is the problem id, sid is the student id, matches is the number of matches, cid is the code id, score is the score, score_calc is the calculated score, okay_count is the number of submissions where the score and score_calc matches perfectly, total_sub is the total number of submission
# The function should return a consolidated dataframe with all the columns from the file and an additional column called jobid which contains the jobid from the file name

def read_slurm_files(path: str) -> pd.DataFrame:
    """
    Read all the files which name start with slurm from the given path.

    Args:
        path (str): The path to the files.

    Returns:
        pd.DataFrame: The consolidated dataframe with all the columns from the file and an additional column called jobid which contains the jobid from the file name.
    """
    import os
    import re
    import pandas as pd
    import numpy as np

    # get all the files in the directory
    files = os.listdir(path)

    # filter the files that start with 'slurm'
    files = [file for file in files if file.startswith('slurm')]

    # create an empty dataframe to store the data
    # df = pd.DataFrame(columns=['pid', 'sid', 'matches', 'cid', 'score', 'score_calc', 'okay_count', 'total_sub'])
    # print(df.head())
    df = pd.DataFrame()

    # iterate through all the files
    for file in files:
        # read the file
        with open(path + file, 'r') as f:
            lines = f.readlines()

        # extract the jobid from the filename
        jobid = re.search(r'slurm-(\d+).out', file).group(1)

        # extract the problem id from the second line
        problem_id = re.search(r'Target Problem (\d+)', lines[1]).group(1)
        # print(file, problem_id)

        # extract the data from the remaining lines
        data = [line.strip().split(' ') for line in lines[2:]]
        #ignore the lines where the pid is not equal to the problem_id
        data = [line for line in data if line[0] == problem_id]
        # print(data)

        # create a dataframe from the data
        temp_df = pd.DataFrame(data, columns=['pid', 'sid', 'matches', 'cid', 'score', 'score_calc', 'okay_count', 'total_sub'])
        # print(temp_df.head())

        # add the jobid and problem id columns
        # temp_df['jobid'] = jobid
        # temp_df['problem_id'] = problem_id


        # check if the dataframe is empty
        if not temp_df.empty:
            # append the dataframe to the main dataframe
            df = pd.concat([df,temp_df], ignore_index=True)

    return df

path = '../../CSEDM_dataset/test_case_stat/'
df = read_slurm_files(path)
print(df.head())

  pid                               sid   matches  \
0   5  04c32d4d95425f73b3a1d6502aed4d48  00001000   
1   5  04c32d4d95425f73b3a1d6502aed4d48  00000000   
2   5  0503bf609757acf2e75aa8cbc0d8323b  00000000   
3   5  06d801cb636235b298c40029ad9921e7  10101000   
4   5  06d801cb636235b298c40029ad9921e7  01100010   

                                        cid  score score_calc okay_count  \
0  807be3e1de41cfb8d37b0c172c0764707114b054  0.875      0.875          1   
1  dc0bbdcc7e469bd78f5a2d3ad5b0de6c8e831f8c    1.0        1.0          2   
2  59b9d1ebc4ccfd03ce250c4e1f008cd34bb82ecd    1.0        1.0          3   
3  24c471d159142cc63afcc97b4c3ba598693bed5c  0.625      0.625          4   
4  85d549bac6478d36732559a5b341643cf05f11eb  0.625      0.625          5   

  total_sub  
0         1  
1         2  
2         3  
3         4  
4         5  


In [6]:
import os
import pandas as pd

path = '../../CSEDM_dataset/test_case_stat'
file_names = os.listdir(path)
# print(file_names)
total = 0
ressc = 0
resmt = 0
resp = 0
resmtcons = 0
all_prob_pair_dict = {}
pairdf = pd.DataFrame()
for file in file_names:
    # print(file)

    if file[:5] != 'slurm':#only the slurm files contain the output
        continue;

    with open(path + '/' + file,'r') as f:
        lines = f.readlines()
    read_flag = 0
    prev_sid = -1
    prev_score = 1
    prev_matches = ""
    countsc = 0
    countmt = 0
    partial = 0
    countmtcons = 0
    student_info = []
    this_prob_pair_dict = {}
    for line in lines:
        words = line.split()
        if words[0] == 'Target':
            print(words[1], words[2])
            read_flag = 1
            targetpid = words[2]

        if read_flag == 1 and words[0] == targetpid:
            total += 1
            pid, sid, matches, cid, score, score_calc, okay_count, total_sub = words
            score = float(score)
            score_calc = float(score_calc)
            okay_count = int(okay_count)
            total_sub = int(total_sub)
            #for a sid, cid, we need to get the code from CSEDMfile. sid is the student id and cid is the code id. We need to get the code from CSEDMfile
            # print(sid, cid)
            code = CSEDMfile[(CSEDMfile['SubjectID'] == sid) & (CSEDMfile['CodeStateID'] == cid)]['Code'].values[0]
            problemDescription = CSEDMfile[(CSEDMfile['SubjectID'] == sid) & (CSEDMfile['CodeStateID'] == cid)]['prompt'].values[0]

            student_info.append((pid, problemDescription, sid, matches, cid, code, score, score_calc, okay_count, total_sub))

    #print(student_info)
    total = 0
    data = []
    for i in range(len(student_info)):
        pid_i, pd_i, sid_i, matches_i, cid_i, code_i, score_i, score_calc_i, okay_count_i, total_sub_i = student_info[i]
        for j in range(i+1, min(i+2,len(student_info))):
            pid_j, pd_j, sid_j, matches_j, cid_j, code_j, score_j, score_calc_j, okay_count_j, total_sub_j = student_info[j]
            current_change =  (matches_i, matches_j)
            if pid_i != pid_j: continue
            if sid_i != sid_j: continue
            if matches_i == matches_j: continue
            total += 1
            if current_change not in this_prob_pair_dict:
                this_prob_pair_dict[current_change] = 1
            else: this_prob_pair_dict[current_change] += 1
            datum = [pid_i, pd_i, sid_i, matches_i, cid_i, code_i, score_i, score_calc_i, matches_j, cid_j, code_j, score_j, score_calc_j]
            data.append(datum)
    pairData = []
    for i in range(len(data)):
        for j in range(len(data)):
            if i >= j: continue
            pid_i, pd_i, sid_1, matches_i_1, cid_i_1, code_i_1, score_i_1, score_calc_i_1, matches_j_1, cid_j_1, code_j_1, score_j_1, score_calc_j_1 = data[i]
            pid_j, pd_j, sid_2, matches_i_2, cid_i_2, code_i_2, score_i_2, score_calc_i_2, matches_j_2, cid_j_2, code_j_2, score_j_2, score_calc_j_2 = data[j]
            # if matches_i_1 == matches_i_2 and matches_j_1 == matches_j_2:
            pairData.append([pid_i, pd_i, sid_1, matches_i_1, cid_i_1, code_i_1, score_i_1, score_calc_i_1, matches_j_1, cid_j_1, code_j_1, score_j_1, score_calc_j_1, sid_2, matches_i_2, cid_i_2, code_i_2, score_i_2, score_calc_i_2, matches_j_2, cid_j_2, code_j_2, score_j_2, score_calc_j_2, matches_i_1 == matches_i_2 and matches_j_1 == matches_j_2])
    
    tempdf = pd.DataFrame(pairData, columns=['problemID', 'problemDescription',
                                             'studentID_1', 'test_case_verdict_i_1', 'codeID_i_1', 'code_i_1', 'score_i_1', 'score_calc_i_1',  
                                                            'test_case_verdict_j_1', 'codeID_j_1', 'code_j_1', 'score_j_1', 'score_calc_j_1', 
                                             'studentID_2', 'test_case_verdict_i_2', 'codeID_i_2', 'code_i_2', 'score_i_2', 'score_calc_i_2',  
                                                            'test_case_verdict_j_2', 'codeID_j_2', 'code_j_2', 'score_j_2', 'score_calc_j_2', 
                                             'is_similar'])
    pairdf = pd.concat([pairdf, tempdf], ignore_index=True)
    # res = dict(sorted(this_prob_pair_dict.items(), key = lambda x: x[1], reverse = True))
    # # print('Total pair of code changes = %d'%(total*(total-1)/2))
    # # print(res)
    # # res = this_prob_pair_dict
    # similar_pair = 0
    # for key in res:
    #     val = res[key]
    #     if val > 1:
    #         similar_pair += val*(val-1)/2
    # # print('Similar pair of code change = %d'%(similar_pair))
    # #print the number of rows in pairdf
    # print(pairdf.shape[0])
pairdf.to_pickle('dataset.pkl')
#save as csv ignoring index
pairdf.to_csv('dataset.csv', index = False)

Problem 21
Problem 46
Problem 12
Problem 22
Problem 5
Problem 24
Problem 1
Problem 71
Problem 17
Problem 34
Problem 37
Problem 39
Problem 25
Problem 3
Problem 20
Problem 40
Problem 13


In [8]:
print(CSEDMfile.keys())

Index(['SubjectID', 'AssignmentID', 'ProblemID', 'CodeStateID', 'Score_x',
       'Code', 'Code-ast', 'code-astnn', 'code-embedding', 'Score_y',
       'embedding', 'astnn', 'prompt', 'prompt-embedding', 'input'],
      dtype='object')


In [8]:
print(pairdf.__len__)

<bound method DataFrame.__len__ of        problemID                                 problemDescription  \
0             21  Write a function in Java that implements the f...   
1             21  Write a function in Java that implements the f...   
2             21  Write a function in Java that implements the f...   
3             21  Write a function in Java that implements the f...   
4             21  Write a function in Java that implements the f...   
...          ...                                                ...   
488963        13  Write a function in Java that implements the f...   
488964        13  Write a function in Java that implements the f...   
488965        13  Write a function in Java that implements the f...   
488966        13  Write a function in Java that implements the f...   
488967        13  Write a function in Java that implements the f...   

                             studentID_1    test_case_verdict_i_1  \
0       11d8ffa1c1cb79e22b6d94667d3847d9   

In [51]:
def filter_submissions_score_range(low_score, high_score, pid):
    file_score_x = file.loc[file['Score_x']<high_score]
    file_score_x = file_score_x[file_score_x['Score_x']>low_score]
    file_score_x = file_score_x[file_score_x['ProblemID'].isin(pid)]
    pid = file_score_x['ProblemID']
    pid = pid.drop_duplicates()
    print(len(file_score_x), len(pid))
    return (file_score_x, pid)

In [61]:
pid = file['ProblemID']
pid = pid.drop_duplicates()
file_score_20, pid = filter_submissions_score_range(0, .1, pid)
file_score_40, pid = filter_submissions_score_range(.2, .3, pid)
file_score_60, pid = filter_submissions_score_range(.4, .5, pid)
file_score_80, pid = filter_submissions_score_range(.6, .7, pid)
file_score_100, pid = filter_submissions_score_range(.8, .9, pid)

file_score_20, pid = filter_submissions_score_range(0, .1, pid)
file_score_40, pid = filter_submissions_score_range(.2, .3, pid)
file_score_60, pid = filter_submissions_score_range(.4, .5, pid)
file_score_80, pid = filter_submissions_score_range(.6, .7, pid)
file_score_100, pid = filter_submissions_score_range(.8, .9, pid)

911 33
1278 30
958 29
1028 27
874 26
611 26
1126 26
934 26
1027 26
874 26


In [62]:
def save_submission_on_file(file_score_x, label):
    file_score_x = file_score_x.drop_duplicates('ProblemID')
    for index, row in file_score_x.iterrows():
        # print(row['ProblemID'])
        filename = 'problems/csedm-p'+ '{:0>3}'.format(row['ProblemID']) +'-'+label+'-'+str('{:.2f}'.format(row['Score_x']))+'-'+'{:0>3}'.format(row['CodeStateID'])+ '.txt'
        f = open(filename, "w")
        filestr = 'Problem: ' + str(row['prompt']) + '\n'
        filestr = filestr + 'Code:\n' + row['Code']
        f.write(filestr)
        f.close()

In [63]:
save_submission_on_file(file_score_20, label='020')
save_submission_on_file(file_score_40, label='040')
save_submission_on_file(file_score_60, label='060')
save_submission_on_file(file_score_80, label='080')
save_submission_on_file(file_score_100, label='100')

In [32]:
import pandas as pd 
file_partial_score = file.loc[file['Score_x'] < 1]
file_partial_score = file_partial_score.loc[file_partial_score['Score_x'] > 0]
# file_low_score = file_partial_score[file_partial_score['Score_x']<.1]
# file_high_score = file_partial_score[file_partial_score['Score_x']>.9]

# file_full_score = file.loc[file['Score_x']==1]
# file_full_score = file.loc[file['Score_x']==1]
# file_zero_score = file.loc[file['Score_x']==0]
# file_partial_score = file_partial_score.drop_duplicates('ProblemID')
# file_full_score = file_full_score.drop_duplicates('ProblemID')
# file_zero_score = file_zero_score.drop_duplicates('ProblemID')
# file_low_score = file_low_score.drop_duplicates('ProblemID')
# file_high_score = file_high_score.drop_duplicates('ProblemID')
# print(file['Score_x'])
# print(len(file))
# print(len(file_full_score))
# print(len(file_zero_score))
# print(len(file_partial_score))
# print(len(file_zero_score)+len(file_full_score)+len(file_partial_score))
# print(file.keys())
# print(len(file_low_score))
# print(len(file_high_score))

file_score_20 = file_partial_score.loc[file_partial_score['Score_x']<=.2]
print(len(file_score_20))
pid = file_score_20['ProblemID']
pid = pid.drop_duplicates()
print(len(pid))
file_score_40 = file_partial_score[file_partial_score['Score_x']<=.4]
file_score_40 = file_score_40[file_score_40['Score_x']>.2]
file_score_40 = file_score_40[file_score_40['ProblemID'].isin(pid)]
print(len(file_score_40))
pid = file_score_40['ProblemID']
pid = pid.drop_duplicates()
print(len(pid))

2004
44
3172
42


In [31]:
for index, row in file_low_score.iterrows():
    # print(row['ProblemID'])
    filename = 'problems_low/csedm-p'+ '{:0>3}'.format(row['ProblemID']) +'-low-'+'{:0>3}'.format(row['CodeStateID'])+ '.txt'
    f = open(filename, "w")
    filestr = 'Problem: ' + str(row['prompt']) + '\n'
    filestr = filestr + 'Code:\n' + row['Code']
    f.write(filestr)
    f.close()

In [32]:
for index, row in file_high_score.iterrows():
    # print(row['ProblemID'])
    filename = 'problems_high/csedm-p'+ '{:0>3}'.format(row['ProblemID']) +'-high-'+'{:0>3}'.format(row['CodeStateID'])+ '.txt'
    f = open(filename, "w")
    filestr = 'Problem: ' + str(row['prompt']) + '\n'
    filestr = filestr + 'Code:\n' + row['Code']
    f.write(filestr)
    f.close()