In [None]:
import re
import os
import shutil
from pathlib import Path
import itertools
from collections import defaultdict
import pandas as pd
import subprocess
from binascii import a2b_hex
import hashlib

In [None]:
def judge_list(filename_list: list):
    """Group files whose contents are identical once whitespace is ignored.

    Each file is read line by line; spaces, tabs, newlines and carriage
    returns are removed from every line and what remains is fed into a
    SHA-512 hasher. Files that end up with the same digest are reported as
    one group.

    Args:
        filename_list: Paths of the text files to compare.

    Returns:
        A list of groups, each a list of paths in input order. Only digests
        shared by two or more files produce a group; groups are ordered by
        the first appearance of their digest in ``filename_list``.
    """
    groups_by_digest = defaultdict(list)
    for filename in filename_list:
        hasher = hashlib.sha512()
        # Context manager so the handle is closed deterministically instead
        # of leaving one open file per submission for the garbage collector.
        with open(filename, 'r') as source_file:
            for line in source_file:
                stripped = re.sub(r'[ \t\n\r]', '', line).encode()
                hasher.update(stripped)
        groups_by_digest[hasher.digest()].append(filename)
    return [group for group in groups_by_digest.values() if len(group) > 1]

In [None]:
# Homework identifier; it is embedded in the input/output directories below
# and in the name of the result CSV written by the last cell.
HW_ID = "hw3"
# Directory that is listed for the raw ``*.py3`` submission files.
SRC_DIR = f"./playground/{HW_ID}/raw_data"
# Root under which groups of identical submissions are copied,
# one sub-directory per problem and per group.
DST_DIR = f"./playground/{HW_ID}/copied_code"
# Suffixes appended to a problem id to form the result-table column names.
BASE_SUFFIX = "_base"
COPIED_SUFFIX = "_copied"
COMMENT_SUFFIX = "_comment"
# NOTE(review): TA_USERNAMES is not referenced by any cell visible here.
TA_USERNAMES = ['Yudong', 'fuqi', 'hjl', 'liumugeng', 'Guo Yaoqi', 'liukb']
# Submissions whose user id is in this list are skipped when indexing files.
TA_USERIDS = [137238,90322,935617,1106922,936531,810106]

# Submission file name layout; the five groups are unpacked elsewhere as
# SUBID, PROBID, STATUS, USERID, USERNAME (in that order).
# example: 31111978_2_AC_935617(Yudong).py3
pattern = r"([0-9]*)_([0-9]*)_([A-Z]*)_([0-9]*)\((.*)\)\.py3"

In [None]:
# Load the userid -> student mapping: read every column as text, drop rows
# with any missing value, then index the table by the integer user id.
xuanke = (
    pd.read_csv('userid_studentid_final.csv', dtype=str)
    .dropna(axis='index')
    .assign(userid=lambda table: table['userid'].astype(int))
    .set_index('userid')
)

In [None]:
# Points granted for a problem, keyed by the STATUS token taken from the
# submission file name (presumably the judge's verdict abbreviation).
score_map = dict(
    AC=2.0,
    PE=1.0,
    WA=1.0,
    RE=0.0,
    CE=0.0,
    TLE=0.2,
    WT=0.4,
)


In [None]:
# Index every gradable submission found in SRC_DIR as
#   dic[PROBID][USERID][STATUS] -> [(SUBID, file name), ...]
# PROBID and STATUS stay strings exactly as captured from the file name;
# USERID and SUBID are converted to int.
lst = os.listdir(SRC_DIR)
dic = {}
for name in lst:
    if not name.endswith(".py3"):
        continue
    match = re.match(pattern, name)
    if match is None or not (match.group(1) and match.group(4)):
        # A stray .py3 file that does not follow the naming scheme (or has an
        # empty submission/user id) used to abort the whole cell with an
        # AttributeError/ValueError. Report it and keep going instead.
        print(f"skipping unrecognized file name: {name}")
        continue
    SUBID, PROBID, STATUS, USERID, USERNAME = match.groups()
    USERID = int(USERID)
    # Skip TA accounts and users that are absent from the `xuanke` table.
    if USERID in TA_USERIDS or USERID not in xuanke.index:
        continue
    SUBID = int(SUBID)
    submissions = dic.setdefault(PROBID, {}).setdefault(USERID, {}).setdefault(STATUS, [])
    submissions.append((SUBID, name))


In [None]:
# Collapse each user's per-status submission lists into one summary dict:
#   'score' - best score over all statuses the user obtained (at least 0.0)
#   'stat'  - id of the largest (SUBID, name) AC entry, or -1 without an AC
#   'name'  - file name of that AC entry, or '' without an AC
# The inner dict is keyed by integer user id; replacing the value of an
# existing key while iterating is safe because the dict size does not change.
for per_user in dic.values():
    for user_id, by_status in per_user.items():
        best_score = max([0.0] + [score_map[verdict] for verdict in by_status])
        if 'AC' in by_status:
            ac_sub_id, ac_name = max(by_status['AC'])
        else:
            ac_sub_id, ac_name = -1, ''
        per_user[user_id] = {'score': best_score, 'stat': ac_sub_id, 'name': ac_name}

In [None]:
# Problem ids are digit strings captured from the file names, so a plain
# sorted() orders them lexicographically ("10" before "2"). Sorting by
# (length, text) gives numeric order for ids without leading zeros and does
# not require converting them with int().
prob_ids = sorted(dic.keys(), key=lambda pid: (len(pid), pid))
prob_ids

In [None]:
# Empty result table indexed like `xuanke`: all "<problem>_base" columns
# first, then all "<problem>_copied", then all "<problem>_comment".
df = pd.DataFrame(
    index=xuanke.index,
    columns=[
        pid + suffix
        for suffix in (BASE_SUFFIX, COPIED_SUFFIX, COMMENT_SUFFIX)
        for pid in prob_ids
    ],
)

In [None]:
# Put the `xuanke` columns in front of the (still empty) result columns by
# joining the two frames on their shared user-id index.
df = pd.merge(xuanke, df, left_index=True, right_index=True)

In [None]:
# For every (problem, user) that has an AC submission, remember the file for
# the duplicate check and record whether its source appears to be commented.
prob_to_filelist = defaultdict(list)
# NOTE(review): never populated by any cell visible here; kept so the name
# stays defined for code that may rely on it.
filename_to_commented = {}

for prob_id, username_2_info in dic.items():
    comment_col = prob_id + COMMENT_SUFFIX
    for username, info in username_2_info.items():  # keys are integer user ids
        if info['stat'] == -1:
            # -1 is the placeholder stored when the user has no AC submission.
            continue
        src_path = os.path.join(SRC_DIR, info['name'])
        prob_to_filelist[prob_id].append(src_path)
        # Read through a context manager so the handle is closed right away
        # rather than leaking one open file per submission.
        with open(src_path) as src_file:
            code = src_file.read()
        # Plain substring heuristic: a '#' or "'''" anywhere in the file
        # (including inside string literals) counts as a comment.
        commented = "'''" in code or "#" in code
        df.loc[username, comment_col] = 'yes' if commented else 'no'

In [None]:
# Write each user's summarized score into the "<problem>_base" column.
# The inner keys of `dic` are the integer user ids that index `df`.
for pid, per_user in dic.items():
    base_col = pid + BASE_SUFFIX
    for user_id, summary in per_user.items():
        df.loc[user_id, base_col] = summary['score']
df

In [None]:
# For each problem, group the AC submissions that are identical modulo
# whitespace, tag the users involved with "yes-<group>" in the
# "<problem>_copied" column and copy the files to DST_DIR/<problem>/<group>/.
for prob_id in prob_ids:
    same_lst = judge_list(prob_to_filelist[prob_id])
    print(prob_id)
    prob_dst_dir = os.path.join(DST_DIR, prob_id)
    # Start from a clean directory so groups from an earlier run do not linger.
    if os.path.exists(prob_dst_dir):
        shutil.rmtree(prob_dst_dir)
    copied_col = prob_id + COPIED_SUFFIX
    for i, group in enumerate(same_lst):
        print(f"  list{i}:")
        group_tag = "%03d" % i
        # Every group holds at least two files, so the directory can be
        # created once per group instead of once per file.
        dst_dir = os.path.join(prob_dst_dir, group_tag)
        Path(dst_dir).mkdir(exist_ok=True, parents=True)
        for src_path in group:
            # os.path.basename honours the platform separator, unlike the
            # previous split('/') which breaks on paths joined with '\\'.
            final_name = os.path.basename(src_path)
            # Group 4 of `pattern` is the numeric user id.
            user_id = int(re.match(pattern, final_name).group(4))
            df.loc[user_id, copied_col] = "yes-" + group_tag
            shutil.copy(src_path, dst_dir)
            print(f"    {final_name}")

In [None]:
# Replace the cells that were never written with explicit defaults:
# 'N/A' for the comment flag, 'no' for the copied flag and 0.0 for the score.
for pid in prob_ids:
    for suffix, default in (
        (COMMENT_SUFFIX, 'N/A'),
        (COPIED_SUFFIX, 'no'),
        (BASE_SUFFIX, 0.0),
    ):
        column = pid + suffix
        df[column] = df[column].fillna(default)
df

In [None]:
# Persist the final grading table as "<HW_ID>_result.csv", resolved against
# the current working directory.
df.to_csv(path_or_buf=f'{HW_ID}_result.csv')