In [None]:
#@title Download Dataset to Google Colab { form-width: "800px" }

# e.g GOOGLE_DRIVE_FILE_URL: https://drive.google.com/open?id=1CZyDkHHjS8SUVBF7fBwoTAF6xSpLmD4P7
LinkFile = "https://drive.google.com/open?id=1M-cscNtvNxUoTX2MhZNnZZtURkh_DcQG" #@param {type:"string"}

# e.g GOOGLE_DRIVE_FILENAME_WITH_EXTENSION: Local History.csv
Filename = "codebench.zip" #@param {type:"string"}

Filename = Filename.replace(' ', '\ ')

# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = LinkFile.split('=')[-1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(Filename) 

In [None]:
# Unpack the downloaded archive quietly, then delete the zip file.
!unzip -q codebench.zip
!rm codebench.zip
# Delete the sample_data/ directory (presumably Colab's default sample folder - TODO confirm).
!rm -rf sample_data/

## Extraindo os dados dos assessments e guardando em uma estrutura

In [None]:
import os
import re

from pprint import pprint

# Nested mapping filled by this cell's main() and extract_feature_value():
# assessments[semester_id][class_id][assessment_id][feature] = value
assessments = {} 

def extract_feature_value(line, semester_id, class_id, assessment_id):
    """Parse one ``---- <feature>: <value>`` line and record it in ``assessments``.

    The feature name is everything between the ``---- `` marker and the FIRST
    ``": "`` on the line (spaces replaced by underscores); the value is the
    rest of the line with surrounding whitespace removed. Splitting on the
    first separator keeps values that themselves contain ``": "`` intact.

    Args:
        line: A line of an assessment file.
        semester_id, class_id, assessment_id: Keys of the already existing
            ``assessments[semester_id][class_id][assessment_id]`` dict that
            receives the parsed pair.

    Returns:
        ``(feature, value)`` when the line matches, otherwise ``None``.

    Raises:
        KeyError: If the nested ``assessments`` entry does not exist yet.
    """
    # Non-greedy key group: stop at the first ": " instead of the last one.
    match = re.search(r"(\-{4}\s)(.+?)(:\s)(.+)", line)
    if match is None:
        return None
    feature = match.group(2).replace(' ', '_')
    # Strip stray whitespace (e.g. a trailing "\r") so later comparisons such
    # as value == "homework" are not defeated by invisible characters.
    value = match.group(4).strip()
    assessments[semester_id][class_id][assessment_id][feature] = value
    return feature, value

def reading_assessment_file(path,  semester_id, class_id, assessment_id):
    """Feed every line of the file at ``path`` to ``extract_feature_value``.

    Returns:
        Two parallel lists ``(features, values)`` holding, in file order, the
        pairs returned for the lines that were recognised.
    """
    parsed_pairs = []
    with open(path, "r") as handle:
        for row in handle:
            pair = extract_feature_value(row, semester_id, class_id, assessment_id)
            if pair is not None:
                parsed_pairs.append(pair)
    features = [pair[0] for pair in parsed_pairs]
    values = [pair[1] for pair in parsed_pairs]
    return features, values

def main():
    """Populate ``assessments`` from codebench/<semester>/<class>/assessments/.

    Every directory level creates an empty dict in ``assessments``; each file
    in an ``assessments`` folder is read with ``reading_assessment_file`` under
    its name with ``.data`` removed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        assessments[semester_id] = {}
        for class_id in os.listdir(semester_dir):
            assessments_dir = os.path.join(semester_dir, class_id, "assessments")
            assessments[semester_id][class_id] = {}
            for file_name in os.listdir(assessments_dir):
                assessment_id = file_name.replace(".data", "")
                assessments[semester_id][class_id][assessment_id] = {}
                # The returned lists are not needed here: the call stores the
                # parsed pairs in ``assessments`` as it goes.
                reading_assessment_file(
                    os.path.join(assessments_dir, file_name),
                    semester_id,
                    class_id,
                    assessment_id,
                )


if __name__ == "__main__":
    main()
    pprint(assessments)

# Dados dos exercícios do tipo homework


## Extraindo a quantidade de exercícios total do tipo homework por turma

In [None]:
import os
import re

def total_homework_per_class(semester_id, class_id):
    """Add up ``total_exercises`` over the class's assessments of type "homework".

    Reads the module-level ``assessments`` mapping; each assessment must have
    a "type" entry, and homework ones a "total_exercises" entry convertible
    with ``int``.
    """
    class_assessments = assessments[semester_id][class_id]
    return sum(
        int(info["total_exercises"])
        for info in class_assessments.values()
        if info["type"] == "homework"
    )

def main():
    """Print the homework exercise total of the first class found under codebench/.

    The ``return`` inside the inner loop ends the walk as soon as one class
    has been reported.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        for class_id in os.listdir(os.path.join(root, semester_id)):
            total_homework = total_homework_per_class(semester_id, class_id)
            print(f"semester_id={semester_id}, class_id={class_id}, total_execises_homework={total_homework}")
            # Only the first class is reported.
            return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de exercícios feitos por cada aluno do tipo homework

In [None]:
import os
import re

def homeworks_class(semester_id, class_id):
    """List the ids of the class's assessments whose "type" is "homework".

    Only the "type" feature is inspected, so an assessment is listed at most
    once and other features that happen to hold the text "homework" (a title,
    for instance) do not cause false positives. Assessments without a "type"
    entry are skipped.

    Returns:
        Assessment ids in the iteration order of the ``assessments`` mapping.
    """
    class_assessments = assessments[semester_id][class_id]
    return [
        assessment_id
        for assessment_id, features in class_assessments.items()
        if features.get("type") == "homework"
    ]

def exercise_isdone(file_path):
    """Tell whether the log at ``file_path`` shows a ``100%`` line after a grade marker.

    Returns True as soon as a line starting with ``100%`` is met once some
    earlier line started with ``-- GRADE:``; False if the file ends first.
    """
    grade_seen = False
    with open(file_path, "r") as log:
        for row in log:
            # The marker stays armed for the rest of the file once seen.
            grade_seen = grade_seen or re.match(r"-- GRADE:", row) is not None
            if grade_seen and re.match(r"100\%", row):
                return True
    return False

def total_exercisedone(path, homeworks):
    """Count the execution logs in ``path`` that belong to homework and are done.

    Args:
        path: Directory holding one student's execution logs.
        homeworks: Collection of homework assessment ids.

    Returns:
        Number of files whose name, with the ``_<digits>.log`` part removed,
        is in ``homeworks`` and for which ``exercise_isdone`` is true.
    """
    done = 0
    for log_name in os.listdir(path):
        # The dot is escaped so that only a literal ".log" is stripped, in
        # line with the pattern used by the other cells of this notebook.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if assessment_id in homeworks and exercise_isdone(os.path.join(path, log_name)):
            done += 1
    return done

def executions_homework_process(path, homeworks):
    """Return ``total_exercisedone(path, homeworks)`` unchanged (thin wrapper)."""
    return total_exercisedone(path, homeworks)

def main():
    """Run ``executions_homework_process`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            homeworks = homeworks_class(semester_id, class_id)
            for user_id in os.listdir(users_dir):
                executions_homework_process(
                    os.path.join(users_dir, user_id, "executions"), homeworks
                )
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de submissões dos exercícios do tipo homework

In [None]:
def ishomework(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "homework".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "homework"

def total_submition(path):
    """Count the submission header lines in the file at ``path``.

    A line counts when it starts with ``== SUBMITION (`` followed by at least
    one character and a closing parenthesis.
    """
    with open(path, "r") as log:
        return sum(1 for row in log if re.match(r"== SUBMITION (\(.+)\)", row))

def total_submitions_homework(path, semester_id, class_id):
    """Total number of submissions across a student's homework execution logs.

    Args:
        path: Directory holding the student's execution logs.
        semester_id, class_id: Keys used by ``ishomework`` to classify each log.

    Returns:
        Sum of ``total_submition`` over the files whose name, minus the
        ``_<digits>.log`` part, is a homework assessment.
    """
    submissions = 0
    for log_name in os.listdir(path):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal and
        # trigger a warning on recent Python versions.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if ishomework(semester_id, class_id, assessment_id):
            submissions += total_submition(os.path.join(path, log_name))
    return submissions

def main():
    """Run ``total_submitions_homework`` for the first user found under codebench/.

    The result is discarded; the ``return`` inside the user loop stops the
    walk after a single user.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                total_submitions_homework(executions_dir, semester_id, class_id)
                # Only the first user is processed.
                return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de testes feitos em exercícios do tipo homework

In [None]:
def ishomework(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "homework".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "homework"

def total_test(path):
    """Count the test header lines in the file at ``path``.

    A line counts when it starts with ``== TEST (`` followed by at least one
    character and a closing parenthesis.
    """
    with open(path, "r") as log:
        return sum(1 for row in log if re.match(r"== TEST (\(.+)\)", row))

def total_test_homeworks(path, semester_id, class_id):
    """Total number of test runs across a student's homework execution logs.

    Args:
        path: Directory holding the student's execution logs.
        semester_id, class_id: Keys used by ``ishomework`` to classify each log.

    Returns:
        Sum of ``total_test`` over the files whose name, minus the
        ``_<digits>.log`` part, is a homework assessment.
    """
    tests = 0
    for log_name in os.listdir(path):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal and
        # trigger a warning on recent Python versions.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if ishomework(semester_id, class_id, assessment_id):
            tests += total_test(os.path.join(path, log_name))
    return tests

def main():
    """Run ``total_test_homeworks`` for the first user found under codebench/.

    The result is discarded; the ``return`` inside the user loop stops the
    walk after a single user.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                total_test_homeworks(executions_dir, semester_id, class_id)
                # Only the first user is processed.
                return


if __name__ == "__main__":
    main()

## Soma dos tempos médios de submissões dos exercícios do tipo homework

In [None]:
import re
import os

from datetime import datetime

def ishomework(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "homework".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "homework"

def to_datetime(date_times):
    """Parse ``'%Y-%m-%d %H:%M:%S'`` strings into ``datetime`` objects, keeping order."""
    return [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in date_times]

def sum_of_variation(date_times):
    """Sum the gaps, in seconds, between consecutive timestamps.

    The list is walked from the last element to the first, adding the
    difference between each element and the one after it.

    ``timedelta.total_seconds()`` is used rather than ``.seconds``: the latter
    only holds the sub-day remainder, so any gap of a day or more was
    under-counted.

    Args:
        date_times: Sequence of ``datetime`` objects.

    Returns:
        The summed gap in seconds (a float), or 0 for an empty sequence.
    """
    if not date_times:
        return 0
    total = 0
    later = date_times[-1]
    for current in reversed(date_times):
        total += (later - current).total_seconds()
        later = current
    return total

def mean_variation_time(date_times):
    """Average gap between the given timestamp strings.

    With fewer than two timestamps the result is 0. Otherwise the strings are
    parsed with ``to_datetime`` and the value of ``sum_of_variation`` is
    divided by the number of timestamps.
    """
    count = len(date_times)
    if count <= 1:
        return 0
    # NOTE(review): the divisor is the number of timestamps, not the number of
    # gaps between them (count - 1) - confirm this is the intended mean.
    return sum_of_variation(to_datetime(date_times)) / count

def extract_time_exercise(path):
    """Collect the text inside the parentheses of each ``== SUBMITION (...)`` line.

    NOTE: other cells of this notebook define a function with this same name
    that matches ``== TEST (...)`` lines instead; whichever cell ran last
    decides which version callers get.

    Returns:
        The captured strings in file order.
    """
    stamps = []
    with open(path, "r") as log:
        for row in log:
            found = re.match(r"== SUBMITION \((.+)\)", row)
            if found:
                stamps.append(found.group(1))
    return stamps

def mean_time_submition_homework(path_executions, semester_id, class_id):
    """Sum, over a student's homework logs, the mean time between submissions.

    For every file in ``path_executions`` whose name (minus the
    ``_<digits>.log`` part) is a homework assessment, the timestamps of its
    ``== SUBMITION (...)`` lines are collected and passed to
    ``mean_variation_time``; the per-file results are added up.

    The timestamps are extracted here instead of through the notebook-level
    ``extract_time_exercise``: that name is redefined by several cells, some
    of which match ``== TEST`` lines, so calling it made this function return
    test timings whenever one of those cells had run more recently.

    Returns:
        The sum rounded to two decimal places.
    """
    total = 0
    for log_name in os.listdir(path_executions):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if not ishomework(semester_id, class_id, assessment_id):
            continue
        with open(os.path.join(path_executions, log_name), "r") as log:
            matches = (re.match(r"== SUBMITION \((.+)\)", row) for row in log)
            date_times = [found.group(1) for found in matches if found is not None]
        total += mean_variation_time(date_times)
    return round(total, 2)

def main():
    """Run ``mean_time_submition_homework`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                mean_time_submition_homework(executions_dir, semester_id, class_id)
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

## Soma dos tempos médios de testes feitos em exercícios do tipo homework

In [None]:
import re
import os

from datetime import datetime

def ishomework(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "homework".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "homework"

def extract_time_exercise(path):
    """Collect the text inside the parentheses of each ``== TEST (...)`` line.

    NOTE: other cells of this notebook define a function with this same name
    that matches ``== SUBMITION (...)`` lines instead; whichever cell ran last
    decides which version callers get.

    Returns:
        The captured strings in file order.
    """
    stamps = []
    with open(path, "r") as log:
        for row in log:
            found = re.match(r"== TEST \((.+)\)", row)
            if found:
                stamps.append(found.group(1))
    return stamps

def to_datetime(date_times):
    """Parse ``'%Y-%m-%d %H:%M:%S'`` strings into ``datetime`` objects, keeping order."""
    return [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in date_times]

def sum_of_variation(date_times):
    """Sum the gaps, in seconds, between consecutive timestamps.

    The list is walked from the last element to the first, adding the
    difference between each element and the one after it.

    ``timedelta.total_seconds()`` is used rather than ``.seconds``: the latter
    only holds the sub-day remainder, so any gap of a day or more was
    under-counted.

    Args:
        date_times: Sequence of ``datetime`` objects.

    Returns:
        The summed gap in seconds (a float), or 0 for an empty sequence.
    """
    if not date_times:
        return 0
    total = 0
    later = date_times[-1]
    for current in reversed(date_times):
        total += (later - current).total_seconds()
        later = current
    return total

def mean_variation_time(date_times):
    """Average gap between the given timestamp strings.

    With fewer than two timestamps the result is 0. Otherwise the strings are
    parsed with ``to_datetime`` and the value of ``sum_of_variation`` is
    divided by the number of timestamps.
    """
    count = len(date_times)
    if count <= 1:
        return 0
    # NOTE(review): the divisor is the number of timestamps, not the number of
    # gaps between them (count - 1) - confirm this is the intended mean.
    return sum_of_variation(to_datetime(date_times)) / count

def mean_time_test_homeworks(path_executions, semester_id, class_id):
    """Sum, over a student's homework logs, the mean time between test runs.

    For every file in ``path_executions`` whose name (minus the
    ``_<digits>.log`` part) is a homework assessment, the timestamps of its
    ``== TEST (...)`` lines are collected and passed to
    ``mean_variation_time``; the per-file results are added up.

    The timestamps are extracted here instead of through the notebook-level
    ``extract_time_exercise``: that name is redefined by several cells, some
    of which match ``== SUBMITION`` lines, so the result used to depend on
    which cell had run last.

    Returns:
        The sum rounded to two decimal places.
    """
    total = 0
    for log_name in os.listdir(path_executions):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if not ishomework(semester_id, class_id, assessment_id):
            continue
        with open(os.path.join(path_executions, log_name), "r") as log:
            matches = (re.match(r"== TEST \((.+)\)", row) for row in log)
            date_times = [found.group(1) for found in matches if found is not None]
        total += mean_variation_time(date_times)
    return round(total, 2)

def main():
    """Run ``mean_time_test_homeworks`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                mean_time_test_homeworks(executions_dir, semester_id, class_id)
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

# Dados dos exercícios do tipo exam

## Extraindo a quantidade total de exercícios do tipo exam por turma

In [None]:
import os
import re

def total_exam_per_class(semester_id, class_id):
    """Add up ``total_exercises`` over the class's assessments of type "exam".

    Reads the module-level ``assessments`` mapping; each assessment must have
    a "type" entry, and exam ones a "total_exercises" entry convertible with
    ``int``.
    """
    class_assessments = assessments[semester_id][class_id]
    return sum(
        int(info["total_exercises"])
        for info in class_assessments.values()
        if info["type"] == "exam"
    )

def main():
    """Print the exam exercise total of the first class found under codebench/.

    The ``return`` inside the inner loop ends the walk as soon as one class
    has been reported.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        for class_id in os.listdir(os.path.join(root, semester_id)):
            total_exam = total_exam_per_class(semester_id, class_id)
            print(f"semester_id={semester_id}, class_id={class_id}, total_exercises_exam={total_exam}")
            # Only the first class is reported.
            return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de exercícios feitos por cada usuário do tipo exam



In [None]:
import os
import re

def exams_class(semester_id, class_id):
    """List the ids of the class's assessments whose "type" is "exam".

    Only the "type" feature is inspected, so an assessment is listed at most
    once and other features that happen to hold the text "exam" (a title, for
    instance) do not cause false positives. Assessments without a "type"
    entry are skipped.

    Returns:
        Assessment ids in the iteration order of the ``assessments`` mapping.
    """
    class_assessments = assessments[semester_id][class_id]
    return [
        assessment_id
        for assessment_id, features in class_assessments.items()
        if features.get("type") == "exam"
    ]

def exercise_isdone(file_path):
    """Tell whether the log at ``file_path`` shows a ``100%`` line after a grade marker.

    Returns True as soon as a line starting with ``100%`` is met once some
    earlier line started with ``-- GRADE:``; False if the file ends first.
    """
    grade_seen = False
    with open(file_path, "r") as log:
        for row in log:
            # The marker stays armed for the rest of the file once seen.
            grade_seen = grade_seen or re.match(r"-- GRADE:", row) is not None
            if grade_seen and re.match(r"100\%", row):
                return True
    return False

def total_exercise_done_exam(path, exams):
    """Count the execution logs in ``path`` that belong to an exam and are done.

    Args:
        path: Directory holding one student's execution logs.
        exams: Collection of exam assessment ids.

    Returns:
        Number of files whose name, with the ``_<digits>.log`` part removed,
        is in ``exams`` and for which ``exercise_isdone`` is true.
    """
    done = 0
    for log_name in os.listdir(path):
        # The dot is escaped so that only a literal ".log" is stripped, in
        # line with the pattern used by the other cells of this notebook.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if assessment_id in exams and exercise_isdone(os.path.join(path, log_name)):
            done += 1
    return done

def main():
    """Run ``total_exercise_done_exam`` for the first user found under codebench/.

    The result is discarded; the ``return`` inside the user loop stops the
    walk after a single user.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            exams = exams_class(semester_id, class_id)
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                total_exercise_done_exam(executions_dir, exams)
                # Only the first user is processed.
                return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de submissões dos exercícios do tipo exam por aluno

In [None]:
def total_submition(path):
    """Count the submission header lines in the file at ``path``.

    A line counts when it starts with ``== SUBMITION (`` followed by at least
    one character and a closing parenthesis.
    """
    with open(path, "r") as log:
        return sum(1 for row in log if re.match(r"== SUBMITION (\(.+)\)", row))

def isexam(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "exam".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "exam"

def total_submitions_exam(path, semester_id, class_id):
    """Total number of submissions across a student's exam execution logs.

    Args:
        path: Directory holding the student's execution logs.
        semester_id, class_id: Keys used by ``isexam`` to classify each log.

    Returns:
        Sum of ``total_submition`` over the files whose name, minus the
        ``_<digits>.log`` part, is an exam assessment.
    """
    submissions = 0
    for log_name in os.listdir(path):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal and
        # trigger a warning on recent Python versions.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if isexam(semester_id, class_id, assessment_id):
            submissions += total_submition(os.path.join(path, log_name))
    return submissions

def main():
    """Run ``total_submitions_exam`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                total_submitions_exam(executions_dir, semester_id, class_id)
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

## Extraindo a quantidade de testes feitos em exercícios do tipo exam por aluno

In [None]:
def ishomework(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "homework".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    return assessments[semester_id][class_id][assessment_id]["type"] == "homework"


def isexam(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "exam".

    Defined here because this cell's ``total_test_exams`` calls ``isexam``;
    without it the cell only works if another cell defining it ran first.
    """
    return assessments[semester_id][class_id][assessment_id]["type"] == "exam"

def total_test(path):
    """Count the test header lines in the file at ``path``.

    A line counts when it starts with ``== TEST (`` followed by at least one
    character and a closing parenthesis.
    """
    with open(path, "r") as log:
        return sum(1 for row in log if re.match(r"== TEST (\(.+)\)", row))

def total_test_exams(path, semester_id, class_id):
    """Total number of test runs across a student's exam execution logs.

    Args:
        path: Directory holding the student's execution logs.
        semester_id, class_id: Keys used by ``isexam`` to classify each log.

    Returns:
        Sum of ``total_test`` over the files whose name, minus the
        ``_<digits>.log`` part, is an exam assessment.
    """
    tests = 0
    for log_name in os.listdir(path):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal and
        # trigger a warning on recent Python versions.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if isexam(semester_id, class_id, assessment_id):
            tests += total_test(os.path.join(path, log_name))
    return tests

def main():
    """Run ``total_test_exams`` for the first user found under codebench/.

    The result is discarded; the ``return`` inside the user loop stops the
    walk after a single user.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                total_test_exams(executions_dir, semester_id, class_id)
                # Only the first user is processed.
                return


if __name__ == "__main__":
    main()

## Soma dos tempos médios de submissões de exercícios do tipo exam por aluno

In [None]:
import re
import os

from datetime import datetime

def isexam(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "exam".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "exam"

def to_datetime(date_times):
    """Parse ``'%Y-%m-%d %H:%M:%S'`` strings into ``datetime`` objects, keeping order."""
    return [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in date_times]

def sum_of_variation(date_times):
    """Sum the gaps, in seconds, between consecutive timestamps.

    The list is walked from the last element to the first, adding the
    difference between each element and the one after it.

    ``timedelta.total_seconds()`` is used rather than ``.seconds``: the latter
    only holds the sub-day remainder, so any gap of a day or more was
    under-counted.

    Args:
        date_times: Sequence of ``datetime`` objects.

    Returns:
        The summed gap in seconds (a float), or 0 for an empty sequence.
    """
    if not date_times:
        return 0
    total = 0
    later = date_times[-1]
    for current in reversed(date_times):
        total += (later - current).total_seconds()
        later = current
    return total

def mean_variation_time(date_times):
    """Average gap between the given timestamp strings.

    With fewer than two timestamps the result is 0. Otherwise the strings are
    parsed with ``to_datetime`` and the value of ``sum_of_variation`` is
    divided by the number of timestamps.
    """
    count = len(date_times)
    if count <= 1:
        return 0
    # NOTE(review): the divisor is the number of timestamps, not the number of
    # gaps between them (count - 1) - confirm this is the intended mean.
    return sum_of_variation(to_datetime(date_times)) / count

def extract_time_exercise(path):
    """Collect the text inside the parentheses of each ``== SUBMITION (...)`` line.

    NOTE: other cells of this notebook define a function with this same name
    that matches ``== TEST (...)`` lines instead; whichever cell ran last
    decides which version callers get.

    Returns:
        The captured strings in file order.
    """
    stamps = []
    with open(path, "r") as log:
        for row in log:
            found = re.match(r"== SUBMITION \((.+)\)", row)
            if found:
                stamps.append(found.group(1))
    return stamps

def mean_time_submition_exam(path_executions, semester_id, class_id):
    """Sum, over a student's exam logs, the mean time between submissions.

    For every file in ``path_executions`` whose name (minus the
    ``_<digits>.log`` part) is an exam assessment, the timestamps of its
    ``== SUBMITION (...)`` lines are collected and passed to
    ``mean_variation_time``; the per-file results are added up.

    The timestamps are extracted here instead of through the notebook-level
    ``extract_time_exercise``: that name is redefined by several cells, some
    of which match ``== TEST`` lines, so calling it made this function return
    test timings whenever one of those cells had run more recently.

    Returns:
        The sum rounded to two decimal places.
    """
    total = 0
    for log_name in os.listdir(path_executions):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if not isexam(semester_id, class_id, assessment_id):
            continue
        with open(os.path.join(path_executions, log_name), "r") as log:
            matches = (re.match(r"== SUBMITION \((.+)\)", row) for row in log)
            date_times = [found.group(1) for found in matches if found is not None]
        total += mean_variation_time(date_times)
    return round(total, 2)

def main():
    """Run ``mean_time_submition_exam`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                mean_time_submition_exam(executions_dir, semester_id, class_id)
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

## Soma dos tempos médios de testes de exercícios do tipo exam por aluno

In [None]:
import re
import os

from datetime import datetime

def isexam(semester_id, class_id, assessment_id):
    """Return True when the assessment's recorded "type" is "exam".

    The assessment is looked up in the module-level ``assessments`` mapping;
    a missing semester, class, assessment or "type" entry raises KeyError.
    """
    assessment = assessments[semester_id][class_id][assessment_id]
    return assessment["type"] == "exam"

def to_datetime(date_times):
    """Parse ``'%Y-%m-%d %H:%M:%S'`` strings into ``datetime`` objects, keeping order."""
    return [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in date_times]

def sum_of_variation(date_times):
    """Sum the gaps, in seconds, between consecutive timestamps.

    The list is walked from the last element to the first, adding the
    difference between each element and the one after it.

    ``timedelta.total_seconds()`` is used rather than ``.seconds``: the latter
    only holds the sub-day remainder, so any gap of a day or more was
    under-counted.

    Args:
        date_times: Sequence of ``datetime`` objects.

    Returns:
        The summed gap in seconds (a float), or 0 for an empty sequence.
    """
    if not date_times:
        return 0
    total = 0
    later = date_times[-1]
    for current in reversed(date_times):
        total += (later - current).total_seconds()
        later = current
    return total

def mean_variation_time(date_times):
    """Average gap between the given timestamp strings.

    With fewer than two timestamps the result is 0. Otherwise the strings are
    parsed with ``to_datetime`` and the value of ``sum_of_variation`` is
    divided by the number of timestamps.
    """
    count = len(date_times)
    if count <= 1:
        return 0
    # NOTE(review): the divisor is the number of timestamps, not the number of
    # gaps between them (count - 1) - confirm this is the intended mean.
    return sum_of_variation(to_datetime(date_times)) / count

def extract_time_exercise(path):
    """Collect the text inside the parentheses of each ``== TEST (...)`` line.

    NOTE: other cells of this notebook define a function with this same name
    that matches ``== SUBMITION (...)`` lines instead; whichever cell ran last
    decides which version callers get.

    Returns:
        The captured strings in file order.
    """
    stamps = []
    with open(path, "r") as log:
        for row in log:
            found = re.match(r"== TEST \((.+)\)", row)
            if found:
                stamps.append(found.group(1))
    return stamps

def mean_time_test_exams(path_executions, semester_id, class_id):
    """Sum, over a student's exam logs, the mean time between test runs.

    For every file in ``path_executions`` whose name (minus the
    ``_<digits>.log`` part) is an exam assessment, the timestamps of its
    ``== TEST (...)`` lines are collected and passed to
    ``mean_variation_time``; the per-file results are added up.

    The timestamps are extracted here instead of through the notebook-level
    ``extract_time_exercise``: that name is redefined by several cells, some
    of which match ``== SUBMITION`` lines, so the result used to depend on
    which cell had run last.

    Returns:
        The sum rounded to two decimal places.
    """
    total = 0
    for log_name in os.listdir(path_executions):
        # Raw string: "\_" and "\d" are invalid escapes in a plain literal.
        assessment_id = re.sub(r"\_\d+\.log", "", log_name)
        if not isexam(semester_id, class_id, assessment_id):
            continue
        with open(os.path.join(path_executions, log_name), "r") as log:
            matches = (re.match(r"== TEST \((.+)\)", row) for row in log)
            date_times = [found.group(1) for found in matches if found is not None]
        total += mean_variation_time(date_times)
    return round(total, 2)


def main():
    """Run ``mean_time_test_exams`` for every user of the first class found.

    The per-user results are discarded; the ``return`` after the user loop
    stops the walk once one class has been processed.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")
            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")
                mean_time_test_exams(executions_dir, semester_id, class_id)
            # Only the first class is processed.
            return


if __name__ == "__main__":
    main()

# Função principal

In [None]:
import re
import os
import csv

from pprint import pprint 
# Rows of the output table. The first row is the column header; this cell's
# main() appends one row per user with the values in the same column order.
dataset = [
    ["user_id", "semester_id", "class_id", "total_homework", "total_done_homework",
    "total_subm_homework", "total_test_homework", "mean_time_subm_homework",
    "mean_time_test_homework", "total_exam", "total_done_exam", "total_subm_exam",
    "total_test_exam", "mean_time_subm_exam", "mean_time_test_exam"]
]

def main():
    """Append one row per user to ``dataset`` for every class under codebench/.

    Class-level totals (homework and exam exercise counts, and the lists of
    homework and exam ids) are computed once per class; the per-user figures
    are computed from that user's ``executions`` directory. Row values follow
    the column order of the header row of ``dataset``.
    """
    root = "codebench/"
    for semester_id in os.listdir(root):
        semester_dir = os.path.join(root, semester_id)
        for class_id in os.listdir(semester_dir):
            users_dir = os.path.join(semester_dir, class_id, "users")

            # Shared by every row of this class.
            total_exam = total_exam_per_class(semester_id, class_id)
            total_homework = total_homework_per_class(semester_id, class_id)
            exams = exams_class(semester_id, class_id)
            homeworks = homeworks_class(semester_id, class_id)

            for user_id in os.listdir(users_dir):
                executions_dir = os.path.join(users_dir, user_id, "executions")

                homework_stats = [
                    executions_homework_process(executions_dir, homeworks),
                    total_submitions_homework(executions_dir, semester_id, class_id),
                    total_test_homeworks(executions_dir, semester_id, class_id),
                    mean_time_submition_homework(executions_dir, semester_id, class_id),
                    mean_time_test_homeworks(executions_dir, semester_id, class_id),
                ]
                exam_stats = [
                    total_exercise_done_exam(executions_dir, exams),
                    total_submitions_exam(executions_dir, semester_id, class_id),
                    total_test_exams(executions_dir, semester_id, class_id),
                    mean_time_submition_exam(executions_dir, semester_id, class_id),
                    mean_time_test_exams(executions_dir, semester_id, class_id),
                ]

                dataset.append(
                    [user_id, semester_id, class_id, total_homework, *homework_stats,
                     total_exam, *exam_stats]
                )


if __name__ == "__main__":
    main()

In [None]:
import csv

# Write every row of ``dataset`` (header first) to dataset.csv.
# newline='' leaves line-ending handling to the csv module, as its
# documentation requires; without it some platforms get blank rows between records.
with open('dataset.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for register in dataset:
        employee_writer.writerow(register)