# Notebook para processar os dados das provas do ENEM de cada ano.

In [1]:
import fitz
from typing import List,Dict
import re
import json
from unidecode import unidecode
from pypdf import PdfReader
import time
import subprocess

## Functions to process all the data

In [2]:
def concat_lines_questions(
    text_page: str
):
    """
    Split the text of one page into groups of lines, one group per question.

    A line opens a new question when, once lower-cased and stripped, it
    starts with "questão" and has fewer than 30 characters. Every following
    line is attached to that question until the next opening line shows up.
    Lines found before the first opening line are discarded.

    Arguments
    -----------------
        text_page (str): string that contains the text relative to the page

    Returns
    -----------------
        List[List[str]]: one list of lines per question; the first element of
        each inner list is the opening line exactly as it appears in the page
    """
    grouped = []
    # Lines of the question being collected; stays None until a header is seen.
    current = None

    for raw_line in text_page.split('\n'):
        normalized = raw_line.lower().strip()
        is_header = len(normalized) < 30 and normalized.startswith("questão")

        if is_header:
            # Close the previous question (if any) and start a new one.
            if current is not None:
                grouped.append(current)
            current = [raw_line]
        elif current is not None:
            current.append(raw_line)

    if current is not None:
        grouped.append(current)

    return grouped

def concat_numbers(
    questions: List[List[str]]
):
    """
    Join a question header with its number when the two were split across
    lines, e.g. ["Questão ", "89", ...] becomes ["Questão  89", ...].

    The merge only happens when the first line starts with "questão "
    (case-insensitive, trailing space included) and the second line, once
    stripped, is numeric. Questions with fewer than two lines are returned
    untouched instead of raising IndexError.

    Arguments:
    -------------
        questions (List[List[str]]): text relative to the questions

    Returns:
    -------------
        List[List[str]]: the questions, with split headers merged into a
        single first line
    """
    new_questions = []
    for question in questions:
        # The length guard comes first so that empty or one-line questions
        # (e.g. a header that is the last line of a page) never get indexed.
        header_is_split = (
            len(question) >= 2
            and question[0].lower().startswith("questão ")
            and question[1].strip().isnumeric()
        )
        if header_is_split:
            merged_header = question[0] + " " + question[1].strip()
            new_questions.append([merged_header] + question[2:])
        else:
            new_questions.append(question)
    return new_questions

def identify_alternatives(
    questions: List[List[str]]
):
    """
    Split each question into its description lines and its alternatives.

    The lines of a question are scanned from the last one to the first. A
    line is an alternative marker when its first character is one of the
    letters A-G and its second character is a space. Lines found below a
    marker (continuation lines) are appended to that alternative. The scan
    stops at the first "A " marker found (the one closest to the end);
    everything above it is the description.

    Each alternative is rendered as "<letter>) <text>" with runs of spaces
    collapsed. Empty and one-character lines are never treated as markers
    (they used to raise IndexError) and are handled as continuation text.

    If no "A " marker is found, the description holds every line of the
    question and the alternatives hold whatever markers were seen.

    Arguments
    ---------------------
        questions (List[List[str]]): text relative to the questions

    Returns
    ---------------------
        List[Dict]: one dict per question with the keys "description"
        (list of lines) and "alternatives" (list of strings)
    """
    letters = ("A", "B", "C", "D", "E", "F", "G")
    new_questions = []

    for question in questions:
        alternatives = []  # collected bottom-up, reversed after the scan
        pending = ""       # text gathered since the last marker was closed
        index_alternatives = None

        for index in range(len(question) - 1, -1, -1):
            line = question[index]
            pending = line + " " + pending

            # Slices (not indexing) so "" and single-character lines are safe.
            if line[:1] in letters and line[1:2] == " ":
                text = pending[0] + ")" + pending[1:]
                text = re.sub(" +", " ", text.strip())
                alternatives.append(text)
                pending = ""

                if line[0] == "A":
                    index_alternatives = index
                    break

        alternatives.reverse()
        new_questions.append({
            "description": question[:index_alternatives],
            "alternatives": alternatives,
        })

    return new_questions
    
    
def ignore_lines(
    questions: List[List[str]]
):
    """
    Drop the lines that carry no useful information for the experiment.

    A line is removed when any of the following holds:
      * it fully matches one of the patterns: a token wrapped in asterisks
        made of upper-case letters/digits, or a single upper-case letter
        optionally followed by spaces;
      * it is exactly '' or ' ';
      * it contains one of the known noise fragments listed below
        (references, section headers, page footers, ...).

    Arguments
    --------------
        questions (List[List[str]]): text relative to the questions

    Returns
    --------------
        List[List[str]]: the questions without the ignored lines, in the
        same order
    """
    # Raw strings: "\*" in a regular string literal is an invalid escape
    # sequence (SyntaxWarning on recent Python versions).
    patterns = [
        re.compile(r"\*[A-Z0-9]*\*"),
        re.compile(r"[A-Z] *"),
    ]
    blank_lines = ('', ' ')
    noise_fragments = (
        'The English Enlightenment',
        'SILVA',
        'São Paulo: Livraria',
        '2006. p. 169 (adaptado)',
        'LINGUAGENS, CÓDIGOS',
        'E SUAS TECNOLOGIAS',
        ' 01 a 05 (opção espanhol)',
        'de 01 a 45',
        'Disponível em: ',
        '(adaptado)',
        '*amar25',
        'd h PROIBIDO ESTACIONAR',
        "| Caderno",
    )

    def is_noise(line: str) -> bool:
        return (
            any(pattern.fullmatch(line) for pattern in patterns)
            or line in blank_lines
            or any(fragment in line for fragment in noise_fragments)
        )

    return [
        [line for line in question if not is_noise(line)]
        for question in questions
    ]
              
    
def concat_description(
   questions: List[Dict]
):
    """
    Collapse the description lines of every question into a single string.

    The first line is stripped and gets a ")" appended. Each following line
    is joined with a newline when the text built so far ends with "." and
    with a single space otherwise. Finally the text is stripped and runs of
    spaces are collapsed into one. The dicts are updated in place.

    Arguments
    ----------------------
        questions (List[Dict]): a list containing a dict of the questions

    Returns
    ----------------------
        List[Dict]: the same dicts, with 'description' replaced by a string
    """
    merged = []

    for item in questions:
        lines = item['description']
        text = lines[0].strip() + ")"

        for extra in lines[1:]:
            # A sentence end keeps the line break; anything else is a wrap.
            separator = "\n" if text.endswith(".") else " "
            text = text + separator + extra

        item['description'] = re.sub(" +", " ", text.strip())
        merged.append(item)

    return merged


    
def put_year_subject_questions(
    questions: List[Dict],
    year: str
):
    """
    Tag every question with its year and with a subject chosen from its
    position in the list. The dicts are updated in place.

    Two layouts are used, selected by the year:
      * year <= 2016: positions 0-44 human sciences, 45-89 natural
        sciences, 90-139 languages, 140+ mathematics;
      * later years: positions 0-49 languages, 50-94 human sciences,
        95-139 natural sciences, 140+ mathematics.

    Arguments
    ----------------------
        questions (List[Dict]): a list containing a dict of the questions
        year (str): year of the exam; must be convertible with int()

    Returns
    ----------------------
        List[Dict]: the same dicts, now with the 'year' and 'subject' keys
    """
    humanas = "Ciências humanas e suas tecnologias"
    natureza = "Ciências da natureza e suas tecnologias"
    linguagens = "Linguagens, códigos e suas tecnologias"
    matematica = "Matemática e suas tecnologias"

    # Each entry is (exclusive upper bound of the position, subject).
    if int(year) <= 2016:
        layout = ((45, humanas), (90, natureza), (140, linguagens))
    else:
        layout = ((50, linguagens), (95, humanas), (140, natureza))

    tagged = []
    for position, item in enumerate(questions):
        item['year'] = year
        # First range that contains the position; mathematics is the tail.
        item['subject'] = next(
            (subject for limit, subject in layout if position < limit),
            matematica,
        )
        tagged.append(item)
    return tagged
    
    
    
def process_exam(
    path_exam: str
):
    """
    Read one exam PDF and run the whole parsing pipeline over it.

    The text of every page is split into questions; then noise lines are
    removed, split question numbers are merged, alternatives are separated
    from the description and the description is collapsed into one string.

    Arguments:
    ------------
        path_exam (str): path to read the pdf of the question

    Returns:
    ------------
        List[Dict]: one dict per question with the keys 'description' (str)
        and 'alternatives' (list of str)
    """
    questions = []
    # The context manager closes the document once the pages are read;
    # previously the handle was left open.
    with fitz.open(path_exam) as document:
        for page in document:
            questions.extend(concat_lines_questions(page.get_text()))

    questions = ignore_lines(questions)
    questions = concat_numbers(questions)
    questions = identify_alternatives(questions)
    questions = concat_description(questions)

    return questions
    

def filter_questions(
    questions: List[Dict]
):
    """
    Keep only the questions that pass every heuristic below; the intent is
    to discard questions that depend on images, equations or tables.

    The heuristics are applied one after the other, each over the questions
    that survived the previous one:
      1. neither the description nor any alternative contains (after
         lower-casing) one of the blocked terms;
      2. the question has at least 5 alternatives;
      3. the ground truth is not "X" (presumably the marker of an annulled
         question - confirm against the ground truth file).

    Arguments:
    ---------------------
        questions (List[Dict]): a list containing a dict of the questions;
            each dict needs the keys 'description' (str), 'alternatives'
            (list of str) and 'ground_truth'

    Returns:
    ---------------------
        List[Dict]: the questions that passed all the heuristics
    """
    blocked_terms = (
        "�",
        "esquema", "imagem", "tabela", "gráfico",
        'figura', 'charge', 'pôster', 'anúncio publicitário',
        'fotografia', 'imagens', 'gráfico', 'quadro',
        'fórmula', 'fórmulas', 'equação', 'mapa',
        'organograma', 'ilustrado', 'tira', 'tirinha',
        'cartum', 'cartaz', 'peça publicitária', 'obra',
        'ilustração', 'mapa', 'obra', 'quadrinho',
        'representado por', 'representado em',
    )

    def not_have_string_heuristic(question: Dict):
        # Every piece of text of the question, lower-cased once.
        haystacks = [question['description'].lower()]
        haystacks.extend(option.lower() for option in question['alternatives'])
        return not any(
            term in text for term in blocked_terms for text in haystacks
        )

    def not_have_enough_alternatives_heuristic(question: Dict):
        return len(question['alternatives']) >= 5

    def remove_anulled_question(question: Dict):
        return question['ground_truth'] != "X"

    kept = questions
    for keep in (
        not_have_string_heuristic,
        not_have_enough_alternatives_heuristic,
        remove_anulled_question,
    ):
        kept = [question for question in kept if keep(question)]

    return kept

def put_ground_truth_questions(questions: List[Dict], year: str):
    """
    Read the ground truth file and attach the expected answer to each
    question. The dicts are updated in place.

    The file "data/ground_truth/ground_truth.json" (path relative to the
    current working directory) is read as UTF-8 and indexed first by the
    year and then by the 1-based position of the question, written as a
    string ("1", "2", ...).

    Arguments:
    ---------------------
        questions (List[Dict]): a list containing a dict of the questions
        year (str): key of the exam year inside the ground truth file

    Returns:
    ---------------------
        List[Dict]: the same dicts, now with the keys 'ground_truth' and
        'index_question' (the 1-based position as a string)

    Raises:
    ---------------------
        KeyError: when the year, or the position of a question, is missing
            from the ground truth file
    """
    # Explicit encoding: the default depends on the platform locale.
    with open("data/ground_truth/ground_truth.json", "r", encoding="utf-8") as file:
        json_responses = json.load(file)

    responses_year = json_responses[year]

    new_questions = []
    for position, question in enumerate(questions, start=1):
        index_str = str(position)
        question['ground_truth'] = responses_year[index_str]
        question['index_question'] = index_str
        new_questions.append(question)
    return new_questions
    
    
    
def process_all_exams(
    exams: Dict[str, Dict[str, str]]
):
    """
    Entry point of the pipeline: process every exam and write one cleaned
    JSON file per year.

    For each year the two PDFs are parsed from "data/raw/<year>/", their
    questions are concatenated (day 1 first), tagged with year/subject and
    ground truth, filtered, and written to "data/cleaned/<year>.json" as
    UTF-8. A summary line with the number of kept questions is printed.

    Arguments
    ---------------
        exams (Dict[str, Dict[str, str]]): maps each year to a dict with
            the keys 'day1' and 'day2', whose values are the pdf file names
            of that year
    """
    # A distinct loop name: the original reused `exams`, shadowing the
    # parameter while it was being iterated.
    for year, exam_files in exams.items():
        day1_questions = process_exam(f"data/raw/{year}/{exam_files['day1']}")
        day2_questions = process_exam(f"data/raw/{year}/{exam_files['day2']}")

        all_questions = day1_questions + day2_questions

        all_questions = put_year_subject_questions(all_questions, year)
        # put the ground truth
        all_questions = put_ground_truth_questions(all_questions, year)
        # filter the questions
        all_questions = filter_questions(all_questions)

        print(f"-> Ano: {year} | Questões: {len(all_questions)}")
        # ensure_ascii=False emits accented characters as-is, so the file
        # encoding must be explicit instead of the platform default.
        with open(f"data/cleaned/{year}.json", "w", encoding="utf-8") as f:
            json.dump(all_questions, f, indent=2, ensure_ascii=False)

In [3]:
# PDF file names of each exam, per year. The files are looked up under
# data/raw/<year>/ by process_all_exams.
all_exams = {
    "2010": {"day1": "dia1_caderno1_azul.pdf", "day2": "dia2_caderno7_azul.pdf"},
    "2011": {"day1": "dia1_caderno1_azul.pdf", "day2": "dia2_caderno5_amarelo.pdf"},
    "2012": {
        "day1": "4a6fb7236b8588c5e7bcd1c5f24132d0.pdf",
        "day2": "8411c969b9bbfc9c08454fb1be37198e.pdf",
    },
    "2013": {
        "day1": "e0e555097eefaba6c44f690496b7314f.pdf",
        "day2": "36b9b0bbcffef377b6b6d1fd70b851a4.pdf",
    },
    "2014": {"day1": "2014_PV_impresso_D1_CD1.pdf", "day2": "2014_PV_impresso_D2_CD5.pdf"},
    "2015": {"day1": "2015_PV_impresso_D1_CD1.pdf", "day2": "2015_PV_impresso_D2_CD5.pdf"},
    "2016": {"day1": "2016_PV_impresso_D1_CD1.pdf", "day2": "2016_PV_impresso_D2_CD5.pdf"},
    "2017": {"day1": "2017_PV_impresso_D1_CD1.pdf", "day2": "2017_PV_impresso_D2_CD5.pdf"},
    "2018": {"day1": "2018_PV_impresso_D1_CD1.pdf", "day2": "2DIA_05_AMARELO_BAIXA.pdf"},
    "2019": {
        "day1": "caderno-de-questoes-1-dia-caderno-1-azul-aplicacao-regular.pdf",
        "day2": "caderno-de-questoes--2-dia-caderno-5-amarelo-aplicacao-regular.pdf",
    },
    "2020": {"day1": "1-dia-caderno1-azul-prova.pdf", "day2": "2-dia-caderno5-amarelo-prova.pdf"},
    "2021": {"day1": "2021_PV_impresso_D1_CD1.pdf", "day2": "2021_PV_impresso_D2_CD5.pdf"},
    "2022": {
        "day1": "1-dia-caderno-1-azul-enem-2022.pdf",
        "day2": "2-dia-caderno-7-azul-enem-2022.pdf",
    },
}

# Run the whole pipeline; one summary line is printed per year.
process_all_exams(all_exams)

-> Ano: 2010 | Questões: 0
-> Ano: 2011 | Questões: 93
-> Ano: 2012 | Questões: 100
-> Ano: 2013 | Questões: 94
-> Ano: 2014 | Questões: 67
-> Ano: 2015 | Questões: 99
-> Ano: 2016 | Questões: 113
-> Ano: 2017 | Questões: 115
-> Ano: 2018 | Questões: 55
-> Ano: 2019 | Questões: 110
-> Ano: 2020 | Questões: 99
-> Ano: 2021 | Questões: 0
-> Ano: 2022 | Questões: 61
