In [30]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (the recorded run reports it as already up to date).
nltk.download('stopwords')
# Portuguese stopwords held as a set; preprocess_text filters tokens against it.
stop_words = set(stopwords.words('portuguese'))

def preprocess_text(text):
    """Normalise raw text before TF-IDF vectorisation.

    Steps, in order: lowercase, drop digit runs, drop one- and two-character
    words, drop every character that is neither a word character nor
    whitespace, then drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: Raw text. Any non-string value (for example the float NaN that
            pandas produces for an empty CSV cell) is treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN, which has no .lower(); index them as empty documents.
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Short words are removed before punctuation, so pieces of hyphenated or
    # dotted tokens are judged individually (e.g. "a-b" disappears entirely).
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Location of the ENEM playlist corpus (a relative file path, despite the variable name).
url = "../data/data_playlists_enem.csv"
# Read the CSV and swap the raw `content` column for its preprocessed form in one expression.
df = pd.read_csv(url).assign(
    content=lambda frame: frame['content'].apply(preprocess_text)
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer is constructed with no arguments, so every one of its
# parameters keeps its library default; none of them is visible in this call.
vectorizer = TfidfVectorizer()
# Fit on the preprocessed ENEM `content` column and keep the resulting matrix in X.
X = vectorizer.fit_transform(df['content'])

In [57]:
import pickle
from pydantic import BaseModel

# Same stopword download and Portuguese stopword set as the first cell.
# Retriever.preprocess_text reads this module-level `stop_words`.
# NOTE(review): the retrievers built further down also index English and Spanish
# documents, and the recorded CUET output still contains words such as "the" —
# confirm that Portuguese-only stopwords are intended for those corpora.
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))

class Output(BaseModel):
    """One retrieval hit: a document's text together with its score for the query."""
    # Document text as stored in the retriever's corpus (preprocessed when the corpus is loaded).
    content: str
    # Score of this document against the query; Retriever returns hits highest score first.
    relevance: float

class Retriever:
    """TF-IDF retriever over the ``content`` column of a CSV corpus.

    The corpus is read and normalised with :meth:`preprocess_text`. A pickled
    vectorizer is loaded from ``path_model``; when that fails, a new
    ``TfidfVectorizer`` is fitted on the corpus and saved to the same path.
    Queries are ranked by the dot product between the query vector and each
    document vector.
    """

    def __init__(self, path_csv, path_model, stop_words=None):
        """Load the corpus and obtain a vectorizer for it.

        Args:
            path_csv: CSV file with a ``content`` column of raw text.
            path_model: Pickle file holding a fitted vectorizer. If it cannot
                be loaded or applied, a new vectorizer is fitted and written
                to this path.
            stop_words: Optional iterable of words to drop during
                preprocessing (e.g. an English or Spanish list for
                non-Portuguese corpora). ``None`` keeps the previous
                behaviour of using the module-level ``stop_words`` set.
        """
        self.stop_words = None if stop_words is None else set(stop_words)
        self.df = pd.read_csv(path_csv)
        self.df['content'] = self.df['content'].apply(self.preprocess_text)
        try:
            # NOTE(security): pickle.load can execute arbitrary code embedded in
            # the file; only point path_model at files you created yourself.
            with open(path_model, 'rb') as f_in:
                self.vectorizer = pickle.load(f_in)
            self.X = self.vectorizer.transform(self.df['content'])
        except Exception as exc:
            # `Exception` instead of a bare `except` so that KeyboardInterrupt
            # and SystemExit are not mistaken for a missing model.
            if isinstance(exc, FileNotFoundError):
                print('Model not found. Creating a new one...')
            else:
                print(f'Could not load model ({exc!r}). Creating a new one...')
            self.vectorizer = TfidfVectorizer()
            self.X = self.vectorizer.fit_transform(self.df['content'])
            print('Model created.')
            print('Saving model...')
            self.save_model(filename=path_model)

    def invoke(self, query, k=3):
        """Rank the corpus against ``query`` and return the best ``k`` rows.

        Args:
            query: Query text. It is handed to the vectorizer as given; it is
                not run through :meth:`preprocess_text`.
            k: Maximum number of documents to return.

        Returns:
            An object-dtype array of shape ``(m, 2)`` where ``m`` is at most
            ``k``: column 0 is the preprocessed document text, column 1 its
            score, ordered from highest to lowest score. An empty corpus
            yields an array of shape ``(0, 2)``.
        """
        Q = self.vectorizer.transform([query])
        scores = np.asarray(self.X.dot(Q.T).toarray()).ravel()

        # Same ordering as before (ascending argsort, reversed), but the top-k
        # positions are sliced directly instead of building an (index, score)
        # row for every document in a Python loop.
        top_idxs = np.argsort(scores)[::-1][:k]

        output_content = self.df.iloc[top_idxs][['content']].values
        output_relevance = scores[top_idxs].reshape(-1, 1)
        return np.concatenate([output_content, output_relevance], axis=1)

    def query(self, query, k=3):
        """Return the top ``k`` matches for ``query`` as a list of ``Output`` models."""
        return [
            Output(content=content, relevance=relevance)
            for content, relevance in self.invoke(query, k)
        ]

    def preprocess_text(self, text):
        """Normalise raw text before vectorisation.

        Lowercases, removes digit runs, removes one- and two-character words,
        removes characters that are neither word characters nor whitespace,
        and finally removes stop words.

        Args:
            text: Raw text. Non-string values (such as the float NaN pandas
                yields for an empty CSV cell) are treated as empty text.

        Returns:
            The surviving tokens joined by single spaces.
        """
        if not isinstance(text, str):
            # Missing cells have no .lower(); index them as empty documents.
            return ''
        # Instance-level stop words win; otherwise use the module-level set.
        active_stop_words = getattr(self, 'stop_words', None)
        if active_stop_words is None:
            active_stop_words = stop_words
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\b\w{1,2}\b', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        return ' '.join(word for word in text.split() if word not in active_stop_words)

    def save_model(self, filename='../vectorstore/tfidf_model.pkl'):
        """Pickle the current vectorizer to ``filename``."""
        # Save the TF-IDF model to a .pkl file.
        with open(filename, 'wb') as f_out:
            pickle.dump(self.vectorizer, f_out)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
# Directory holding both the corpus CSVs and the pickled TF-IDF models.
VECTORSTORE_DIR = "../vectorstore"

# exam key -> (corpus CSV file name, pickled model file name), both inside VECTORSTORE_DIR.
CORPUS_FILES = {
    "enem": ("data_playlists_enem.csv", "tfidf_model_enem.pkl"),
    "cuet": ("cuet_edital.csv", "tfidf_model_cuet_edital.pkl"),
    "exames": ("exames_nacionais_edital.csv", "tfidf_model_exames_nacionais_edital.pkl"),
    "exani": ("exani_edital.csv", "tfidf_model_exani_edital.pkl"),
    "icfes": ("icfes_edital.csv", "tfidf_model_icfes_edital.pkl"),
    "sat": ("sat_edital.csv", "tfidf_model_sat_edital.pkl"),
}

# One Retriever per corpus, built in the order listed above.
retrievers = {
    exam: Retriever(
        path_csv=f"{VECTORSTORE_DIR}/{csv_name}",
        path_model=f"{VECTORSTORE_DIR}/{model_name}",
    )
    for exam, (csv_name, model_name) in CORPUS_FILES.items()
}

# Individual names kept because the query cells refer to them directly.
retrieve_enem = retrievers["enem"]
retrieve_cuet = retrievers["cuet"]
retrieve_exames = retrievers["exames"]
retrieve_exani = retrievers["exani"]
retrieve_icfes = retrievers["icfes"]
retrieve_sat = retrievers["sat"]

In [66]:
# Smoke test for the ENEM index: a Portuguese passage about Dom João VI and the
# royal family's voyage to Brazil. Five hits are requested but only the text of
# the best one is displayed.
# NOTE(review): this cell shows execution count 66 while the cells after it show
# 61-65, so the notebook was last run out of order; re-run top to bottom before
# relying on the recorded outputs.
query = "Dom João VI Ele veio para cá negociando inclusive com a Inglaterra né para vocês saberem a Marinha britânica que fez a escolta da família real até o Brasil chegaram ao Brasil você vai encontrar em vários livros falando que que você teve problema ali de de desinteria problema de piolho né o pessoal teve de arreia mesmo m né piolho mas na verdade isso tudo aconteceu porque a viagem não era tão rápida e é a primeira vez que uma nobreza portuguesa viaja a sua colônia tá a colônia e a sua colônia mais Próspera pelo menos então eles reclamaram de tudo reclamaram do desconforto da viagem porque a nobreza"

retrieve_enem.query(query, k=5)[0].content

'title período joanino enem views count publish date author parabólica description unknown link video httpswwwyoutubecomwatchfcr salve salve gente tudo bem bemvindos vídeo parabólica hoje dia videoaula história brasil momento decisivo vamos falar hoje período joanino finalzinho processo colonial brasileiro quer dizer próximas aulas gente vai entrando período império brasil independente vou enrolar gente começa falando sobre período chamado período joanino período aqui brasil vai período joanino simples família real portuguesa veio residir brasil durante período vieram quê vieram porque fugindo napoleão bonaparte imperador francês bom gente fez aula aqui sobre napoleão guerras napoleônicas então assistiu sabe bem acontecendo napoleão decretou bloqueio continental inglat terra nenhuma nação poderia continuar comercializando inglaterra desobedecesse invadido tropas francesas portugal condições condições romper inglaterra porque portugal dependia economicamente inglaterra inclusive devia p

In [61]:
# Query the CUET index with an English passage on CUET (UG) 2025 eligibility
# and list the five best matches with their scores.
query = """Eligibility 
For appearing in the CUET (UG) - 2025, there is no age limit for the candidates. The candidates 
who have passed the class 12 /equivalent examination or are appearing in 2025 can appear in 
the CUET (UG) - 2025 examination. However, the candidates will be required to fulfill the age 
criteria  (if  any)  of  the  University  /  Institution  /  Organization  in  which  they  are  desirous  of 
taking admission. """

retrieve_cuet.query(query, k=5)

[Output(content='chapter eligibility eligibility appearing the cuet there age limit the candidates the candidates who have passed the class equivalent examination are appearing can appear the cuet examination however the candidates will required fulfill the age criteria any the university institution organization which they are desirous taking admission list qualifying examinations the final examination the system conducted any recognized central state board such the central board secondary education new delhi council the indian school certificate examinations new delhi', relevance=0.7555210975674028),
 Output(content='tests subject covered under cuet the list languages domain specific subjects and general aptitude test covered cuet given appendix duration test duration each test paper would minutes the xamination will conducted multiple shifts depending the number candidates and their combinations compensatory time pwbd candidates minutes each hour chapter eligibility eligibility appe

In [62]:
# The CUET eligibility passage again, this time against the retriever built from
# exames_nacionais_edital.csv. `query` is rebound here, so the cell does not
# depend on an earlier query cell having run.
query = """Eligibility 
For appearing in the CUET (UG) - 2025, there is no age limit for the candidates. The candidates 
who have passed the class 12 /equivalent examination or are appearing in 2025 can appear in 
the CUET (UG) - 2025 examination. However, the candidates will be required to fulfill the age 
criteria  (if  any)  of  the  University  /  Institution  /  Organization  in  which  they  are  desirous  of 
taking admission. """

retrieve_exames.query(query, k=5)

[Output(content='norma jne instruções realização classificação reapreciação reclamação norma jne instruções realização classificação reapreciação reclamação ficha técnica título norma jne instruções realização classificação reapreciação reclamação provas exames ensino básico ensino secundário autores ana cláudia soeiro clara romano isabel rebelo maria elvira monteiro raquel dionísio ricardo patrão rui ferreira coordenação luís duque almeida capa isabel espinheira composição direçãogeral educação júri nacional exames edição maio', relevance=0.0),
 Output(content='alunos ensino secundário acesso ensino superior reclamação apenas pode incidir sobre questões objeto reapreciação quer alegadas aluno quer tendo sido alegadas mereceram alteração classificação parte professor relator norma jne instruções realização classificação reapreciação reclamação modelo alegação justificativa reclamação continuação número suposto prova processo reclamação preencher jne fundamentação pedido reclamação cont

In [63]:
# The CUET eligibility passage against the retriever built from exani_edital.csv;
# the five best matches are listed with their scores.
query = """Eligibility 
For appearing in the CUET (UG) - 2025, there is no age limit for the candidates. The candidates 
who have passed the class 12 /equivalent examination or are appearing in 2025 can appear in 
the CUET (UG) - 2025 examination. However, the candidates will be required to fulfill the age 
criteria  (if  any)  of  the  University  /  Institution  /  Organization  in  which  they  are  desirous  of 
taking admission. """

retrieve_exani.query(query, k=5)

[Output(content='pounds located the final clause the preceding ragraph there the author questions the validity the millerurey experiment the grounds that their theory the origin life would have required methane and monia have been present essential elements the production amino acids opción argumentación amino acids incorrect according the text amino acids would the result the presence methane and ammonia early earth atmosphere respuesta correcta guía sustentante examen nacional ingreso educación superior exani subárea comprensión lectora tema leer orientarse', relevance=0.627186572128484),
 Output(content='opción argumentación arrive since incorrect the preposition used this particular context indicate where something someone particularly the case cities countries continents the other hand preposition used indicate position point place which not the case this segment the use the simple present form the verb arrive not appropriate given the time sequence the segment totally the past si

In [64]:
# The CUET eligibility passage against the retriever built from icfes_edital.csv;
# the five best matches are listed with their scores.
query = """Eligibility 
For appearing in the CUET (UG) - 2025, there is no age limit for the candidates. The candidates 
who have passed the class 12 /equivalent examination or are appearing in 2025 can appear in 
the CUET (UG) - 2025 examination. However, the candidates will be required to fulfill the age 
criteria  (if  any)  of  the  University  /  Institution  /  Organization  in  which  they  are  desirous  of 
taking admission. """

retrieve_icfes.query(query, k=5)

[Output(content='marcarla hoja respuestas presentación especificacionescontenido características desea familiarizarse con las cinco pruebas del examen saber con los tipos preguntas consulte caja herramientas cual contiene marcos referencia infografías cuadernillos preguntas ejemplos preguntas explicadas más subdirección diseño instrumentos dirección evaluación calle torre piso edificio elemento bogotá colombia wwwicfesgov líneas atención ciudadano bogotá tel pbx', relevance=0.0),
 Output(content='del texto evaluado debe seleccionar respuesta correcta cada pregunta cuatro opciones marcarla hoja respuestas con base texto evalúa conocimiento gramatical lexical ejercicio consiste elegir las palabras más adecuadas completar texto ello debe seleccionar palabra correcta decir aquella completa cada uno los espacios texto presentado cuatro opciones dadas marcarla hoja respuestas presentación especificacionescontenido características desea familiarizarse con las cinco pruebas del examen saber co

In [65]:
# The CUET eligibility passage against the retriever built from sat_edital.csv;
# the five best matches are listed with their scores.
query = """Eligibility 
For appearing in the CUET (UG) - 2025, there is no age limit for the candidates. The candidates 
who have passed the class 12 /equivalent examination or are appearing in 2025 can appear in 
the CUET (UG) - 2025 examination. However, the candidates will be required to fulfill the age 
criteria  (if  any)  of  the  University  /  Institution  /  Organization  in  which  they  are  desirous  of 
taking admission. """

retrieve_sat.query(query, k=5)

[Output(content='into bluebook from another program application certain testing accommodations may allow exceptions this rule internet connection required start the test and submit your answers the end the test you will receive instructions from your proctor test day there internet outage there are sections the sat the first section reading and writing and the second section math each section the sat has modules parts once you leave module you cannot return bluebook has timer that will count down the minutes and seconds remaining each module you can hide the timer until the last minutes the module', relevance=0.18691485777976885),
 Output(content='section and the intent the parties that the faa will preempt all state laws the fullest extent permitted law arbitration may maintained class collective action party may bring claim only their own behalf and cannot seek relief that would affect other individuals unless all parties agree otherwise the arbitrator will not have the authority con