In [1]:
# Install the notebook's third-party dependencies via shell escapes.
# NOTE(review): only openai and pypdf are version-pinned here; the remaining
# packages (and pke, installed from the git repository) are unpinned — consider pinning.
!pip install sense2vec sentence_transformers textwrap3  flashtext strsimpy
!pip install git+https://github.com/boudinfl/pke.git
!python -m spacy download en_core_web_sm
!pip install langchain
!pip install openai==1.3.8
!pip install pypdf==3.17.2

Collecting sense2vec
  Downloading sense2vec-2.0.2-py2.py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting textwrap3
  Downloading textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting strsimpy
  Downloading strsimpy-0.2.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [2]:
# Read the API credentials from Colab's user-data store instead of hard-coding them.
from google.colab import userdata
HF_KEY = userdata.get('HF_KEY')  # passed as huggingfacehub_api_token to the HuggingFaceHub models
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')  # passed as api_key to ChatOpenAI

In [3]:
# Import all the necessary libraries
import warnings
warnings.filterwarnings("ignore")
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
from sense2vec import Sense2Vec
from sentence_transformers import SentenceTransformer
from textwrap3 import wrap
import random
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import pke
import traceback
from flashtext import KeywordProcessor
from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('omw-1.4')
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
import pickle
import time
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import SequentialChain
from langchain import PromptTemplate, OpenAI, LLMChain
import re
import string

In [5]:
from transformers import T5ForConditionalGeneration,T5Tokenizer
from langchain import HuggingFaceHub, LLMChain

# Summarization model (t5-base) called through HuggingFaceHub with beam-search settings.
# NOTE(review): the summary printed by the chain run further down degenerates into
# punctuation noise after a few sentences; the forced "min_length": 600 is a likely
# cause — TODO confirm and lower it if so.
summary_model = HuggingFaceHub(repo_id="t5-base",
                               task="summarization",
                               model_kwargs={
                               "early_stopping":True,
                               "num_beams":3,
                               "num_return_sequences":1,
                               "no_repeat_ngram_size":2,
                               "min_length": 600,
                               "max_length":2000},
                               huggingfacehub_api_token=HF_KEY)

You're using a different task than the one specified in the repository. Be sure to know what you're doing :)


In [6]:
from langchain.prompts import ChatPromptTemplate

# Prompt for the summarization step; the chain fills {text} with the document to summarize.
summary_template = ChatPromptTemplate.from_template(template="Summarize: {text}")

In [7]:
# Question-generation model called through HuggingFaceHub; get_questions calls it
# with a "context: ... answer: ..." prompt, once per keyphrase.
question_model = HuggingFaceHub(repo_id="ramsrigouthamg/t5_squad_v1",
                               task="text2text-generation",
                               model_kwargs={
                               "early_stopping":True,
                               "num_beams":5,
                               "num_return_sequences":1,
                               "no_repeat_ngram_size":2,
                               "max_length": 72},
                               huggingfacehub_api_token=HF_KEY)

In [8]:
# Template with the same "context/answer" layout that get_questions sends to question_model.
# NOTE(review): not referenced anywhere else in the visible notebook — get_questions
# builds the same prompt itself with str.format.
question_template = ChatPromptTemplate.from_template(template="context: {context} answer: {keyword}")

In [9]:
import numpy as np

def get_questions(summarized_text):
  """Build multiple-choice questions from a summary.

  Every keyphrase returned by ``get_nouns_multipartite`` becomes the correct
  answer of one question: ``question_model`` is asked for a question whose
  answer is that keyphrase, and distractors come from
  ``get_distractors_wordnet``, padded with other keyphrases when WordNet
  yields fewer than four.

  The correct answer never appears among the distractors, and padding never
  repeats an option. When there is not enough material for four options the
  question is emitted with fewer options rather than with duplicates.

  Args:
    summarized_text: Text used both as the context given to the question
      model and as the source of the answer keyphrases.

  Returns:
    A list of strings, one per keyphrase. Each string holds the generated
    question on its own line, one labelled option per line, a
    "Correct answer is : (x)" line and a trailing blank line.
  """
  context = summarized_text
  keywords = get_nouns_multipartite(context)

  print(summarized_text)

  alpha_list = ['(a)','(b)','(c)','(d)']
  questions = []
  for keyword in keywords:
    prompt = "context: {context} answer: {keyword}".format(context=context, keyword=keyword)
    question = question_model(prompt)

    # Work on a fresh list and drop any distractor equal to the answer itself
    # (case-insensitively, since WordNet distractors come back capitalised).
    answer_key = keyword.lower()
    distractors = [d for d in get_distractors_wordnet(keyword) if d.lower() != answer_key]

    # Check whether there are enough distractors; if not, pad with other
    # keyphrases that are neither the answer nor already selected.
    if len(distractors) < 4:
      taken = {answer_key}
      taken.update(d.lower() for d in distractors)
      pool = []
      for candidate in keywords:
        if candidate.lower() not in taken:
          taken.add(candidate.lower())
          pool.append(candidate)
      missing = 4 - len(distractors)
      distractors.extend(random.sample(pool, min(missing, len(pool))))

    options = distractors[:4]
    if len(options) < 4:
      # Too few distractors to fill four slots: insert the answer instead of
      # overwriting one of the scarce distractors.
      answer_index = random.randint(0, len(options))
      options.insert(answer_index, keyword)
    else:
      answer_index = random.randint(0, 3)
      options[answer_index] = keyword

    # Keep the question on its own line so downstream formatting can treat the
    # first line as the question and the following lines as options.
    question = question.rstrip() + "\n"
    for label, option in zip(alpha_list, options):
      question = question + label + option + "\n"

    question = question + "Correct answer is : " + alpha_list[answer_index] + "\n\n"

    questions.append(question)

  return questions


def get_nouns_multipartite(content):
    """Extract up to 15 keyphrases from ``content`` with pke's MultipartiteRank.

    Extraction is best effort: if any step fails, the error is reported on
    stdout and an empty list is returned, so callers simply get no keyphrases.

    Args:
        content: English text to analyse.

    Returns:
        A list of keyphrase strings in the order ``get_n_best`` returns them,
        or an empty list when extraction fails.
    """
    # Part-of-speech tags allowed inside a candidate phrase.
    # NOTE(review): despite the function name this admits far more than nouns
    # (verbs, determiners, pronouns, ...) — confirm this is intended.
    pos = {'PROPN', 'NOUN', 'ADJ', 'VERB', 'ADP', 'ADV', 'DET', 'CONJ', 'NUM', 'PRON', 'X'}

    try:
        extractor = pke.unsupervised.MultipartiteRank()
        # No custom stoplist is passed to pke here.
        extractor.load_document(input=content, language='en')
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)
        # Each entry is indexable; element 0 is the phrase text.
        return [entry[0] for entry in keyphrases]
    except Exception as exc:
        # Only ordinary errors are treated as "no keyphrases"; KeyboardInterrupt
        # and SystemExit still propagate.
        print("Keyphrase extraction failed:", exc)
        return []


def get_distractors_wordnet(word, wordnet=None):
    """Return WordNet "sibling" terms of ``word`` to use as distractors.

    The word is lower-cased and its spaces replaced by underscores *before*
    the lookup, so multi-word phrases are queried in the underscore form.
    The first noun synset is taken, then its first hypernym, and the first
    lemma name of each hyponym of that hypernym is collected. The word itself
    is skipped (case-insensitive) and duplicates are dropped.

    Args:
        word: Term (possibly several words) to find distractors for.
        wordnet: Optional object exposing ``synsets(lemma, pos)``; defaults to
            the module-level ``wn`` corpus reader.

    Returns:
        A list of distractor strings with underscores turned into spaces and
        each word capitalised. Empty when the word has no noun synset, the
        synset has no hypernym, or the lookup fails.
    """
    distractors = []
    try:
        corpus = wn if wordnet is None else wordnet

        # Normalise first: lower-case and join words with underscores.
        lookup = "_".join(word.lower().split())

        synsets = corpus.synsets(lookup, 'n')
        if not synsets:
            print ("Wordnet distractors not found")
            return distractors

        hypernyms = synsets[0].hypernyms()
        if not hypernyms:
            return distractors

        for sibling in hypernyms[0].hyponyms():
            name = sibling.lemmas()[0].name()
            # Compare in normalised form so the answer never becomes its own
            # distractor, whatever its capitalisation.
            if name.lower() == lookup:
                continue
            name = " ".join(part.capitalize() for part in name.replace("_", " ").split())
            if name not in distractors:
                distractors.append(name)
    except Exception:
        # Best effort: a failed lookup means "no distractors"; anything
        # collected before the failure is still returned.
        print ("Wordnet distractors not found")
    return distractors

In [10]:
text_1 = "Automobili Lamborghini, the illustrious Italian manufacturer of luxury sports cars and SUVs, is headquartered in the picturesque Sant'Agata Bolognese. This renowned automotive institution boasts a storied legacy, and its contemporary success is firmly underpinned by a fascinating history that has seen it evolve through ownership changes, economic downturns, and groundbreaking innovations.\
Ferruccio Lamborghini, a prominent Italian industrialist with a passion for automobiles, laid the foundation for this iconic marque in 1963. His vision was audacious - to challenge the supremacy of Ferrari, the undisputed titan of Italian sports cars. Under Ferruccio's guidance, Automobili Ferruccio Lamborghini S.p.A. was established, and it immediately began making waves in the automotive world.\
One of the hallmarks of Lamborghini's early years was its distinctive rear mid-engine, rear-wheel-drive layout. This design philosophy became synonymous with Lamborghini's commitment to creating high-performance vehicles. The company's inaugural models, such as the 350 GT, arrived in the mid-1960s and showcased Lamborghini's dedication to precision engineering and uncompromising quality.\
Lamborghini's ascendancy was nothing short of meteoric during its formative decade. It consistently pushed the boundaries of automotive technology and design. However, the heady days of growth were met with a sudden downturn when the world faced the harsh realities of the 1973 global financial crisis and the subsequent oil embargo. Lamborghini, like many other automakers, grappled with plummeting sales and financial instability.\
Ownership of Lamborghini underwent multiple transitions in the wake of these challenges. The company faced bankruptcy in 1978, marking a turbulent chapter in its history. The ownership baton changed hands several times, with different entities attempting to steer the storied brand to calmer waters.\
In 1987, American automaker Chrysler Corporation took the helm at Lamborghini. The Chrysler era saw Lamborghini continue to produce remarkable vehicles like the Diablo while operating under the umbrella of a global conglomerate. However, it was not a permanent arrangement.\
In 1994, Malaysian investment group Mycom Setdco and Indonesian group V'Power Corporation acquired Lamborghini, signaling another phase of transformation for the company. These new custodians brought fresh perspectives and investment to the brand, fueling its resurgence.\
A significant turning point occurred in 1998 when Mycom Setdco and V'Power sold Lamborghini to the Volkswagen Group, which placed the Italian marque under the stewardship of its Audi division. This move brought newfound stability and resources, ensuring Lamborghini's enduring presence in the luxury sports car arena.\
Over the ensuing years, Lamborghini witnessed remarkable expansions in its product portfolio. The V10-powered Huracán captured the hearts of sports car enthusiasts with its exquisite design and formidable performance. Simultaneously, Lamborghini ventured into the SUV market with the Urus, a groundbreaking vehicle powered by a potent twin-turbo V8 engine. This diversification allowed Lamborghini to cater to a broader range of customers without compromising on its commitment to luxury and performance.\
While these successes were noteworthy, Lamborghini was not immune to the challenges posed by global economic fluctuations. In the late 2000s, during the worldwide financial crisis and the subsequent economic downturn, Lamborghini's sales experienced a significant decline, illustrating the brand's vulnerability to external economic factors.\
Despite these challenges, Lamborghini maintained its relentless pursuit of automotive excellence. The company's flagship model, the V12-powered Aventador, reached the pinnacle of automotive engineering and design before concluding its production run in 2022. However, the story does not end here. Lamborghini is set to introduce the Revuelto, a V12/electric hybrid model, in 2024, exemplifying its commitment to embracing cutting-edge technologies and pushing the boundaries of performance.\
In addition to its road car production, Lamborghini has made notable contributions to other industries. The company manufactures potent V12 engines for offshore powerboat racing, further underscoring its prowess in high-performance engineering.\
Interestingly, Lamborghini's legacy extends beyond the realm of automobiles. Ferruccio Lamborghini founded Lamborghini Trattori in 1948, a separate entity from the automobile manufacturer, which continues to produce tractors to this day.\
Lamborghini's rich history is also intertwined with the world of motorsport. In a stark contrast to his rival Enzo Ferrari, Ferruccio Lamborghini decided early on not to engage in factory-supported racing, considering it too expensive and resource-intensive. Nonetheless, Lamborghini's engineers, many of whom were passionate about racing, embarked on ambitious projects, including the development of the iconic Miura sports coupe, which possessed racing potential while being road-friendly. This project marked a pivotal moment in Lamborghini's history, showcasing its ability to create vehicles that could excel on both the track and the road.Despite Ferruccio's reluctance, Lamborghini did make some forays into motorsport. In the mid-1970s, while under the management of Georges-Henri Rossetti, Lamborghini collaborated with BMW to develop and manufacture 400 cars for BMW, a venture intended to meet Group 4 homologation requirements. However, due to financial instability and delays in development, BMW eventually took control of the project, finishing it without Lamborghini's involvement.\
Lamborghini also briefly supplied engines to Formula One teams from 1989 to 1993. Teams like Larrousse, Lotus, Ligier, Minardi, and Modena utilized Lamborghini power units during this period. Lamborghini's best result in Formula One was achieved when Aguri Suzuki finished third at the 1990 Japanese Grand Prix.\
In addition to Formula One, Lamborghini was involved in other racing series. Notably, racing versions of the Diablo were developed for the Diablo Supertrophy, a single-model racing series that ran from 1996 to 1999. The Murciélago R-GT, a production racing car, was created to compete in events like the FIA GT Championship and the American Le Mans Series in 2004, achieving notable results in its racing endeavors.\
Lamborghini's connection with motorsport reflects the brand's commitment to engineering excellence, even though it shied away from factory-backed racing for much of its history.\
Beyond the realms of automotive engineering, Lamborghini has carved a distinct niche in the world of branding. The company licenses its prestigious brand to manufacturers who produce a wide array of Lamborghini-branded consumer goods, including scale models, clothing, accessories, bags, electronics, and even laptop computers. This strategic approach has enabled Lamborghini to extend its brand reach beyond the confines of the automotive industry.\
One fascinating aspect of Lamborghini's identity is its deep connection with the world of bullfighting. In 1962, Ferruccio Lamborghini visited the ranch of Don Eduardo Miura, a renowned breeder of Spanish fighting bulls. Impressed by the majestic Miura animals, Ferruccio decided to adopt a raging bull as the emblem for his burgeoning automaker. This emblem, now iconic, symbolizes Lamborghini's passion for performance, power, and the thrill of the chase.\
Lamborghini's vehicle nomenclature also reflects this bullfighting heritage, with many models bearing the names of famous fighting bulls or bull-related themes. The Miura, named after the Miura bulls, set the precedent, and subsequent models like the Murciélago, Gallardo, and Aventador continued this tradition.\
Furthermore, Lamborghini has enthusiastically embraced emerging automotive technologies, responding to environmental concerns and changing consumer preferences. The Sian, introduced as the company's first hybrid model, showcases Lamborghini's commitment to sustainable performance. With its innovative hybrid powertrain, the Sian combines electric propulsion with a naturally aspirated V12 engine to deliver breathtaking performance while minimizing emissions.\
Looking ahead, Lamborghini has ambitious plans to produce an all-electric vehicle, aligning with the broader industry trend towards electrification. While traditionalists may lament the absence of roaring V12 engines, Lamborghini recognizes the importance of evolving with the times, ensuring that future generations of enthusiasts can experience the thrill of a Lamborghini while contributing to a more sustainable future.\
In summary, Automobili Lamborghini stands as a testament to the enduring allure of Italian craftsmanship and automotive excellence. From its audacious beginnings as a challenger to Ferrari, Lamborghini has weathered storms, embraced innovation, and left an indelible mark on the world of sports cars. Its legacy is one of design brilliance, relentless pursuit of power, and a commitment to pushing the boundaries of what's possible in the realm of high-performance automobiles. Whether through its iconic V12-powered supercars, groundbreaking hybrids, or electrifying visions of the future, Lamborghini continues to captivate the hearts of automotive enthusiasts worldwide, cementing its status as a legendary and iconic brand."


In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

# Structured-output schema describing the translated questions.
# NOTE(review): `parser` and `format_instructions` are built here but are not
# inserted into the prompt template nor used to parse the chain output anywhere
# in the visible notebook.
questions_schema = ResponseSchema(
    name="questions",
    description="Faça a tradução das seguintes questões do Inglês para o Português",
)

response_schemas = [questions_schema]

parser = StructuredOutputParser.from_response_schemas(response_schemas)

format_instructions = parser.get_format_instructions()

# Translation prompt (written in Portuguese): asks for an English -> Portuguese
# translation of {input} that stays faithful to the original wording and is laid
# out as traditional multiple-choice questions (statement first, then the four
# alternatives, each on its own line).
template = """
Faça a tradução do seguinte texto do Inglês para o Português.
Concentre-se em manter a precisão do contexto e do conteúdo original,
evitando adicionar ou alterar palavras que não estejam presentes no texto original.
Lembre-se de mostrar a saída como questões de multipla escolha tradicionais... ou seja, com
enunciado em cima, seguido das quatro alternativas, todas em sua linha específica.

Texto em Inglês: {input}
"""

prompt_template = PromptTemplate.from_template(template=template)

# Chat model used for the translation step.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model="gpt-3.5-turbo")

# Final stage of the pipeline: fills the prompt with the questions and calls the chat model.
translate_chain = LLMChain(llm=llm, prompt=prompt_template)

def format_output(raw_output):
    """Re-emit a block of multiple-choice questions in a uniform layout.

    The text is cut into question blocks wherever two consecutive line breaks
    occur. Within a block the first line is the question and the remaining
    lines are its alternatives. Each block is rebuilt with one extra trailing
    line break and the blocks are joined back with two line breaks.

    Args:
        raw_output: Text containing one or more questions.

    Returns:
        The rebuilt text.
    """
    def _rebuild(block):
        # First line is the question, the rest are the alternatives.
        statement, *choices = block.split('\n')
        return statement + '\n' + '\n'.join(choices) + '\n'

    # Each question is assumed to be separated from the next by two line breaks.
    return '\n\n'.join(_rebuild(block) for block in raw_output.split('\n\n'))

def format_questions(_questions):
    """Run every question through ``format_output`` and concatenate the results.

    Args:
        _questions: Iterable of question strings.

    Returns:
        One string with the formatted questions appended in order, with no
        extra separator; an empty string when there are no questions.
    """
    return "".join(format_output(item) for item in _questions)

  warn_deprecated(


In [12]:
def limpeza(pages):
  """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

  Args:
    pages: Documents to split; the readers in this notebook pass the result
      of a loader's ``load()``.

  Returns:
    Whatever ``split_documents`` returns for ``pages`` with chunk_size=1500,
    chunk_overlap=200 and the separator cascade paragraph, line, space,
    empty string.
  """
  splitter_options = {
      "chunk_size": 1500,
      "chunk_overlap": 200,
      "separators": ["\n\n", "\n", " ", ""],
  }
  return RecursiveCharacterTextSplitter(**splitter_options).split_documents(pages)

def _load_as_text(loader):
    """Load documents from ``loader``, chunk them with ``limpeza`` and join the chunk texts."""
    chunks = limpeza(loader.load())
    # NOTE(review): limpeza splits with chunk_overlap=200, so concatenating the
    # chunks back together may repeat text shared by adjacent chunks — confirm
    # this is intended.
    return "".join(chunk.page_content for chunk in chunks)


def web_reader(caminho):
    """Return the text of the web page at ``caminho``.

    Args:
        caminho: Location handed to ``WebBaseLoader``.

    Returns:
        The concatenated ``page_content`` of every chunk produced by ``limpeza``.
    """
    return _load_as_text(WebBaseLoader(caminho))


def text_reader(caminho):
    """Return the text of the plain-text file at ``caminho``.

    Args:
        caminho: Path handed to ``TextLoader``.

    Returns:
        The concatenated ``page_content`` of every chunk produced by ``limpeza``.
    """
    return _load_as_text(TextLoader(caminho))


def pdf_reader(caminho):
    """Return the text of the PDF file at ``caminho``.

    Args:
        caminho: Path handed to ``PyPDFLoader``.

    Returns:
        The concatenated ``page_content`` of every chunk produced by ``limpeza``.
    """
    return _load_as_text(PyPDFLoader(caminho))

In [13]:
# File extensions (without the dot) grouped by kind; load_file compares the
# last dot-separated part of a path against text_mimetypes.
text_mimetypes = ['txt', 'doc', 'docx', 'rtf', 'md', 'html']
# NOTE(review): audio_mimetypes is not referenced anywhere else in the visible notebook.
audio_mimetypes = ['mp3', 'wav', 'ogg', 'flac', 'aac', 'opus']

In [14]:
def load_file(file_path):
    """Load a document and return its text, choosing the reader by file extension.

    The extension is taken from the final path component and matched
    case-insensitively.

    Args:
        file_path: Path of the file to load.

    Returns:
        The text returned by ``pdf_reader`` for ``.pdf`` files, or by
        ``text_reader`` for extensions listed in ``text_mimetypes``.

    Raises:
        ValueError: If the extension is missing or not supported. Without this
            check the function would fail on an unassigned local variable.
    """
    extension = os.path.splitext(file_path)[1].lstrip(".").lower()

    if extension == "pdf":
        return pdf_reader(file_path)

    if extension in text_mimetypes:
        return text_reader(file_path)

    raise ValueError(
        "Unsupported file type {!r} for {!r}".format(extension, file_path)
    )

In [None]:
# Load a local PDF into `text`.
# NOTE(review): this cell has no execution count, the path is an absolute
# hard-coded location, and `text` is not what the chain run below is invoked
# with (that run uses text_1).
text = load_file("/content/Writing_Geography.pdf")

In [16]:
from operator import itemgetter
from langchain_core.runnables import RunnableLambda

# End-to-end pipeline: summarize the text, generate multiple-choice questions
# from the summary, merge them into one string, then translate to Portuguese.
chain = (summary_template
         | summary_model
         | RunnableLambda(get_questions)
         | RunnableLambda(format_questions)
         | translate_chain
         )

# Run the pipeline on the sample article; the result is indexed with ["text"] in the next cell.
final_questions = chain.invoke({"text": text_1})

Automobili Lamborghini, the illustrious italian manufacturer of luxury sports cars and SUVs, is headquartered in the picturesque Sant'Agata Bolognese . it has a fascinating history that has seen it evolve through ownership changes, economic downturns and groundbreaking innovations. the company's rise was nothing short of meteoric during its formative decade, but it also faced bankruptcy in 1978 - marking the end of the automotive institution. in 1994, Malaysian investment group Mycom Setdco and Indonesian.  ­­.­-­n­l­  l w­s­, n gra­­t  ,- s--n-.- an h   '  "  t--s  r &  de re  ...­[­» e­...[... [[ »­­&­*­;­–­_­—­/­?­'­ "[**" ; __[_(*[-[([&["['((''[?[.[([d[][n[g[y[m[i[f[h[e[s[w[l[j[t[en[,[b[c[a[r[v[in[u[p[o'-(-'&&/-_-&-//_/[/../'_ed-f(_&_f.&'/(/& (()/ ([1&#[19[4&;[##&*&,(&)&e.,'.' ('L[)) d'ee' and f&n&o / m&( )- ( (& "&" "( (.) "' " ". "" ()'"( "e""- "l"'f'n't"s'y'i'l'h'in'a'd")s "nnel)"&l. (-)(.). ( : ( "------- --- "---(" -- "/) -- y", " (/ ",) (".(?) ("."). "... "_ "? ").? ? ().....

  warn_deprecated(


Wordnet distractors not found
Wordnet distractors not found
Wordnet distractors not found
Wordnet distractors not found
Wordnet distractors not found


In [17]:
# Show the translated questions produced by the last stage of the chain.
print(final_questions["text"])

Pergunta: A Lamborghini evoluiu através de mudanças de propriedade, recessões econômicas e o quê?
(a) suvs
(b) aumento
(c) t--s
(d) inovações revolucionárias
Resposta correta: (d)

Pergunta: A história da Lamborghini viu ela evoluir através de mudanças de propriedade, inovações revolucionárias e o quê mais?
(a) recessões econômicas
(b) suvs
(c) inovações revolucionárias
(d) aumento
Resposta correta: (a)

Pergunta: Qual foi o nome do grupo de investimento que comprou a Lamborghini em 1994?
(a) suvs
(b) recessões econômicas
(c) inovações revolucionárias
(d) grupo de investimento malaio Mycom Setdco
Resposta correta: (d)

Pergunta: Além do grupo de investimento malaio Mycom Setdco, qual outro grupo se juntou à Lamborghini em 1994?
(a) indonésio
(b) altaico
(c) armênio
(d) bengali
Resposta correta: (a)

Pergunta: O que aconteceu com a Automobili Lamborghini durante sua década formativa?
(a) aumento
(b) suvs
(c) indonésio
(d) recessões econômicas
Resposta correta: (a)

Pergunta: Que tipo de

In [None]:
def format_output(raw_output):
    """Re-emit a block of multiple-choice questions in a uniform layout.

    NOTE: this redefines the ``format_output`` declared in an earlier cell;
    unlike that version it does not append an extra trailing line break to
    each question block.

    The text is cut into question blocks wherever two consecutive line breaks
    occur. Within a block the first line is the question and the remaining
    lines are its alternatives. Blocks are rebuilt and joined back with two
    line breaks.

    Args:
        raw_output: Text containing one or more questions.

    Returns:
        The rebuilt text.
    """
    def _rebuild(block):
        # First line is the question, the rest are the alternatives.
        statement, *choices = block.split('\n')
        return statement + '\n' + '\n'.join(choices)

    # Each question is assumed to be separated from the next by two line breaks.
    return '\n\n'.join(_rebuild(block) for block in raw_output.split('\n\n'))