In [None]:
!pip install transformers

In [28]:
import re

import textract
import pandas as pd

In [2]:
text = textract.process('EUtaxonomy.pdf', method='pdfminer').decode()

In [3]:
split_text = re.split(r"\s*?\n\s*?\n\s*?", text)

In [4]:
def cleaning_text(raw_text):
    raw_text = raw_text.replace("\n", " ").replace("  ", " ").strip(" ")
    return re.sub(r'[^\w\s]', '', raw_text).strip(" ")

In [5]:
MIN_PARAGRAPH_CHARACTERS = 200  # Can adjust this value
paragraphs = []
for text_section in split_text:
    clean_text = cleaning_text(text_section)
    if len(clean_text) >= MIN_PARAGRAPH_CHARACTERS:
        paragraphs.append(clean_text)

In [6]:
df = pd.DataFrame({"paragraph": paragraphs})
df.to_csv("paragraphs.csv")

In [7]:
import pandas as pd

df = pd.read_csv("paragraphs.csv")

questions = [
    ["What fuel is used for manufacturing of chlorine?"],
    ["What metric is used for evaluating emission?"],
    ["How can carbon emission of the processes of cement clinker be reduced?"],
    ["How is the Weighted Cogeneration Threshold calculated?"],
    ["What is carbon capture and sequestration?"],
    ["What stages does CCS consist of?"],
    ["What should be the average energy consumption of a water supply system?"],
    ["What are examples of sludge treatments?"],
    ["How is the process of anaerobic digestion?"],
    ["How is reforestation defined?"],
    ["What is the threshold of emssion for inland passenger water transport?"], 
    ["What are the requirements of reporting for electricity generation from natural gas where there might be fugative emissions?"]
]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vector_corpus = vectorizer.fit_transform(df["paragraph"])

In [9]:
from sklearn.metrics.pairwise import linear_kernel

qcontexts = []
for question in questions:
    q_v = vectorizer.transform(question)
    lk_rank = linear_kernel(q_v, vector_corpus).flatten()
    qcontexts.append((question, df["paragraph"][lk_rank.argsort()[-1]]))

In [10]:
import gensim

def read_corpus(text, tokens_only=False):
    for i, line in enumerate(text):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

corpus = list(read_corpus(df["paragraph"].values))
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
qcontext_doc2vec = []
for question in questions:
    q1 = list(read_corpus(question, tokens_only=True))
    inferred_vector = model.infer_vector(q1[0])
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    qcontext_doc2vec.append((question, df["paragraph"][sims[0][0]]))

  sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


In [11]:
for ic,(question, context)  in enumerate(qcontexts):
    print(question[0])
    print(f"tfidf: {context}\n\ndoc2vec: {qcontext_doc2vec[ic][1]}")
    print("-"*20)

What fuel is used for manufacturing of chlorine?
tfidf: For chlorine the value corresponding to an efficient level of electricity consumption was selected as the threshold given that the main source of energy used for the production of chlorine is electricity and by improving the energy efficiency of the process as well as using low carbon electricity sources  the activity can substantially contribute to the climate change mitigation objective

doc2vec: Defined as development andor use of integrated systems ie combination of software and hardware or software applications that minimize resource consumption in other sectors of the economy these digitilasion solution are essential to ensure that other sectors of the economy  agriculture energy transport buildings  meet the eligibility criteria set for other sectors inclusion in the EU Taxonomy
--------------------
What metric is used for evaluating emission?
tfidf: The threshold metric is gCO2e and not an intensity metric such as gCO2e un

In [12]:
MODEL = "distilbert-base-uncased-distilled-squad"
TEST_SAMPLE_SIZE = 1000

In [13]:
import random
import json

with open("dev-v2.0.json") as f:
    data = json.load(f)

def get_qustion_answers_context(data):
    qac = []
    for idata in data["data"]:
        for paragraph in idata["paragraphs"]:
            for question in paragraph["qas"]:
                answers = [answer["text"] for answer in question["answers"]]
                qac.append((question["question"], answers , paragraph["context"]))
    return qac

qac = random.sample(get_qustion_answers_context(data), TEST_SAMPLE_SIZE)

In [14]:
def get_em_scores(qac, qa_model):
    score = []
    for question, answers, context in qac:
        answer = qa_model(question=question, context=context)
        if not answer and not answers:
            score.append(True)
        else:
            score.append(any([answer.lower()==ans.lower() for ans in answers]))
    return score


In [15]:
from transformers import pipeline

qamodel = pipeline("question-answering", model=MODEL, tokenizer=MODEL, device=-1)

def get_answer_pipeline(question, context):
    answer = qamodel(question=question, context=context)
    if answer["score"] < 0.6:
        return ""
    else:
        return answer["answer"].rstrip(".").rstrip(",").lstrip("(").rstrip(")").rstrip(".").strip("'").strip(":")


Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
scores = get_em_scores(qac, get_answer_pipeline)
print(sum(scores)/len(scores))

0.529


In [33]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL)


def get_answer(question, context):
    inputs = tokenizer.encode_plus(question, 
                                   context, 
                                   add_special_tokens=True, 
                                   return_tensors="pt", 
                                   max_length=tokenizer.max_len, truncation=True)
    input_ids = inputs["input_ids"].tolist()[0]

    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs)
        answer_start_scores, answer_end_scores = answer_start_scores.cpu().numpy(), answer_end_scores.cpu().numpy()
        
    answer_start = np.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = np.argmax(
        answer_end_scores
    ) + 1  # Get the most likely end of answer with the argmax of the score
    
    # Normalize logits and spans to retrieve the answer
    start_ = np.exp(answer_start_scores - np.log(np.sum(np.exp(answer_start_scores), axis=-1, keepdims=True)))
    end_ = np.exp(answer_end_scores - np.log(np.sum(np.exp(answer_end_scores), axis=-1, keepdims=True)))
    score = np.mean([start_[0][answer_start], end_[0][answer_end-1]])
    
    if score > 0.9:
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        return answer
    else:
        return ""

In [None]:
scores = get_em_scores(qac, get_answer)
print(sum(scores)/len(scores))


In [19]:
import pandas as pd

df = pd.read_csv("paragraphs.csv")

questions = [
    ["What fuel is used for manufacturing of chlorine?"],
    ["What metric is used for evaluating emission?"],
    ["How can carbon emission of the processes of cement clinker be reduced?"],
    ["How is the Weighted Cogeneration Threshold calculated?"],
    ["What is carbon capture and sequestration?"],
    ["What stages does CCS consist of?"],
    ["What should be the average energy consumption of a water supply system?"],
    ["What are examples of sludge treatments?"],
    ["How is the process of anaerobic digestion?"],
    ["How is reforestation defined?"],
    ["What is the threshold of emssion for inland passenger water transport?"], 
    ["What are the requirements of reporting for electricity generation from natural gas where there might be fugative emissions?"]
]


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

vectorizer = TfidfVectorizer()
vector_corpus = vectorizer.fit_transform(df["paragraph"])


def get_context(question):
    q_v = vectorizer.transform(question)
    lk_rank = linear_kernel(q_v, vector_corpus).flatten()
    return df["paragraph"][lk_rank.argsort()[-1]]

In [21]:
from transformers import pipeline


MODEL = "distilbert-base-uncased-distilled-squad"
qamodel = pipeline("question-answering", model=MODEL, tokenizer=MODEL, device=-1)

def get_answer_pipeline(question, context):
    answer = qamodel(question=question, context=context)
    return answer["answer"].rstrip(".").rstrip(",").lstrip("(").rstrip(")").rstrip(".").strip("'").strip(":")


In [22]:
for question in questions:
    context = get_context(question)
    answer = get_answer_pipeline(question, context)
    print(f"{question[0]}\n\n{answer}\n\n{context}")
    print("-"*100)

What fuel is used for manufacturing of chlorine?

electricity

For chlorine the value corresponding to an efficient level of electricity consumption was selected as the threshold given that the main source of energy used for the production of chlorine is electricity and by improving the energy efficiency of the process as well as using low carbon electricity sources  the activity can substantially contribute to the climate change mitigation objective
----------------------------------------------------------------------------------------------------
What metric is used for evaluating emission?

gCO2e

The threshold metric is gCO2e and not an intensity metric such as gCO2e unit of production as this enables the Taxonomy to be applied by both those reducing emission intensity eg through efficiency while also requiring them to reduce emissions overall  the overall goal
----------------------------------------------------------------------------------------------------
How can carbon emiss