In [15]:
import torch
import PyPDF2
import nltk
from scipy.spatial.distance import cosine
from sklearn.neighbors import KDTree
import numpy as np

# Extracting PDF

In [3]:
file = r"Reports\EliLilly_2021_Environmental_ESG_report.pdf"

In [4]:
pdfReader = PyPDF2.PdfFileReader(file) 

In [5]:
n = len(pdfReader.pages)
txt = ''
for i in range(n):
    temp = pdfReader.pages[i].extractText()
    txt = txt + temp

In [6]:
#removing '\n' tokens
tokens = nltk.sent_tokenize(txt.replace('\n', ''))

# Preprocessing String

In [7]:
#giving new name for preprocess strings
#keeping only sentences with length greater than 3
pre_tokens = []
for sen in tokens:
    if len(sen.split(' ')) > 3:
        pre_tokens.append(sen)

In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")

model = AutoModelForMaskedLM.from_pretrained("climatebert/distilroberta-base-climate-f",output_hidden_states = True)

In [9]:
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [10]:
# input embedding matrix
a = model.roberta.get_input_embeddings().weight

In [11]:
sentences = ['Our investments in efficiency helped us achieve a 22% reduction in the carbon dioxide emitted for each dollar of revenue we earned, compared to 2019',
             'In 2021, we increased the amount of renewable energy in our purchased electricity to 79% compared to 41% in 2020',
             'Taking Root’s CommuniTree reforestation in Nicaragua —the largest such project in the country—partners with farming families to help develop sustainable livelihoods by growing native tree species on marginal farmland. The United Nations and the European Union have used this project as a model for reforestation.']
target = []
for ids,sen in enumerate(sentences):

    marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)


    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)


    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[-1]


    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    target.append(sentence_embedding)

In [12]:
corpus = []
for ids,sen in enumerate(pre_tokens):

    marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)


    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)


    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[-1]


    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    corpus.append(sentence_embedding)

In [16]:
a = torch.stack(corpus)
all_embeddings = a
normed_embeddings = (all_embeddings.T / (all_embeddings**2).sum(axis=1) ** 0.5).T
indexer = KDTree(normed_embeddings)

In [17]:
result=[]
for emb in zip(target,sentences):
    top_10 = indexer.query(emb[0].reshape(1, -1),return_distance = False, k = 10)
#     print(top_20)
    result.append([emb[1],np.array(pre_tokens)[top_10]])
    result.append(['----------------------------------end of example--------------------------------------------------'])

In [18]:
result

[['Our investments in efficiency helped us achieve a 22% reduction in the carbon dioxide emitted for each dollar of revenue we earned, compared to 2019',
  array([['From 2012 to 2020, we achieved a 26% reduction in absolute emissions.',
          'In 2021, we reduced our energy consumption by 2.9%, and we reduced our absolute GHGemissions by 9% compared to 2020.',
          'The solar array is expected toprovide up to 15% of the site’s purchased electricity, resulting in an estimated 2,350 t onne reduction in the site’s annualcarbon footprint.',
          'Become carbon neutral in our own operations (Scope 1 and 2 emissions)Lilly strives to be carbon neutral by 2030, and we are working to drive GHG emissions reductions throughout our operations.',
          'Reducing Our Energy Use EmissionsWe continue to evaluate how to improve our energy resiliency and expand our use of r enewable electricity consistent with our goal t odiversify our energy sources and decrease our GHG emissions over