In [39]:
import os

from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import torch
from torch import Tensor, device

from sentence_transformers import SentenceTransformer, util
from transformers import T5Tokenizer, AutoTokenizer, AutoModel, T5EncoderModel, XLNetModel

from typing import List

from tqdm.auto import tqdm
from tqdm.autonotebook import trange

In [2]:
# Display tensors with two decimals and never in scientific notation.
torch.set_printoptions(precision=2, sci_mode=False)

## Plain old text

In [30]:
def get_chunks(company_name: str, chunk_size: int = 1500, chunk_overlap: int = 200):
    """Load and chunk the first document found under ``pdf/<company_name>``.

    "First" means first in sorted path order, so the same document is picked on
    every run and platform (``os.listdir`` returns entries in arbitrary order).
    Sub-directories and hidden files (e.g. ``.DS_Store``) are never picked.

    Args:
        company_name: Folder name under ``pdf/``; it is also prefixed to every
            chunk as ``"Company: <company_name>. "``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        A ``(content, metadata)`` tuple of two equally long lists: the prefixed
        text of each chunk and the ``metadata`` attribute of each chunk.

    Raises:
        FileNotFoundError: If the folder does not exist or holds no document.
    """
    file_folder_path = f"pdf/{company_name}"

    # Sort so that "the first doc" is reproducible; skip hidden entries, which
    # would otherwise sort in front of the real documents.
    candidates = sorted(
        os.path.join(file_folder_path, name)
        for name in os.listdir(file_folder_path)
        if not name.startswith(".")
    )
    documents = [path for path in candidates if os.path.isfile(path)]
    if not documents:
        raise FileNotFoundError(f"No document found in '{file_folder_path}'")
    doc = documents[0]  # Only the first doc

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True
    )

    loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    # Prefix each chunk with the company name so the chunk text carries it.
    content = [f"Company: {company_name}. " + chunk.page_content for chunk in chunks]
    metadata = [chunk.metadata for chunk in chunks]

    return content, metadata

In [4]:
# Chunk the document found in pdf/novo_nordisk: `content` holds the chunk texts
# (each prefixed with the company name), `metadata` the matching chunk metadata.
content, metadata = get_chunks(company_name="novo_nordisk")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Print the index of every chunk whose text contains "scope 1 emission"
# (case-insensitive substring match).
for _idx, _content in enumerate(content):
    if "scope 1 emission" in _content.lower():
        print(_idx)

251
260
262
265


In [9]:
content[251]

"Company: novo_nordisk. 51,951\n\n42,138\n\n123.3%\n\n2020\n\n28,565\n\n42,138\n\n67.8%\n\nContents Introducing Novo Nordisk Strategic Aspirations Key risks Management Consolidated statements Additional information\n\nNote\n\n2022\n\n2021\n\nStatement of Environmental, Social and Governance (ESG) performance\n\nEnvironmental performance\n\nResources\n\nEnergy consumption for operations (1,000 GJ)\n\nShare of renewable power for production sites Water consumption for production sites (1,000 m3) Breaches of environmental regulatory limit values\n\nEmissions and waste Scope 1 emissions (1,000 tonnes CO2) Scope 2 emissions (1,000 tonnes CO2) Scope 3 emissions (1,000 tonnes CO2)1 Waste from production sites (tonnes)\n\n7.1\n\n7.1\n\n7.2\n\n7.3\n\n7.4\n\n7.4\n\n7.4\n\n7.5\n\n3,677\n\n100%\n\n3,918\n\n75\n\n76\n\n16\n\n2,041\n\n213,505\n\n3,387\n\n100%\n\n3,488\n\n12\n\n77\n\n16\n\nN/A\n\n180,806\n\nfor the year ended 31 December\n\nSocial performance\n\nPatients\n\nPatients reached with Novo

In [10]:
content[260]

'Company: novo_nordisk. In 2022, Scope 1 emissions decreased by 1% compared to 2021 due to an increase in usage of renewable energy sources as a result of two production facilities, in the US and France, having converted to using biogas. Scope 2 emissions were in line with 2021. In 2022, we have expanded our Scope 3 reporting to include all categories of emissions from the GHG protocol relevant to Novo Nordisk. The highest portion of Scope 3 emissions was in purchased goods and services and capital goods. These two categories together make up to 85% of the overall Scope 3 emissions.\n\n– Capital goods2\n\n– Fuel and energy related activities2\n\n– Upstream transportation and distribution2\n\n– Waste generated in operations2\n\n– Business travel\n\n– Employee commuting2\n\n477\n\n55\n\n123\n\n5\n\n55\n\n35\n\nN/A\n\nN/A\n\nN/A\n\nN/A\n\nN/A\n\nN/A\n\n– Downstream transportation and distribution2\n\n37\n\nN/A\n\n7.2 Water consumption for production sites\n\n– End-of-life treatment of sol

## Encoded text

In [20]:
# TODO: add a graph here, covering:
#   - input & output
#   - infra

In [14]:
# High-level embedder for the msmarco-distilbert-base-tas-b checkpoint.
# Downloaded model files are stored in the local "cache" folder.
default_embedder = SentenceTransformer(
    "sentence-transformers/msmarco-distilbert-base-tas-b", 
    cache_folder="cache"
)

.gitattributes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 690/690 [00:00<00:00, 658kB/s]
1_Pooling/config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 204kB/s]
README.md: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.99k/3.99k [00:00<00:00, 3.99MB/s]
config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548/548 [00:00<00:00, 545kB/s]
config_sentence_transformers.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 122/122 [00:00<00:00, 130kB/s]
pytorch_model.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26

In [61]:
# Reference embedding of chunk 251 (one of the "scope 1 emission" hits) produced
# by the high-level SentenceTransformer API.
encoded_content = default_embedder.encode(content[251])

In [62]:
encoded_content.shape

(768,)

In [63]:
encoded_content[:10]

array([-0.2132133 ,  0.11719614,  0.22745958, -0.29696494,  0.16332254,
        0.4249692 ,  0.2892638 , -0.5250201 ,  0.04183745, -0.41405323],
      dtype=float32)

### Why do we encode content into an embedding space?

- Semantic search? Do we need it?!

### 1. Tokenizer

In [74]:
def show_tokens(sentence: str, tokenizer):
    """Print how `sentence` is tokenized.

    The sentence is tokenized with ``padding=True, truncation=True,
    return_tensors='pt'``. The number of resulting token ids is printed first,
    followed by one ``<id> -> <decoded text>`` line per token id.

    Args:
        sentence: Text to tokenize.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            which offers a ``decode`` method.
    """
    encoding = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    # Only one sentence is passed in, so its ids are the first row.
    token_ids = encoding.input_ids[0]

    print("Number of tokens:", len(token_ids))

    for token_id in token_ids:
        print(token_id, "->", tokenizer.decode(token_id))

In [75]:
# Tokenizer loaded from the same checkpoint name as `default_embedder`.
default_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")

In [76]:
# Inspect how chunk 251 is split into tokens by the checkpoint's tokenizer.
show_tokens(sentence=content[251], tokenizer=default_tokenizer)

Number of tokens: 375
tensor(101) -> [CLS]
tensor(2194) -> company
tensor(1024) -> :
tensor(24576) -> novo
tensor(1035) -> _
tensor(13926) -> nord
tensor(20573) -> ##isk
tensor(1012) -> .
tensor(4868) -> 51
tensor(1010) -> ,
tensor(5345) -> 95
tensor(2487) -> ##1
tensor(4413) -> 42
tensor(1010) -> ,
tensor(15028) -> 138
tensor(13138) -> 123
tensor(1012) -> .
tensor(1017) -> 3
tensor(1003) -> %
tensor(12609) -> 2020
tensor(2654) -> 28
tensor(1010) -> ,
tensor(5179) -> 56
tensor(2629) -> ##5
tensor(4413) -> 42
tensor(1010) -> ,
tensor(15028) -> 138
tensor(6163) -> 67
tensor(1012) -> .
tensor(1022) -> 8
tensor(1003) -> %
tensor(8417) -> contents
tensor(10449) -> introducing
tensor(24576) -> novo
tensor(13926) -> nord
tensor(20573) -> ##isk
tensor(6143) -> strategic
tensor(22877) -> aspirations
tensor(3145) -> key
tensor(10831) -> risks
tensor(2968) -> management
tensor(10495) -> consolidated
tensor(8635) -> statements
tensor(3176) -> additional
tensor(2592) -> information
tensor(3602) -> 

### 2. Encoder

In [31]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out tokens.

    Args:
        model_output: Model output whose first element holds all token embeddings.
        attention_mask: Mask that is broadcast over the embedding dimension;
            presumably shaped (batch, tokens) with 1 for real tokens and 0 for
            padding - confirm against the tokenizer output.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum. The
        divisor is clamped to at least 1e-9, so a fully masked row does not
        divide by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)

    return masked_sum / token_counts

In [83]:
# CLS Pooling - Take output from first token
def cls_pooling(model_output):
    """Return, for every sequence in the batch, the hidden state of its first token."""
    first_token_states = model_output.last_hidden_state[:, 0]
    return first_token_states

In [34]:
def batch_to_device(batch, target_device: device):
    """
    Move every tensor value of a batch to `target_device` (CPU/GPU).

    The batch is updated in place and the same object is returned; values that
    are not tensors are left untouched.
    """
    tensor_keys = [key for key in batch if isinstance(batch[key], Tensor)]
    for key in tensor_keys:
        batch[key] = batch[key].to(target_device)
    return batch

In [90]:
def get_batched_embeddings(sentences: List[str], batch_size: int, tokenizer, model, device="cpu"):
    """Embed `sentences` in mini-batches using CLS pooling.

    Args:
        sentences: Texts to embed.
        batch_size: Number of sentences per forward pass; must be >= 1.
        tokenizer: Callable tokenizer, invoked as
            ``tokenizer(batch, padding=True, truncation=True, return_tensors='pt')``.
        model: Encoder called as ``model(**encoded_input)``; its output must
            expose ``last_hidden_state``. The model is moved to `device`.
        device: Device on which the inputs and the model are placed.

    Returns:
        A tensor with one embedding row per sentence, in input order. For an
        empty `sentences` list an empty ``(0, hidden_size)`` tensor is returned,
        where ``hidden_size`` is read from ``model.config`` if present, else 0.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # torch.cat / torch.stack reject an empty list, so answer the empty case directly.
    if len(sentences) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    # The inputs are sent to `device` below; the model has to be on the same
    # device or the forward pass fails with a device mismatch.
    model.to(device)

    batch_embeddings = []

    for start_index in trange(0, len(sentences), batch_size):
        # 1. Tokenize sentences
        batch = sentences[start_index:start_index + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        encoded_input = batch_to_device(encoded_input, device)

        # no_grad: the outputs carry no autograd graph, so no detach() is needed.
        with torch.no_grad():
            # 2. Compute token embeddings -> the same token might have different embeddings due to context.
            model_output = model(**encoded_input)
            print("Shape of embedded tokens:", model_output.last_hidden_state.shape)

            # 3. Perform pooling (CLS pooling: hidden state of the first token)
            batch_embeddings.append(cls_pooling(model_output))

    # Joining the per-batch results along dim 0 gives one row per sentence.
    return torch.cat(batch_embeddings, dim=0)

In [91]:
# Transformer loaded from the same checkpoint name as `default_embedder`;
# pooling is applied manually in get_batched_embeddings.
default_model = AutoModel.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")

In [92]:
# Embed the same chunk (251) with the manual tokenizer -> model -> pooling
# pipeline, to compare against `encoded_content` from SentenceTransformer.encode.
batched_embeddings = get_batched_embeddings(
    sentences=[content[251]],
    batch_size=1,
    tokenizer=default_tokenizer,
    model=default_model
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12it/s]

Shape of embedded tokens: torch.Size([1, 375, 768])





In [93]:
batched_embeddings.shape

torch.Size([1, 768])

In [94]:
batched_embeddings[0].numpy()[:10]

array([-0.2132133 ,  0.11719614,  0.22745958, -0.29696494,  0.16332254,
        0.4249692 ,  0.2892638 , -0.5250201 ,  0.04183745, -0.41405323],
      dtype=float32)

In [95]:
# Sanity check: cosine similarity between the SentenceTransformer embedding and
# the manually computed embedding of the same chunk.
util.cos_sim(encoded_content, batched_embeddings)

tensor([[1.00]])