In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

from sentence_transformers import InputExample
from sentence_transformers import datasets
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses
from sentence_transformers import SentenceTransformer, util 
import torch

from tqdm.auto import tqdm
from pathlib import Path

from my_util import get_chunks, get_topk_similarity

  from .autonotebook import tqdm as notebook_tqdm


## Synthesize queries given content

In [2]:
default_query = "what are scope 1 emissions?"

In [3]:
content, metadata = get_chunks(company_name="novo_nordisk")

In [4]:
def synthesize_queries(paragraphs, num_queries=3, debug=False):
    tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
    model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')

    # Some layers in the model behave differently during training and inference.
    # To ensure the model is running in “inference mode”, we call model.eval().
    model.eval()

    pairs = []
    file_count = 0

    # set to no_grad as we don't need to calculate gradients for back prop
    with torch.no_grad():
        # loop through each passage individually
        for p in tqdm(paragraphs):
            p = p.replace('\t', ' ')
            # create input tokens
            input_ids = tokenizer.encode(p, return_tensors='pt')
            # generate output tokens (query generation)
            outputs = model.generate(
                input_ids=input_ids,
                max_length=64,
                do_sample=True,
                top_p=0.95,
                num_return_sequences=num_queries
            )
            # decode output tokens to human-readable language
            for output in outputs:
                query = tokenizer.decode(output, skip_special_tokens=True)
                # append (query, passage) pair to pairs list, separate by \t
                pairs.append(query.replace('\t', ' ') + '\t' + p)

            # once we have 200 pairs write to file
            if len(pairs) > 150:
                # save as tsv
                with open(f'data/pairs_{file_count}.tsv', 'w', encoding='utf-8') as fp:
                    fp.write('\n'.join(pairs))
                file_count += 1
                pairs = []

                if debug:
                    print(f"Paragraph:\n{p}")
                    print("\nGenerated queries:")
                    for idx, output in enumerate(outputs):
                        query = tokenizer.decode(output, skip_special_tokens=True)
                        print(f'{idx + 1}: {query}')
                    
                    

    if pairs is not None:
        # save the final, smaller than 1024 batch
        with open(f'data/pairs_{file_count}.tsv', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(pairs))

        if debug:
            print(f"Paragraph:\n{p}")
            print("\nGenerated queries:")
            for idx, output in enumerate(outputs):
                query = tokenizer.decode(output, skip_special_tokens=True)
                print(f'{idx + 1}: {query}')

In [5]:
synthesize_queries(
    paragraphs=content, 
    num_queries=3, 
    debug=True,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
 16%|████████████████████▋                                                                                                                | 51/327 [03:31<18:40,  4.06s/it]

Paragraph:
Company: novo_nordisk. portfolio and US insulin portfolio by incorporating them into

according to the Sarbanes-Oxley Act by an independent audit

our ESG statement (please refer to note 8.6 on US pricing).

We have implemented a set of data and artificial intelligence

firm elected at the Annual General Meeting. As part of our

ethics principles in our Global Ethics and Compliance

ESG responsibility, we voluntarily include an Assurance Report

We continue working on implementing recommendations

Framework. These principles define Novo Nordisk’s ethical

from an independent external auditor for ESG reporting in

from the Taskforce on Climate-related Financial Disclosures

data management across the Group and aim to promote a

the Annual Report. The assurance provider reviews whether

(TCFD), taking a stepwise approach to incorporating material

sound and ethical data culture within Novo Nordisk and in

the consolidated ESG statement is accurately presented.

climate-related

 31%|█████████████████████████████████████████▏                                                                                          | 102/327 [07:21<16:49,  4.49s/it]

Paragraph:
Company: novo_nordisk. 650

cash value of up to DKK 28 billion. The total programme

550

may be reduced in size if significant business development

opportunities arise during 2023. Novo Nordisk expects

450

to conduct the majority of the new share repurchase

350

programme according to the safe harbour rules in MAR. At

Jan

Feb

Mar

Apr

May

Jun

Jul

Aug

Sep

Oct

Nov

Dec

the Annual General Meeting in March 2023, the Board of

Directors will propose a further reduction in the company’s

4. OMXC25 and pharmaceutical industry development have been rebased to Novo Nordisk share price in January 2022. 5. Abbvie, Amgen, AstraZeneca, Biogen Idec Inc, Bristol-Myers Squibb, Eli Lilly & Co., Gilead Sciences, Glaxo Smith Kline, Johnson & Johnson, Lundbeck, Merck & Co, Novartis AG, Pfizer, Roche and Sanofi-Aventis SA.

B share capital, corresponding to approximately 1.1% of the

total share capital, by cancelling 25 million treasury shares.

Share price development

From end

 44%|██████████████████████████████████████████████████████████▏                                                                         | 144/327 [10:26<12:58,  4.25s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors
 47%|█████████████████████████████████████████████████████████████▊                                                                      | 153/327 [11:55<27:04,  9.34s/it]

Paragraph:
Company: novo_nordisk. Remuneration to Executive Management and Board of Directors

DKK million

2022

2021

2020

2.6 Income taxes and deferred income taxes

The deviation in foreign subsidiaries' tax rates from the Danish tax rate is mainly driven by Swiss and US business activities. Other adjustments consist of tax related to acquisitions and adjustments to prior years.

Salary and short-term incentive

141

126

119

Income taxes expensed

Pension

Benefits

Long-term incentive1

13

9

97

12

10

100

26

10

52

DKK million

Current tax on profit for the year

Deferred tax on profit for the year

2022

17,829

(3,806)

2021

13,871

(1,528)

2020

11,557

1,105

In 2020, income taxes paid in Denmark and paid outside Denmark were impacted by transfers of intellectual property rights related to acquisitions. In 2022, paid taxes related to prior years are impacted by a refund of overpaid tax from 2021.

Severance payments

—

29

Executive Management in total2

260

277


 62%|██████████████████████████████████████████████████████████████████████████████████▎                                                 | 204/327 [20:43<24:13, 11.81s/it]

Paragraph:
Company: novo_nordisk. 938

49

1,789

1. Average hedge rate for USD cash flow hedges is 696 at the end of 2022 (628 at the end of 2021) and average hedge rate for USD fair value hedges is 714 at the end of 2022 (628

at the end of 2021).

The fair value of cash flow hedges at year-end 2022, a gain of DKK 1,026 million, has been recognised in other comprehensive income.

Accounting policies On initiation of the contract, Novo Nordisk designates each derivative financial contract that qualifies for hedge accounting as one of:

The financial contracts are expected to impact the income statement within the next 12 months, with deferred gains and losses on cash flow hedges then being transferred to financial income or financial expenses. There is no expected ineffectiveness at 31 December 2022, primarily because hedging instruments match currencies of hedged cash flows.

– hedges of the fair value of a recognised asset or liability (fair value hedge) – hedges of the fair value o

 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 255/327 [29:55<13:53, 11.58s/it]

Paragraph:
Company: novo_nordisk. Contents Introducing Novo Nordisk Strategic Aspirations Key risks Management Consolidated statements Additional information

Notes to the consolidated ESG statement

Section 6 Basis of preparation

General reporting standards and principles Novo Nordisk's annual reporting complies with the Danish Financial Statements Act. Sections 99a, 99b, 99d and 107d specify the requirements to report on the management of risks related to the environment, climate, human rights, labour and social conditions, anti-corruption, gender distribution and data ethics. These requirements are addressed in the Management review.

As recommended by the Taskforce on Climate-related Financial Disclosures (TCFD), Novo Nordisk is working to integrate two climate change scenarios into the risk management process to identify short-, medium- and long-term risks within the production and supply chain:

– Limiting temperature increase to well below 2ºC scenario, preferably 1.5ºC, compar

 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 306/327 [38:14<03:29,  9.97s/it]

Paragraph:
Company: novo_nordisk. Management's responsibility Management of Novo Nordisk A/S is responsible for designing, implementing, and maintaining internal controls over information relevant to the preparation of the ESG data and information in the ESG statement, ensuring they are free from material misstatement, whether due to fraud or error. Furthermore, Management is responsible for establishing objective accounting policies for the preparation of ESG data, for the overall content of the ESG statement, and for measuring and reporting ESG data in accordance with the Basis of preparation and the ESG accounting policies.

Work performed We are required to plan and perform our work in order to consider the risk of material misstatement in the ESG statement. To do so, we have: – conducted interviews with data owners and internal stakeholders to understand the key processes and control activities for measuring, recording and reporting the ESG data;

– performed limited substantive t

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 327/327 [41:59<00:00,  7.71s/it]


Paragraph:
Company: novo_nordisk. 3. Guaranties given for subsidiaries mainly relate to guaranties towards Novo Nordisk

Finance (Netherlands) B.V. related to issuance of Eurobonds.

Novo Nordisk A/S is included in the consolidated financial statements of the Novo Nordisk Foundation.

14 Fee to statutory auditors

DKK million

2022

2021

Novo Nordisk A/S and its Danish subsidiaries are jointly taxed with the Danish companies in Novo Holdings A/S. The joint taxation also covers withholding taxes in the form of dividend tax, royalty tax and interest tax. The Danish companies are jointly and severally liable for the joint taxation. Any subsequent adjustments to income taxes and withholding taxes may lead to a larger liability. The tax for the individual companies is allocated in full on the basis of the expected taxable income.

Statutory audit1

15

8

Audit-related services

Tax advisory services

2

1

2

2

For information on pending litigation and other contingencies, please refer t

## What are the generated queries for our answer?

In [6]:
synthesize_queries(
    paragraphs=[content[251]], 
    num_queries=3, 
    debug=True,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.48s/it]

Paragraph:
Company: novo_nordisk. 51,951

42,138

123.3%

2020

28,565

42,138

67.8%

Contents Introducing Novo Nordisk Strategic Aspirations Key risks Management Consolidated statements Additional information

Note

2022

2021

Statement of Environmental, Social and Governance (ESG) performance

Environmental performance

Resources

Energy consumption for operations (1,000 GJ)

Share of renewable power for production sites Water consumption for production sites (1,000 m3) Breaches of environmental regulatory limit values

Emissions and waste Scope 1 emissions (1,000 tonnes CO2) Scope 2 emissions (1,000 tonnes CO2) Scope 3 emissions (1,000 tonnes CO2)1 Waste from production sites (tonnes)

7.1

7.1

7.2

7.3

7.4

7.4

7.4

7.5

3,677

100%

3,918

75

76

16

2,041

213,505

3,387

100%

3,488

12

77

16

N/A

180,806

for the year ended 31 December

Social performance

Patients

Patients reached with Novo Nordisk's Diabetes care products (estimate in millions) – Hereof reached via 




## Finetune encoder

In [9]:
def finetune_model(model_name):
    # The next step is to fine-tune a model using MNR loss.
    # We do this easily with the sentence-transformers library.
    paths = [str(path) for path in Path('data').glob('*.tsv')]
    print(paths[:5])

    pairs = []
    for path in tqdm(paths):
        with open(path, 'r', encoding='utf-8') as fp:
            lines = fp.read().split('\n')
            for line in lines:
                if '\t' not in line:
                    continue
                else:
                    q, p = line.split('\t')
                    pairs.append(InputExample(
                        texts=[q, p]
                    ))

    # We use the no duplicates data loader to avoid placing duplicate passages in the same batch,
    # as this will confuse the ranking mechanism of MNR loss.
    batch_size = 24

    loader = datasets.NoDuplicatesDataLoader(
        pairs, batch_size=batch_size
    )

    # Now we initialize the bi-encoder that we will be fine-tuning.
    # We create the transformer-to-pooler architecture using modules.
    distilbert = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')
    pooler = models.Pooling(
        distilbert.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True
    )

    my_model = SentenceTransformer(modules=[distilbert, pooler])
    print(my_model)

    loss = losses.MultipleNegativesRankingLoss(my_model)

    epochs = 3
    warmup_steps = int(len(loader) * epochs * 0.1)

    my_model.fit(
        train_objectives=[(loader, loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=model_name,
        show_progress_bar=True
    )

In [10]:
finetune_model(model_name="my_model")

['data/pairs_0.tsv', 'data/pairs_1.tsv', 'data/pairs_3.tsv', 'data/pairs_2.tsv', 'data/pairs_6.tsv']


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 343.58it/s]


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


Epoch:   0%|                                                                                                                                         | 0/3 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                    | 0/34 [00:00<?, ?it/s][A
Iteration:   3%|███▋                                                                                                                        | 1/34 [00:04<02:39,  4.83s/it][A
Iteration:   6%|███████▎                                                                                                                    | 2/34 [00:09<02:33,  4.80s/it][A
Iteration:   9%|██████████▉                                                                                                                 | 3/34 [00:14<02:28,  4.80s/it][A
Iteration:  12%|██████████████▌                                                                                                 

## Does it perform better or overfit?

In [11]:
my_embedder = SentenceTransformer("my_model")

In [12]:
encoded_query = my_embedder.encode(default_query)

In [13]:
encoded_docs = my_embedder.encode(content)

In [14]:
get_topk_similarity(
    k=10, 
    encoded_query=encoded_query, 
    encoded_docs=encoded_docs, 
    is_cos_sim=True, 
    debug=True,
)

Most similar pairs:
doc_idx	 score
260 	 0.5317
22 	 0.5186
262 	 0.4860
261 	 0.4696
263 	 0.4149
23 	 0.4120
259 	 0.4119
19 	 0.4037
20 	 0.3897
265 	 0.3866


In [16]:
trained_query = my_embedder.encode(
    "which of the following categories of emissions is novo nordisk responsible for?"
)

In [17]:
get_topk_similarity(
    k=10, 
    encoded_query=trained_query, 
    encoded_docs=encoded_docs, 
    is_cos_sim=True, 
    debug=True,
)

Most similar pairs:
doc_idx	 score
262 	 0.7496
260 	 0.7436
22 	 0.7094
258 	 0.6850
251 	 0.6727
20 	 0.6562
24 	 0.6517
256 	 0.6496
261 	 0.6486
18 	 0.6458
