In [1]:
import torch
import PyPDF2
import nltk
from scipy.spatial.distance import cosine
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
from sklearn.neighbors import KDTree

# Importing Climate Bert

In [98]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")

model = AutoModelForMaskedLM.from_pretrained("climatebert/distilroberta-base-climate-f",output_hidden_states = True)

In [99]:
vocab = list(tokenizer.vocab.keys())

# Removing tokens with size < 3

In [100]:
vocab = list(filter(lambda x: len(x)>4, vocab))

In [101]:
target = []
for ids,sen in enumerate(vocab):

#     marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = []
    tokenized_text.append("[CLS] ")
    tokenized_text.append(sen)
    tokenized_text.append(" [SEP]")

    
    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel

        hidden_states = outputs[-1]


    #print('Tensor shape for each layer: ', hidden_states[0].size())

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # token_embeddings.size()

    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(sum_vec)

    target.append(torch.stack(token_vecs_cat, dim = 0).mean(dim = 0))

In [102]:
a = torch.stack(target)


In [103]:
a.numpy().shape

(35873, 768)

In [104]:
all_embeddings = a
normed_embeddings = (all_embeddings.T / (all_embeddings**2).sum(axis=1) ** 0.5).T

In [105]:

#making kd tree
indexer = KDTree(normed_embeddings)

In [112]:
keywords = ['Electric vehicle',
'Solar',
'Wind',
'Hydroelectric' ,
'Nuclear',
'REC',
'Efficiency',
'Deforestation',
'Afforestation',
'carbon',
'credit',
'capture',
'sequestration',
'storage',
'Hydrogen',
'Geothermal',
'Biomass',
'Renewable', 
'Energy',
'emissions',
'reforestation',
'Decreased', 
'Reduced']

In [113]:
target = []
for ids,sen in enumerate(keywords):
    
    marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)


    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)


    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel

        hidden_states = outputs[-1]


    #print('Tensor shape for each layer: ', hidden_states[0].size())

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # token_embeddings.size()

    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(sum_vec)
        
    target.append(torch.stack(token_vecs_cat, dim = 0).mean(dim = 0))

In [69]:
result=[]
for emb in zip(target,keywords):
    top_20 = indexer.query(emb[0].reshape(1, -1),return_distance = False, k = 20)
#     print(top_20)
    result.append([emb[1],np.array(vocab)[top_20]])

In [70]:
result

[['Electric vehicle',
  array([['Electric', 'Battery', 'Solar', 'Tesla', 'energy', 'Motor',
          'Power', 'ĠMobility', 'Energy', 'Steel', 'Volume', 'Water',
          'Animal', 'electric', 'Environmental', 'Europe', 'Street',
          'electricity', 'Australia', 'Ġpavement']], dtype='<U128')],
 ['Solar',
  array([['Solar', 'Australia', 'Steel', 'Battery', 'Tesla', 'Power',
          'Europe', 'energy', 'ĠPower', 'Canada', 'Japan', 'Spain',
          'Energy', 'Water', 'California', 'India', 'Volume',
          'electricity', 'Electric', 'ĠMountain']], dtype='<U128')],
 ['Wind',
  array([['Solar', 'Power', 'Australia', 'Europe', 'ĠPower', 'ĠWind',
          'Atlantic', 'Spain', 'Battery', 'Steel', 'Canada', 'Water',
          'ĠMountain', 'Network', 'Volume', 'Electric', 'energy', 'Energy',
          'Japan', 'STATE']], dtype='<U128')],
 ['Hydroelectric',
  array([['Solar', 'Electric', 'Power', 'Water', 'energy', 'Energy',
          'Australia', 'electricity', 'Steel', 'electric',

# Finding Mean of the keyword vectors

In [115]:
kw_mean = torch.stack(target, dim = 0).mean(dim = 0)
top_20 = indexer.query(kw_mean.reshape(1, -1),return_distance = False, k = 20)
np.array(vocab)[top_20]

array([['Solar', 'energy', 'Energy', 'Volume', 'Water', 'Battery',
        'Environmental', 'Power', 'Australia', 'electricity',
        'Temperature', 'Climate', 'liquid', 'ĠPower', 'Network',
        'chemical', 'Electric', 'ĠWater', 'Proxy', 'Europe']],
      dtype='<U128')

In [114]:
len(target)

23

# Running BERT

In [116]:
import torch
from transformers import BertTokenizer, BertModel

In [117]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [118]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [119]:
vocab = list(tokenizer.vocab.keys())
vocab = list(filter(lambda x: len(x)>4, vocab))
#also removing tokens with '[' in the begining 
vocab = list(filter(lambda x: x[0] != '[', vocab))

In [120]:
target = []
for ids,sen in enumerate(vocab):

#     marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = []
    tokenized_text.append("[CLS] ")
    tokenized_text.append(sen)
    tokenized_text.append(" [SEP]")

    
    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel

        hidden_states = outputs[-1]


    #print('Tensor shape for each layer: ', hidden_states[0].size())

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # token_embeddings.size()

    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(sum_vec)

    target.append(torch.stack(token_vecs_cat, dim = 0).mean(dim = 0))

In [121]:
a = torch.stack(target)
all_embeddings = a
normed_embeddings = (all_embeddings.T / (all_embeddings**2).sum(axis=1) ** 0.5).T
indexer = KDTree(normed_embeddings)

In [124]:
keywords = ['Electric vehicle',
'Solar',
'Wind',
'Hydroelectric' ,
'Nuclear',
'REC',
'Efficiency',
'Deforestation',
'Afforestation',
'carbon',
'credit',
'capture',
'sequestration',
'storage',
'Hydrogen',
'Geothermal',
'Biomass',
'Renewable', 
'Energy',
'emissions',
'reforestation',
'Decreased', 
'Reduced']

In [125]:
target = []
for ids,sen in enumerate(keywords):
    
    marked_text = "[CLS] " + sen + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)


    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)


    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel

        hidden_states = outputs[-1]


    #print('Tensor shape for each layer: ', hidden_states[0].size())

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # token_embeddings.size()

    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(sum_vec)
        
    target.append(torch.stack(token_vecs_cat, dim = 0).mean(dim = 0))

In [89]:
result=[]
for emb in zip(target,keywords):
    top_20 = indexer.query(emb[0].reshape(1, -1),return_distance = False, k = 20)
#     print(top_20)
    result.append([emb[1],np.array(vocab)[top_20]])

In [90]:
result

[['Electric vehicle',
  array([['republic', 'electronic', 'river', '##izan', 'highway',
          'football', '##gren', 'electronics', 'reconnaissance',
          'automobile', 'sedan', 'international', 'psalm', 'scientific',
          'sheriff', 'helicopter', 'deputy', 'cooperative', 'herald',
          'railway']], dtype='<U18')],
 ['Solar',
  array([['walkway', 'parish', '##scent', '##cor', '##ckle', '##town',
          '##utter', 'elects', 'hollis', 'correctional', '##quin',
          'reconnaissance', 'airspace', 'heron', 'theatrical', 'southland',
          '##bury', '##heads', 'militia', '##hoe']], dtype='<U18')],
 ['Wind',
  array([['highway', 'tourist', 'highways', 'football', 'creeks',
          'honduras', 'wetland', 'river', 'swamps', 'kansas', 'rural',
          'ravens', '##vao', 'cyclone', 'public', 'france', 'national',
          'nearby', 'washington', 'republic']], dtype='<U18')],
 ['Hydroelectric',
  array([['##vao', '##ologist', 'provost', 'heron', '##nham', 'hectar

# Finding Mean of the keyword vectors

In [131]:
kw_mean = torch.stack(target, dim = 0).mean(dim = 0)
top_20 = indexer.query(kw_mean.reshape(1, -1),return_distance = False, k = 20)
np.array(vocab)[top_20]

array([['##vao', 'highway', 'france', 'tourist', 'heron', 'public',
        '##gren', 'football', 'correctional', '##ologist',
        'international', 'danube', 'provost', 'hectares', '##lana',
        '##neo', '##izan', 'airspace', 'paraguay', 'constabulary']],
      dtype='<U18')

# Finding contextualized word embedding

In [139]:
corpus = [['efficiency','Our investments in efficiency helped us achieve a 22% reduction in the carbon dioxide emitted for each dollar of revenue we earned, compared to 2019.'], 
          ['energy','In 2021, we increased the amount of renewable energy in our purchased electricity to 79% compared to 41% in 2020 '],
 ['reduction','Carbon emissions from onsite combustion of fuel and purchased energy in 2021 decreased by 88,000 metric tons (MT) from 2020 .This represents a 36% reduction from the previous year and   was primarily achieved through the increased procurement of renewable electricity, as discussed above, for our North American facilities .']]


In [149]:
target = []
for ids,sen in enumerate(corpus):
    
    marked_text = "[CLS] " + sen[1] + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)


    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)


    #creating segment ids for the sentence
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    hidden_states = []
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel

        hidden_states = outputs[-1]


    #print('Tensor shape for each layer: ', hidden_states[0].size())

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # token_embeddings.size()

    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(sum_vec)
        
    word_index = tokenized_text.index(sen[0])
    # Get the embedding for keyword
    word_embedding = token_vecs_cat[word_index]
    target.append(word_embedding / ((word_embedding**2).sum() ** 0.5))

In [150]:
result=[]
for emb in zip(target,corpus):
    top_20 = indexer.query(emb[0].reshape(1, -1),return_distance = False, k = 20)
#     print(top_20)
    result.append([emb[1],np.array(vocab)[top_20]])

In [151]:
result

[[['efficiency',
   'Our investments in efficiency helped us achieve a 22% reduction in the carbon dioxide emitted for each dollar of revenue we earned, compared to 2019.'],
  array([['metabolism', 'electricity', 'schools', 'leasing',
          'sociological', 'waterway', 'danube', 'holland', 'kurdistan',
          'inter', 'ville', 'infrastructure', 'washington', 'ligue',
          'freshly', 'sorbonne', 'manchester', 'terre', 'linking',
          'polynomial']], dtype='<U18')],
 [['energy',
   'In 2021, we increased the amount of renewable energy in our purchased electricity to 79% compared to 41% in 2020 '],
  array([['electricity', 'spectral', 'economic', 'methane', 'thermal',
          'petroleum', 'infrastructure', 'blazed', 'jalan', 'torch',
          'stellar', 'going', 'spaceship', 'economically', 'oliver',
          'transportation', 'getting', 'spice', 'ordnance', 'field']],
        dtype='<U18')],
 [['reduction',
   'Carbon emissions from onsite combustion of fuel and purch