# Generate Chunks from Larger Documents

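This notebook splits long documents into smaller chunks of at most a fixed number of words so that the sentence-embedding model can consume them. It then computes an embedding for every article and every chunk, and writes the chunked table to a JSON file.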

In [5]:
# Import Packages
import os
from pathlib import Path

import pandas as pd
from joblib import load

In [6]:
# Import Data
# Directory that holds the input file. Set the COMPANY_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset, the
# original hard-coded location is used.
filepath = os.environ.get(
    "COMPANY_DATA_DIR",
    "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\",
)
filename = "CompanyProfileNewsData.json"

# Path's "/" operator joins correctly whether or not `filepath` ends with a
# separator (plain string concatenation silently depended on the trailing "\\").
# lines=True: the input is read as line-delimited JSON (one record per line).
df = pd.read_json(Path(filepath) / filename, lines=True)
df.head()

Unnamed: 0,n_title,n_link,n_date_published,n_summary
0,Mini Nuclear Reactor Firm Newcleo Starts to Ra...,https://www.bnef.com/news/1127367?e=RSS:MixedFeed,2023-03-20 00:54:00Z,"Newcleo, a company developing small nuclear re..."
1,Foreign Investor Interest in India Renewables ...,https://www.bnef.com/shorts/15929?e=RSS:MixedFeed,2023-03-06 08:35:00Z,India closed $2.6 billion M&A deals in 2022For...
2,Enough Wind Power for a Major City Snarled Ami...,https://www.bnef.com/news/1123837?e=RSS:MixedFeed,2023-03-09 01:42:39Z,A cluster of seven wind turbines towering abov...
3,ExxonMobil tests advanced recycling of plastic...,https://corporate.exxonmobil.com/news/newsroom...,2021-02-25 00:00:00,ExxonMobil has completed the initial phase of ...
4,Asia Is Set to Steer Strong March Global Jet F...,https://www.bnef.com/insights/30917?e=RSS:Mixe...,2023-03-03 09:20:28Z,"With spring approaching, air travel will gathe..."


In [7]:
## Functions

def group_words(s, n):
    """
    Split a document into consecutive chunks of at most ``n`` words.

    Words are the whitespace-separated tokens returned by ``s.split()``. Each
    chunk is re-joined with single spaces, so the original spacing and line
    breaks are not preserved.

    Parameters
    ----------
    s : str
        The document to split.
    n : int
        Maximum number of words per chunk. Must be at least 1.

    Yields
    ------
    str
        The chunks in document order. The last chunk may hold fewer than
        ``n`` words. An empty or whitespace-only ``s`` yields nothing.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Because this is a generator, the error is
        raised when iteration starts, not when the function is called.
    """
    # A negative step would make range() empty and silently drop the whole
    # document; fail loudly instead.
    if n < 1:
        raise ValueError(f"n must be a positive number of words, got {n!r}")

    words = s.split()
    for start in range(0, len(words), n):
        yield ' '.join(words[start:start + n])


def get_embeddings(text=None, model=None):
    """
    Encode ``text`` with an embedding model.

    Parameters
    ----------
    text : object
        Passed unchanged to ``model.encode``.
    model : object, optional
        Any object exposing ``encode(text)``. When omitted, the pickled model
        at './model/SentBERTmodel.pkl' is loaded with joblib on first use and
        reused on later calls.

    Returns
    -------
    Whatever ``model.encode(text)`` returns.
    """
    # Identity check: `model == None` would call a custom __eq__ on the model
    # and could wrongly discard a model the caller supplied.
    if model is None:
        # Reuse the default model across calls instead of re-reading the
        # pickle from disk every time.
        model = getattr(get_embeddings, "_default_model", None)
        if model is None:
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code. Only load model files from a trusted source.
            model = load('./model/SentBERTmodel.pkl')
            get_embeddings._default_model = model

    return model.encode(text)

## Generate Chunks

In [8]:

# Chunk Articles
len_words = 250  # maximum number of words per chunk

titles = []
links = []
dates = []
text = []
chunk_number = []

# Iterate over the column values directly rather than df[col][i], so the loop
# does not depend on df having a default 0..n-1 index.
for article_title, article_link, article_date, article in zip(
    df['n_title'], df['n_link'], df['n_date_published'], df['n_summary']
):
    # Compare the WORD count (not the character count) with len_words.
    if len(article.split()) <= len_words:
        # Short (or empty) article: keep the row verbatim as a single chunk.
        chunked_article = [article]
    else:
        chunked_article = list(group_words(article, len_words))

    # The chunk index restarts at 0 for every article, so a single-chunk
    # article always gets chunk_number 0 (never the DataFrame row index).
    for chunk_idx, chunk in enumerate(chunked_article):
        titles.append(article_title)
        links.append(article_link)
        dates.append(article_date)
        text.append(chunk)
        chunk_number.append(chunk_idx)


# Build dataframe from lists (one row per chunk, article attributes repeated)

chunked_df = pd.DataFrame({
    'n_title': titles,
    'n_link': links,
    'n_date_published': dates,
    'n_summary': text,
    'chunk_number': chunk_number,
})


In [9]:
# Keep Schema of both tables the same:
# chunked_df has a chunk_number column (added in the chunking cell), so add the
# same column to the article-level table, set to 0 for every row.
df['chunk_number'] = 0

## Generate Embeddings

In [10]:
# Embeddings on articles
article_embeddings = get_embeddings(df['n_summary'])
df['embeddings'] = article_embeddings.tolist()

# Embeddings on chunks
chunked_embeddings = get_embeddings(chunked_df['n_summary'])
chunked_df['embeddings'] = chunked_embeddings.tolist()

In [11]:
# Preview the article-level table, which now has chunk_number and embeddings columns.
df.head()

Unnamed: 0,n_title,n_link,n_date_published,n_summary,chunk_number,embeddings
0,Mini Nuclear Reactor Firm Newcleo Starts to Ra...,https://www.bnef.com/news/1127367?e=RSS:MixedFeed,2023-03-20 00:54:00Z,"Newcleo, a company developing small nuclear re...",0,"[-0.03327552229166031, 0.00040942453779280186,..."
1,Foreign Investor Interest in India Renewables ...,https://www.bnef.com/shorts/15929?e=RSS:MixedFeed,2023-03-06 08:35:00Z,India closed $2.6 billion M&A deals in 2022For...,0,"[-0.007411965634673834, -0.019612805917859077,..."
2,Enough Wind Power for a Major City Snarled Ami...,https://www.bnef.com/news/1123837?e=RSS:MixedFeed,2023-03-09 01:42:39Z,A cluster of seven wind turbines towering abov...,0,"[-0.03466905653476715, 0.07593218237161636, 0...."
3,ExxonMobil tests advanced recycling of plastic...,https://corporate.exxonmobil.com/news/newsroom...,2021-02-25 00:00:00,ExxonMobil has completed the initial phase of ...,0,"[-0.016767723485827446, -0.017263615503907204,..."
4,Asia Is Set to Steer Strong March Global Jet F...,https://www.bnef.com/insights/30917?e=RSS:Mixe...,2023-03-03 09:20:28Z,"With spring approaching, air travel will gathe...",0,"[0.0570218563079834, -0.044863808900117874, 0...."


In [12]:
# Preview the chunk-level table: article attributes repeat across the chunks of one article.
chunked_df.head()

Unnamed: 0,n_title,n_link,n_date_published,n_summary,chunk_number,embeddings
0,Mini Nuclear Reactor Firm Newcleo Starts to Ra...,https://www.bnef.com/news/1127367?e=RSS:MixedFeed,2023-03-20 00:54:00Z,"Newcleo, a company developing small nuclear re...",0,"[-0.03327552229166031, 0.00040942453779280186,..."
1,Foreign Investor Interest in India Renewables ...,https://www.bnef.com/shorts/15929?e=RSS:MixedFeed,2023-03-06 08:35:00Z,India closed $2.6 billion M&A deals in 2022For...,0,"[-0.007411965634673834, -0.019612805917859077,..."
2,Enough Wind Power for a Major City Snarled Ami...,https://www.bnef.com/news/1123837?e=RSS:MixedFeed,2023-03-09 01:42:39Z,A cluster of seven wind turbines towering abov...,0,"[-0.03466905653476715, 0.07593218237161636, 0...."
3,Enough Wind Power for a Major City Snarled Ami...,https://www.bnef.com/news/1123837?e=RSS:MixedFeed,2023-03-09 01:42:39Z,a deadline to get their projects completed and...,1,"[-0.07823453843593597, 0.06682262569665909, 0...."
4,Enough Wind Power for a Major City Snarled Ami...,https://www.bnef.com/news/1123837?e=RSS:MixedFeed,2023-03-09 01:42:39Z,amount of projects didn’t make the cut-off. Th...,2,"[-0.06473474949598312, 0.0613916851580143, 0.0..."


In [15]:
# Prefix every chunk's link with "URL=".
# Links that already carry the prefix are left alone, so re-running this cell
# does not produce "URL=URL=...". No temporary column is created, so an
# existing column named 'string' can no longer be overwritten and deleted.
chunked_links = chunked_df['n_link']
chunked_has_prefix = chunked_links.str.startswith("URL=", na=False)
chunked_df['n_link'] = chunked_links.where(chunked_has_prefix, "URL=" + chunked_links)

In [16]:
# Prefix every article's link with "URL=".
# Links that already carry the prefix are left alone, so re-running this cell
# does not produce "URL=URL=...". No temporary column is created, so an
# existing column named 'string' can no longer be overwritten and deleted.
article_links = df['n_link']
article_has_prefix = article_links.str.startswith("URL=", na=False)
df['n_link'] = article_links.where(article_has_prefix, "URL=" + article_links)

## Output Chunks

In [22]:
# Directory that receives the output file. Set the COMPANY_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset, the
# original hard-coded location is used.
OUTPUT_filepath = os.environ.get(
    "COMPANY_DATA_DIR",
    "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\",
)
OUTPUT_filename = "CompanyProfileNewsData_withChunking_withEmbedding.json"

# Path's "/" operator joins correctly whether or not OUTPUT_filepath ends with
# a separator. orient='records' writes the chunk table as a list of row objects.
chunked_df.to_json(Path(OUTPUT_filepath) / OUTPUT_filename, orient='records')

In [21]:
# Number of rows in the chunk-level table.
len(chunked_df)

3000