# Generate Chunks from Larger Documents

This notebook is used to break down documents into smaller text sequences so that the model can consume them.

In [1]:
# Import Packages
import pandas as pd
from joblib import load

from src import get_num_tokens

In [None]:
# Import Data
# Location of the raw news export; the directory string already ends with a
# path separator, so directory and file name are joined directly.
filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\"
filename = "CompanyProfileNewsData.json"

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(f"{filepath}{filename}", lines=True)
df.head()

In [None]:
## Functions



## Generate Chunks

In [None]:
# Functions

def group_words(s, n):
    """Yield consecutive groups of at most ``n`` words from ``s``.

    ``s`` is split on whitespace; each yielded string is up to ``n`` of
    those words re-joined with single spaces, so the last group may be
    shorter than the others.
    """
    tokens = s.split()
    window_starts = range(0, len(tokens), n)
    yield from (
        ' '.join(tokens[begin:begin + n]) for begin in window_starts
    )

In [None]:

# Chunk Articles
# Maximum number of whitespace-separated words allowed in a single chunk.
len_words = 250

# Parallel lists: one entry per output row (i.e. per chunk).
titles = []
links = []
dates = []
text = []
chunk_number = []

# Walk the rows positionally so the loop does not depend on the index labels.
# Assumes every `n_summary` value is a string -- TODO confirm for missing data.
article_rows = zip(
    df['n_title'], df['n_link'], df['n_date_published'], df['n_summary']
)
for article_title, article_link, article_date, article in article_rows:

    # Decide whether to chunk based on the number of WORDS in the article;
    # `len(article)` would count characters and split short articles.
    if len(article.split()) <= len_words:
        # Short article: keep the row untouched as a single chunk.
        chunked_article = [article]
    else:
        # Long article: split into groups of at most `len_words` words.
        chunked_article = list(group_words(article, len_words))

    # Every chunk carries its parent article's attributes; chunk numbers are
    # 0-based within the article, so an unsplit article is chunk 0.
    for chunk_index, chunk in enumerate(chunked_article):
        titles.append(article_title)
        links.append(article_link)
        dates.append(article_date)
        text.append(chunk)
        chunk_number.append(chunk_index)


# Build dataframe from lists

chunked_df = pd.DataFrame({
    'n_title': titles,
    'n_link': links,
    'n_date_published': dates,
    'n_summary': text,
    'chunk_number': chunk_number,
})


In [None]:
## STEPS

# Count number of tokens in each article
# `get_num_tokens` (imported from `src`) is applied to every value of the
# `n_summary` column and the results are stored in a new `token count` column.
# NOTE(review): the column is added to the article-level `df`, not to
# `chunked_df` -- confirm which frame is meant to carry the counts.
df['token count'] = df['n_summary'].apply(get_num_tokens)

## Output Chunks

In [None]:
# Destination for the chunked articles; the directory string already ends
# with a path separator, so directory and file name are joined directly.
OUTPUT_filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\"
OUTPUT_filename = "CompanyProfileNewsData_chunked.csv"

# Write the chunk-level frame (one row per chunk). Writing `df` here would
# save the original, unchunked articles under the "_chunked" file name.
chunked_df.to_csv(OUTPUT_filepath + OUTPUT_filename, index=False)