In [6]:
import pandas as pd
import re

In [7]:
# Load both splits of the fine-tuning dataset from the GCS bucket.
train_data = pd.read_parquet(
    path="gs://scraped-news-article-data-null/fine-tune-summary-train.parquet"
)
test_data = pd.read_parquet(
    path="gs://scraped-news-article-data-null/fine-tune-summary-test.parquet"
)
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,body,summary
0,"context: Sen. Catherine Cortez Masto, D-Nev., ...",IMPOSSIBLE
1,context: Feb 22 (Reuters) - AngloGold Ashanti ...,IMPOSSIBLE
2,context: ## In this article\n\nFollow your fav...,"Check out this $1,000 TV that hides in its own..."
3,"context: GLASGOW, Nov 10 (Reuters) - The Unite...",U.S. and China unveil emissions deal in bid to...
4,"context: LONDON, Oct 11 (Reuters) - Britain re...",IMPOSSIBLE


In [8]:
# Take deep copies (the `copy()` default, spelled out here) so that rewriting
# columns on the unwrapped frames never touches the frames as loaded.
unwrapped_train = train_data.copy(deep=True)
unwrapped_test = test_data.copy(deep=True)

In [9]:
# Pattern for a wrapped prompt of the form
#   "context: <context>\n\nquestion: <question>\n\nsummarize the ..."
# re.DOTALL lets `.` match newlines, so `.+` covers multi-line text without a
# per-character `(.|\n)` capture group. Both groups stay greedy: the split
# happens at the LAST "question: " / "summarize the" markers that still allow
# a match.
extraction_regex = re.compile(
    r"context: (?P<context>.+)\n\nquestion: (?P<question>.+)\n\nsummarize the",
    re.DOTALL,
)


def _extract_group(body, group):
    """Return the named group of `extraction_regex` found in `body`.

    Returns None when `body` is not a string (e.g. None or NaN for a missing
    value) or when the pattern does not match.
    """
    if not isinstance(body, str):
        # Missing values would make `search` raise TypeError; report them the
        # same way as a non-matching body instead.
        return None
    match = extraction_regex.search(body)
    return match.group(group) if match else None


def extract_context(body):
    """Return the text between "context: " and the question marker, or None."""
    return _extract_group(body, "context")


def extract_question(body):
    """Return the text between "question: " and "summarize the", or None."""
    return _extract_group(body, "question")

# Swap each wrapped prompt for its extracted context, add the extracted
# question as a new column, then drop every row that has a missing value in
# any column (the extractors return None when the pattern does not match).
unwrapped_train = unwrapped_train.assign(
    body=train_data["body"].apply(extract_context),
    question=train_data["body"].apply(extract_question),
).dropna()
unwrapped_train

Unnamed: 0,body,summary,question
0,"Sen. Catherine Cortez Masto, D-Nev., center, h...",IMPOSSIBLE,How much money have foreign donors and compani...
1,Feb 22 (Reuters) - AngloGold Ashanti (ANGJ.J) ...,IMPOSSIBLE,How much will AngloGold Ashanti lose as a resu...
2,## In this article\n\nFollow your favorite sto...,"Check out this $1,000 TV that hides in its own...",What is the resolution of the StanbyME Go's di...
3,"GLASGOW, Nov 10 (Reuters) - The United States ...",U.S. and China unveil emissions deal in bid to...,What specific actions are countries being aske...
4,"LONDON, Oct 11 (Reuters) - Britain reported a ...",IMPOSSIBLE,Why are people leaving the job market?
...,...,...,...
50129,AT&T in advanced talks to merge WarnerMedia wi...,AT&T in advanced talks to merge WarnerMedia wi...,What is the name of the new company?
50130,## In this article\n\nFollow your favorite sto...,IMPOSSIBLE,How will the approval of Biogen's ALS drug imp...
50131,"MOSCOW, Jan 28 (Reuters) - Russian President V...",Putin says West has not addressed key concerns...,Is Russia planning to invade Ukraine?
50132,"BRUSSELS/LONDON, June 24 (Reuters) - The poten...",EXCLUSIVE: Gas infrastructure across Europe le...,What are the sources of methane emissions in E...


In [11]:
# NOTE: this writes to the same GCS object the training data was loaded from,
# replacing the wrapped prompts with the unwrapped columns. If the notebook is
# run again on the already-unwrapped file, the extraction regex matches
# nothing, every row is dropped, and an empty frame would overwrite the
# dataset. Refuse to write in that case.
if unwrapped_train.empty:
    raise ValueError(
        "unwrapped_train is empty; refusing to overwrite the training parquet"
    )
unwrapped_train.to_parquet("gs://scraped-news-article-data-null/fine-tune-summary-train.parquet", index=False)

In [12]:
# Same unwrapping as for the training split: swap each wrapped prompt for its
# extracted context, add the extracted question as a new column, then drop
# every row that has a missing value in any column.
unwrapped_test = unwrapped_test.assign(
    body=test_data["body"].apply(extract_context),
    question=test_data["body"].apply(extract_question),
).dropna()
unwrapped_test

Unnamed: 0,body,summary,question
0,"This spring, millionaires felt relatively pess...","Millionaires see market volatility, inflation ...",What are the top three threats to personal wea...
1,"TOKYO, Feb 7 (Reuters) - Toshiba Corp (6502.T)...","Toshiba now plans to split into two, bumps up ...",What businesses is Toshiba splitting off and s...
2,"BERLIN, Jan 19 (Reuters) - Germany's Greens pa...",Germany's Greens must squeeze coalition on cli...,What did the Greens secure in exchange for sup...
3,"PARIS/ROME, Nov 11 (Reuters) - A charity-run s...",IMPOSSIBLE,What are the potential economic and political ...
4,Feb 24 (Reuters) - European shares slid on Fri...,IMPOSSIBLE,What factors contributed to the 0.4% shrinkage...
...,...,...,...
995,European Central Bank member Jens Weidmann ann...,IMPOSSIBLE,How will Weidmann's departure affect the futur...
996,## In this article\n\nFollow your favorite sto...,Google is offering an on-campus hotel 'special...,Is Google facing any financial difficulties?
997,"INCHEON, May 2 (Reuters) - Asian finance leade...",Asian finance leaders look to improve market s...,What are the benefits of expanding the Chiang ...
998,A factory at the mobile phone plant of Rising ...,Apple supplier Foxconn cautious despite beatin...,What are the specific factors contributing to ...


In [13]:
# NOTE: this writes to the same GCS object the test data was loaded from,
# replacing the wrapped prompts with the unwrapped columns. If the notebook is
# run again on the already-unwrapped file, the extraction regex matches
# nothing, every row is dropped, and an empty frame would overwrite the
# dataset. Refuse to write in that case.
if unwrapped_test.empty:
    raise ValueError(
        "unwrapped_test is empty; refusing to overwrite the test parquet"
    )
unwrapped_test.to_parquet("gs://scraped-news-article-data-null/fine-tune-summary-test.parquet", index=False)