In [1]:
import datasets
import pandas as pd

## Dataset Download

In [None]:
dfs = []
# iterate over 2018 - 2023
for year in range(2018, 2024):
    # iterate over months 1 - 12
    for month in range(1, 13):
        # create YYYY-MM string, passed to load_dataset as its second argument
        date = f"{year}-{month:02d}"

        # Best-effort download: both the fetch and the conversion are guarded, so
        # a month that cannot be loaded is skipped instead of aborting the loop.
        # `Exception` (not a bare `except`) keeps Ctrl-C / kernel interrupt working,
        # and the skip is reported so gaps in the data are visible.
        try:
            ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', date)
            df = ds["train"].to_pandas()
        except Exception as err:
            print(f"skipping {date}: {type(err).__name__}: {err}")
            continue

        # append the data to the list
        dfs.append(df)

# concatenate all the dataframes
# (each monthly frame keeps its own index; pd.concat raises ValueError if `dfs` is empty)
df = pd.concat(dfs)

# remove the authors and top_image columns
df = df.drop(columns=["authors", "top_image"])

# save the data to a new parquet file
df.to_parquet("bbc_news_alltime.parquet")

## Dataset Processing

In [139]:
# load the downloaded articles back from disk
# NOTE(review): the download cell writes "bbc_news_alltime.parquet" to the working
# directory, while this reads it from "datasets/bbc/" - presumably the file was
# moved by hand; confirm before relying on Restart & Run All.
raw_path = "datasets/bbc/bbc_news_alltime.parquet"
df = pd.read_parquet(raw_path)

In [140]:
# number of "/"-separated components in each link (i.e. slash count + 1)
df["count"] = df["link"].map(lambda url: url.count("/") + 1)

In [141]:
# keep only the rows whose link has exactly 5 or 6 components
has_expected_depth = df["count"].isin([5, 6])
df = df[has_expected_depth]

In [145]:
from typing import List


def extract_tags(link: str) -> List[str]:
    """Derive a list of tags from an article link.

    The link is split on "/". With six components, components 3 and 4 are
    taken as tags; with five components, only component 3. For any other
    length no path tags are taken. Afterwards the last component is split on
    "-" and every piece except the final one is appended (in the BBC links
    shown in this notebook the final piece is the numeric article id).

    Args:
        link: The article URL.

    Returns:
        The path tags followed by the leading "-"-separated words of the last
        component; may be empty.
    """
    parts = link.split("/")
    n_parts = len(parts)

    # tags that come from the URL path itself
    if n_parts == 6:
        path_tags = [parts[3], parts[4]]
    elif n_parts == 5:
        path_tags = [parts[3]]
    else:
        path_tags = []

    # words of the last component, without its final piece
    *slug_words, _ = parts[-1].split("-")
    return path_tags + slug_words


def extract_main_tags(link: str) -> List[str]:
    """Extract tags from a link and normalise a leading "news" tag.

    Uses ``extract_tags`` and then, when there are at least two tags and the
    first one is "news":
      * drops "news" if the second tag is "world" or "uk";
      * otherwise replaces "news" with "misc".
    In every other case the tags are returned unchanged.

    Args:
        link: The article URL.

    Returns:
        The normalised tag list.
    """
    tags = extract_tags(link)

    # nothing to normalise: too few tags, or the link is not under "news"
    if len(tags) <= 1 or tags[0] != "news":
        return tags

    remainder = tags[1:]
    if remainder[0] in ("world", "uk"):
        # "world" / "uk" become the main tag themselves
        return remainder

    # every other news category is grouped under "misc"
    return ["misc"] + remainder
    

In [146]:
# derive the normalised tag list of every article from its link
df["tags"] = df["link"].apply(extract_main_tags)

In [147]:
# number of tags per article
df["tags_len"] = df["tags"].map(len)

In [150]:
# drop rows with at most one tag (a main tag and a sub tag are both needed below)
has_sub_tag = df["tags_len"].gt(1)
df = df[has_sub_tag]

In [152]:
# first tag is the main tag, second tag is the sub tag
df["main_tag"] = df["tags"].map(lambda tags: tags[0])
df["sub_tag"] = df["tags"].map(lambda tags: tags[1])

In [153]:
# rows at positions 50..89 (positional slice, independent of index labels)
df.iloc[50:90]

Unnamed: 0,title,published_date,description,section,content,link,count,tags,tags_len,main_tag,sub_tag
59,Jake Livermore: West Brom say West Ham fan alt...,2018-01-03,West Brom say Jake Livermore confronted a West...,,West Brom say Jake Livermore confronted a West...,http://www.bbc.co.uk/sport/football/42558360,6,"[sport, football]",2,sport,football
61,Coronation Street's first barmaid Doreen Keogh...,2018-01-03,The Irish actress played Rovers Return's first...,Entertainment & Arts,"Actress Doreen Keogh, who played the first Cor...",http://www.bbc.co.uk/news/entertainment-arts-4...,5,"[misc, entertainment, arts]",3,misc,entertainment
62,A&E doctor: 'I feel like I'm fighting a losing...,2018-01-03,Tens of thousands of non-urgent NHS operations...,,Tens of thousands of non-urgent NHS operations...,http://www.bbc.co.uk/news/health-42553597,5,"[misc, health]",2,misc,health
63,AI early diagnosis could save heart and cancer...,2018-01-03,The systems will save billions of pounds by en...,Health,Sir John Bell believes that artificial intelli...,http://www.bbc.co.uk/news/health-42357257,5,"[misc, health]",2,misc,health
64,Triple killer Theodore Johnson admits murderin...,2018-01-03,Theodore Johnson already had convictions for k...,London,Theodore Johnson beat Angela Best with a claw ...,http://www.bbc.co.uk/news/uk-england-london-42...,5,"[uk, england, london]",3,uk,england
65,Iuliana Tudos Finsbury Park death: Man charged...,2018-01-03,"Kasim Lewis, 31, is charged with murder after ...",London,The body of Iuliana Tudos was discovered near ...,http://www.bbc.co.uk/news/uk-england-london-42...,5,"[uk, england, london]",3,uk,england
66,Berlin reels after nine inmates escape Plötzen...,2018-01-03,"Plötzensee jail ""lost"" the prisoners in four i...",Europe,Prisoners can be seen escaping from a shaft in...,http://www.bbc.co.uk/news/world-europe-42551732,5,"[world, europe]",2,world,europe
67,Manchester City 3-1 Watford - BBC Sport,2018-01-03,Runaway leaders Manchester City show no ill-ef...,,Last updated on .From the section Premier Leag...,http://www.bbc.co.uk/sport/football/42411077,6,"[sport, football]",2,sport,football
70,Hospital apologises after baby's skull cut in ...,2018-01-03,The baby died three hours after suffering a sc...,Nottingham,"Carson's mother Claire Smith, pictured with hi...",http://www.bbc.co.uk/news/uk-england-nottingha...,5,"[uk, england, nottinghamshire]",3,uk,england
71,Freed hostage Joshua Boyle faces 15 criminal c...,2018-01-03,Canadian Joshua Boyle and his family were held...,US & Canada,A Canadian man who was held hostage for five y...,http://www.bbc.co.uk/news/world-us-canada-4254...,5,"[world, us, canada]",3,world,us


In [154]:
# number of rows per main tag
main_tag_groups = df.groupby("main_tag")
main_tag_groups.size()

main_tag
bbcthree       10
bitesize        1
iplayer        12
misc        25448
newsbeat       11
sounds         25
sport        6299
uk          35916
weather        25
world       18658
dtype: int64

In [155]:
# drop the rows of every main tag that has 1000 rows or fewer
MIN_MAIN_TAG_ROWS = 1000
df = df.groupby("main_tag").filter(lambda group: len(group) > MIN_MAIN_TAG_ROWS)

In [163]:
# drop the rows of every (main tag, sub tag) pair that has 150 rows or fewer
MIN_SUB_TAG_ROWS = 150
tag_pairs = ["main_tag", "sub_tag"]
df = df.groupby(tag_pairs).filter(lambda group: len(group) > MIN_SUB_TAG_ROWS)

In [166]:
# drop rows whose sub tag is "in", "live" or "newsbeat"
unwanted_sub_tags = ["in", "live", "newsbeat"]
is_unwanted = df["sub_tag"].isin(unwanted_sub_tags)
df = df[~is_unwanted]

In [156]:
# number of rows per main tag after filtering
remaining_main_tags = df.groupby("main_tag")
remaining_main_tags.size()

main_tag
misc     25448
sport     6299
uk       35916
world    18658
dtype: int64

In [157]:
# distinct sub tags that occur under each main tag
sub_tags_by_main = df.groupby("main_tag")["sub_tag"]
sub_tags_by_main.unique()

main_tag
misc     [newsbeat, education, science, entertainment, ...
sport    [football, tennis, cricket, american-football,...
uk          [england, politics, wales, scotland, northern]
world    [asia, europe, latin, middle, us, africa, aust...
Name: sub_tag, dtype: object

In [10]:
# number of rows per (main tag, sub tag) pair
tag_pair_groups = df.groupby(["main_tag", "sub_tag"])
tag_pair_groups.size()

main_tag  sub_tag         
misc      business             8026
          education            1175
          election              614
          entertainment        4776
          health               2897
          science              1817
          technology           1963
sport     athletics             182
          boxing                158
          cricket               500
          football             3393
          formula1              179
          rugby                 347
          tennis                597
uk        england             14475
          northern-ireland     3433
          politics             7590
          scotland             5265
          wales                5153
world     africa                918
          asia                 2776
          australia             700
          europe               6332
          latin-america         728
          middle-east          1672
          us                   5516
dtype: int64

In [9]:
# Normalise sub tag names in one pass:
#   rugby-union -> rugby
#   middle      -> middle-east
#   northern    -> northern-ireland
#   latin       -> latin-america
# NOTE(review): in notebook order this cell comes after the sub tag size filter,
# yet the group listing displayed above it already shows the renamed values -
# confirm the intended execution order.
sub_tag_renames = {
    "rugby-union": "rugby",
    "middle": "middle-east",
    "northern": "northern-ireland",
    "latin": "latin-america",
}
df["sub_tag"] = df["sub_tag"].replace(sub_tag_renames)

In [11]:
# build the full tag as "<main_tag>/<sub_tag>"
main_part = df["main_tag"]
sub_part = df["sub_tag"]
df["tag"] = main_part + "/" + sub_part

In [14]:
# write the cleaned frame to a new parquet file in the working directory
cleaned_path = "bbc_cleaned.parquet"
df.to_parquet(cleaned_path)

In [15]:
# first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,title,published_date,description,section,content,link,count,tags,tags_len,main_tag,sub_tag,tag
0,Kabul attack: Guests use sheets to escape hote...,2018-01-21,Guests used sheets to climb down from balconie...,,Guests used sheets to climb down from balconie...,http://www.bbc.co.uk/news/world-asia-42764971,5,"[world, asia]",2,world,asia,world/asia
1,Rashan Charles death: Met Police officer not f...,2018-01-21,Rashan Charles died as he tried to swallow a p...,London,Rashan Charles died after being apprehended by...,http://www.bbc.co.uk/news/uk-england-london-42...,5,"[uk, england, london]",3,uk,england,uk/england
2,Marco Silva: Watford blame Everton as they sac...,2018-01-21,"Watford sack manager Marco Silva, blaming Ever...",,"Watford have sacked manager Marco Silva, blami...",http://www.bbc.co.uk/sport/football/42765881,6,"[sport, football]",2,sport,football,sport/football
3,North Korea Moranbong girl band leader heads O...,2018-01-21,The team is on a landmark visit to inspect cul...,Asia,"Hyon Song-wol, pictured centre, was the star a...",http://www.bbc.co.uk/news/world-asia-42765105,5,"[world, asia]",2,world,asia,world/asia
4,Australian Open: Kyle Edmund reaches first Gra...,2018-01-21,Britain's Kyle Edmund powers into a first Gran...,,Last updated on .From the section Tennis\n\nCo...,http://www.bbc.co.uk/sport/tennis/42764304,6,"[sport, tennis]",2,sport,tennis,sport/tennis
