In [76]:
import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

def sort_dict_by_key(d):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[0])}

def sort_dict_by_val(d):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}

In [77]:
file = open('../data/filtered_data.pickle', 'rb')
data = pickle.load(file)
file.close()

data.head(1)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,cat_freq,journal_freq,date_freq
19,704.002,Patrick Roudeau,"The BABAR Collaboration, B. Aubert, et al",Measurement of the Hadronic Form Factor in D0 ...,"21 pages, 13 postscript figures, submitted to ...","Phys.Rev.D76:052005,2007",10.1103/PhysRevD.76.052005,"BABAR-PUB-07/015, SLAC-PUB-12417",hep-ex,,The shape of the hadronic form factor f+(q2)...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2015-06-30,"[[The BABAR Collaboration, , ], [Aubert, B., ]]",17311,2.0,536


In [78]:
df = pd.read_csv("../data/CNN_Articels_clean.csv")

print(len(df))

df.drop_duplicates(subset=['Headline'], inplace=True)
df.drop_duplicates(subset=['Url'], inplace=True)

print(len(df))

37949
37857


In [79]:
df.head(2)

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,1,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...


In [80]:
# class CnnNewsParser:

def parse_keywords(key_str, exclude_k=[]):
    def parse_colon(kstr):
        # parse "Paris attacks: What you need to know - CNN", get "Paris attacks"
        hl = parse_by_comma[-1].split(":")[0]
        return hl
    key_list = []
    key_str = key_str.lower()
    parse_by_comma = key_str.split(", ")
    for k in parse_by_comma:
        if ":" in k:
            k = parse_colon(k)
        if k not in exclude_k:
            key_list.append(k.replace(" ", "-"))
            
    return key_list

def parse_authors(author_str):
    '''
    This function parses the authors string into a list of authors

    args: 
        - author_str: string of authors
    return: list of authors
    '''
    authors = []
    author_str = author_str.lower()
    space_to = ""
    # remove noises
    # author_str = author_str.replace(" ", space_to)
    author_str = author_str.replace("\n", "")
    parse_by_comma = author_str.split(", ")
    for i, a in enumerate(parse_by_comma):
        if 'by' in a:
            a = a.split('by')[-1]
        # parse by 'and'
        if 'and' in a:
            al = []
            for v in a.split(' and '):
                tmp = v.replace(" ", space_to)
                if len(tmp)!=0: al.append(tmp)
            a = al[:]
                    
        if type(a) is list:
            authors.extend(a)
        else:
            if len(a.replace(" ", space_to))!=0:
                authors.append(a.replace(" ", space_to))
    return authors

def parse_date(time_str):
    return time_str.split(" ")[0]

In [81]:
authors_list = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    authors = parse_authors(row['Author'])
    authors_list.extend(authors)

  0%|          | 0/37857 [00:00<?, ?it/s]

In [82]:
# df['Keywords'].value_counts()

In [83]:
max_c = 10
counter = 0
keywords_list = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    keys = parse_keywords(row['Keywords'])
    counter+=1
    keywords_list.extend(keys)

  0%|          | 0/37857 [00:00<?, ?it/s]

In [84]:
len(set(keywords_list))

45048

In [85]:
unique_ks = np.unique(df['Section'])

for k in unique_ks:
    if k not in keywords_list:
        print(k)

In [86]:
len(np.unique(df['Date published'].map(parse_date)))

3837

In [87]:
df['Date published'].iloc[0].split(" ")[0]

'2021-07-15'

# use parse func, count freq for each doc

In [88]:
def sum_freq_from_stats(row_parsed_lists, exclude_list=[]):
    '''
    This func is used to compute freq for col that contains a list of values. Ex: Autohrs: [a,b,c]

    args:
        - row_parsed_lists: 2d list; should be obtain from Ex: df['author'].map(parse_author)
        - exclude_list: any key that you don;t want to account for
    '''
    freqs = []
    stat_dict = {}
    
    # Loop through it once, get stats for each keys 
    for row_list in tqdm(row_parsed_lists):
        for val in row_list:
            if val not in stat_dict:
                stat_dict[val] = 0
            else:
                stat_dict[val] += 1

    # loop through 2nd times, count freq for each row
    for row_list in tqdm(row_parsed_lists):
        freq_counter = 0
        for val in row_list:
            if val.lower() not in exclude_list: 
                freq_counter += stat_dict[val]
        freqs.append(freq_counter)
    return freqs

In [89]:
# authors
doc_author_list = df['Author'].map(parse_authors)
author_freqs = sum_freq_from_stats(doc_author_list, ['cnn', 'CNN'])
df['author_freq'] = author_freqs

df.head(3)

  0%|          | 0/37857 [00:00<?, ?it/s]

  0%|          | 0/37857 [00:00<?, ?it/s]

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text,author_freq
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d...",5
1,1,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...,13
2,2,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut...",19


In [90]:
# author_stats = sort_dict_by_val(author_stats)

In [91]:
# Category and section
cat_stats = dict(df['Category'].value_counts())
sec_stats = dict(df['Section'].value_counts())

cat_freqs = df['Category'].map(cat_stats)
sec_freqs = df['Section'].map(sec_stats)

df['cat_freq'] = cat_freqs
df['sec_freq'] = sec_freqs

df.head(3)

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text,author_freq,cat_freq,sec_freq
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d...",5,18011,614
1,1,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...,13,18011,614
2,2,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut...",19,18011,301


In [92]:
# keywords

doc_keys_list = df['Keywords'].map(parse_keywords)
keys_freqs = sum_freq_from_stats(doc_keys_list, unique_ks)
df['keys_freq'] = keys_freqs

df.head(3)

  0%|          | 0/37857 [00:00<?, ?it/s]

  0%|          | 0/37857 [00:00<?, ?it/s]

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text,author_freq,cat_freq,sec_freq,keys_freq
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d...",5,18011,614,0
1,1,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...,13,18011,614,0
2,2,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut...",19,18011,301,0


In [93]:
# sort_dict_by_val(keys_stats)

In [94]:
# Date

doc_date_list = df['Date published'].map(parse_date)
df['Date published'] = doc_date_list

date_stats = dict(df['Date published'].value_counts())
date_freqs = df['Date published'].map(date_stats)
df['date_freq'] = date_freqs

df.tail(10)

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text,author_freq,cat_freq,sec_freq,keys_freq,date_freq
37939,44985,CNN Sport Staff,2022-03-02,sport,sport,https://www.cnn.com/2022/03/02/sport/tom-brady...,Tom Brady: Tampa Bay Buccaneers coach Bruce Ar...,Tampa Bay Buccaneers head coach Bruce Arians s...,"sport, Tom Brady: Tampa Bay Buccaneers coach B...",Tampa Bay Buccaneers coach Bruce Arians says '...,(CNN)Tampa Bay Buccaneers head coach Bruce Ar...,20,15517,5624,15,73
37940,44986,"Sammy Mngqosini and Aleks Klosok, CNN",2022-03-02,sport,football,https://www.cnn.com/2022/03/02/football/hansjo...,Chelsea: Swiss billionaire Hansjörg Wyss wants...,Swiss billionaire Hansjörg Wyss says he wants ...,"football, Chelsea: Swiss billionaire Hansjörg ...",Swiss billionaire Hansjörg Wyss wants to buy C...,(CNN)Swiss billionaire Hansjörg Wyss says he ...,162,15517,4904,54,73
37941,44987,"Analysis by Nathan Hodge, CNN",2022-03-02,news,europe,https://www.cnn.com/2022/03/02/europe/russia-u...,Analysis: The shifting map of Ukraine makes Ru...,"Switch on Russian state television, and the sp...","europe, Analysis: The shifting map of Ukraine ...",The shifting map of Ukraine makes Russia's int...,"(CNN)Switch on Russian state television, and ...",194,18011,10587,119,73
37942,44990,"Luke McGee, CNN",2022-02-28,news,europe,https://www.cnn.com/2022/02/28/europe/putin-st...,Vladimir Putin faces stiffer opposition than e...,"Five days into Russia's invasion of Ukraine, i...","europe, Vladimir Putin faces stiffer oppositio...",Vladimir Putin is facing stiffer opposition th...,(CNN)Five days into Russia's invasion of Ukra...,346,18011,10587,0,57
37943,44991,"Ben Morse, CNN",2022-03-01,sport,sport,https://www.cnn.com/2022/03/01/sport/ja-morant...,"Ja Morant has historic night, breaks own scori...",What can't Ja Morant do?,"sport, Ja Morant has historic night, breaks ow...","Ja Morant has historic night, breaks own scori...","(CNN)What can't Ja Morant do?On Monday night,...",858,15517,5624,0,69
37944,44992,"Ben Church and Aleks Klosok, CNN",2022-03-01,sport,sport,https://www.cnn.com/2022/03/01/sport/vladimir-...,Russian President Vladimir Putin is being stri...,Russian President Vladimir Putin has been stri...,"sport, Russian President Vladimir Putin is bei...",Vladimir Putin is being stripped of his honora...,(CNN)Russian President Vladimir Putin has bee...,958,15517,5624,0,69
37945,44993,"Tamara Qiblawi, CNN",2022-03-01,news,europe,https://www.cnn.com/2022/03/01/europe/nato-ukr...,"On NATO's doorstep, a former tourist hotspot i...",A long line of men snakes out of an unassuming...,"europe, On NATO's doorstep, a former tourist h...","On NATO's doorstep, a former tourist hotspot i...","Lviv, Ukraine (CNN)A long line of men snakes o...",28,18011,10587,0,69
37946,44994,"Wayne Sterling and Steve Almasy, CNN",2022-03-01,sport,sport,https://www.cnn.com/2022/03/01/sport/mlb-deadl...,MLB is postponing Opening Day after owners and...,Major League Baseball (MLB) is postponing its ...,"sport, MLB is postponing Opening Day after own...",MLB is postponing Opening Day after owners and...,(CNN)Major League Baseball (MLB) is postponin...,433,15517,5624,0,69
37947,44996,CNN Editorial Research,2013-01-12,news,europe,https://www.cnn.com/2013/01/11/world/europe/mi...,Mikhail Gorbachev Fast Facts - CNN,"Read CNN's Fast Facts on Mikhail Gorbachev, fo...","europe, Mikhail Gorbachev Fast Facts - CNN",Mikhail Gorbachev Fast Facts,Here's a look at the life of Mikhail Gorbachev...,143,18011,10587,0,10
37948,44997,CNN Editorial Research,2013-06-27,news,europe,https://www.cnn.com/2013/06/27/world/europe/du...,Duchess Camilla Fast Facts - CNN,"Read CNN's Fast Facts on Duchess Camilla, the ...","europe, Duchess Camilla Fast Facts - CNN",Duchess Camilla Fast Facts,(CNN)Here's a look at the life of Her Royal H...,143,18011,10587,0,13


# Filter by freqs

In [95]:
# print(df['author_freq'].describe())
# print(df['author_freq'].value_counts())

In [99]:
author_freq_min = 10
cat_freq_min = 100
sec_freq_min = 100
keys_freq_min = 1

print(len(df))
filter_df = df[df['author_freq']>=author_freq_min]
filter_df = filter_df[filter_df['cat_freq']>=cat_freq_min]
filter_df = filter_df[filter_df['sec_freq']>=sec_freq_min]
filter_df = filter_df[filter_df['keys_freq']>=keys_freq_min]
print(len(filter_df))

37857
8856


In [100]:
# print(df['date_freq'].describe())
# print(df['date_freq'].value_counts())

In [101]:
# filter_df.to_csv("../data/cnn_news/filtered_dataCNN_small.csv")

# # save as pickle
# file = open('../data/cnn_news/filtered_dataCNN_small.pickle', 'wb')
# pickle.dump(filter_df, file)
# file.close()

# build graph

In [48]:
sys.path.append("../")

from utils.parse_arxiv import (
    make_keyword_id,
    parse_authors,
    parse_categories,
    parse_journal,
    parse_year,
)

from vector_graph.bipartite_graph_dict import BipartiteGraphDict

def graph_extend_node_edge(idx, target_infor, k_id_name, keyword_nodes, document_id, edges):
    '''
    This function extends the graph by adding new nodes and edges

    args:
        - idx: index of the row in the dataframe
        - target_infor: target information to be parsed (could be authors, keywords, categories, etc)
        - k_id_name: keyword name used to make keyword id
        - keyword_nodes: list of keyword nodes
        - document_id: id of the document
        - edges: list of edges
    '''

    if type(target_infor.iloc[0]) is list:
        target_infor_dim = 2
    else:
        target_infor_dim = 1

    if target_infor_dim == 1:
        keyword_ids = make_keyword_id(k_id_name, target_infor.iloc[idx])
        keyword_nodes.append(keyword_ids)
        edges.append((document_id, keyword_ids))
    elif target_infor_dim == 2:
        keyword_ids = [make_keyword_id(k_id_name, x) for x in target_infor.iloc[idx]]
        keyword_nodes.extend(keyword_ids)
        edges.extend([(document_id, k) for k in keyword_ids])
    else:
        raise NotImplementedError
    return keyword_nodes, edges

file = open('../data/filtered_data.pickle', 'rb')
df = pickle.load(file)
file.close()

print(len(df))
df.drop_duplicates(subset=['id'], inplace=True)
print(len(df))

In [50]:
# add document nodes
G = BipartiteGraphDict()
author_keywords = []
category_keywords = []
journal_keywords = []
year_keywords = []

author_edges = []
category_edges = []
journal_edges = []
year_edges = []

authors = df["authors"].map(parse_authors)
categories = df["categories"].map(parse_categories)
journals = df["journal-ref"].map(parse_journal)
years = df["update_date"].map(parse_year)
df["id"] = df["id"].astype("string")
data_ids = set(df["id"].tolist())

for idx in range(df.shape[0]):
    document_id = df["id"].iloc[idx]
    author_keywords, author_edges = graph_extend_node_edge(idx, authors, "author", author_keywords, document_id, author_edges)
    category_keywords, category_edges = graph_extend_node_edge(idx, categories, "category", category_keywords, document_id, category_edges)
    journal_keywords, journal_edges = graph_extend_node_edge(idx, journals, "journal", journal_keywords, document_id, journal_edges)
    year_keywords, year_edges = graph_extend_node_edge(idx, years, "year", year_keywords, document_id, year_edges)

author_keywords = set(author_keywords)
author_edges = set(author_edges)
category_keywords = set(category_keywords)
category_edges = set(category_edges)
journal_keywords = set(journal_keywords)
journal_edges = set(journal_edges)
year_keywords = set(year_keywords)
year_edges = set(year_edges)

G.add_data_nodes(data_ids)
G.add_keyword_nodes(author_keywords)
G.add_keyword_nodes(category_keywords)
G.add_keyword_nodes(journal_keywords)
G.add_keyword_nodes(year_keywords)
G.add_raw_edges(author_edges)
G.add_raw_edges(category_edges)
G.add_raw_edges(journal_edges)
G.add_raw_edges(year_edges)

In [51]:
authors.iloc[0]
type(authors.iloc[0]) is list

True

In [45]:
journals.iloc[0]

'Phys.Rev.D76:052005,2007'

In [33]:
print(len(journal_edges))

12925


In [103]:
filter_df.head()

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text,author_freq,cat_freq,sec_freq,keys_freq,date_freq
36,36,"Lisa Respers France and Chloe Melas, CNN",2022-03-20,entertainment,entertainment,https://www.cnn.com/2022/03/20/entertainment/k...,Kanye West's Grammys performance being cancele...,Kanye West has been pulled from performing at ...,"entertainment, Kanye West's Grammys performanc...",Trevor Noah had nothing to do with Kanye West'...,(CNN)Kanye West has been pulled from performi...,111,413,390,2,43
38,38,"Analysis by Lisa Respers France, CNN",2022-03-19,entertainment,entertainment,https://www.cnn.com/2022/03/19/entertainment/l...,Analysis: Amy Schumer has older millennial mal...,"In ""Life & Beth,"" Amy Schumer plays a woman wh...","entertainment, Analysis: Amy Schumer has older...",Amy Schumer reckons with older millennial mala...,A version of this story appeared in Pop Life C...,74,413,390,119,37
63,63,"Analysis by Brian Lowry, CNN",2022-02-08,entertainment,entertainment,https://www.cnn.com/2022/02/08/entertainment/a...,Analysis: Academy Award nominations: No Spider...,The Oscars are about honoring the year's best ...,"entertainment, Analysis: Academy Award nominat...","Oscars 2022: No Spider-Man, no Bond. Can strea...",(CNN)The Oscars are about honoring the year's...,95,413,390,2,54
65,65,"Ben Church, CNN",2022-03-20,sport,motorsport,https://www.cnn.com/2022/03/20/motorsport/bahr...,Bahrain Grand Prix: Ferrari dominates as Charl...,Charles Leclerc won the Bahrain Grand Prix on ...,"motorsport, Bahrain Grand Prix: Ferrari domina...",Bahrain Grand Prix: Ferrari dominates as Charl...,(CNN)Charles Leclerc won the Bahrain Grand Pr...,799,15517,1384,2,43
66,66,"Ben Morse and Amanda Davies, CNN",2022-03-19,sport,sport,https://www.cnn.com/2022/03/19/sport/saudi-gol...,Greg Norman: Lucrative Saudi-backed golf leagu...,The new lucrative Saudi Arabia-backed golf lea...,"sport, Greg Norman: Lucrative Saudi-backed gol...",Lucrative Saudi-backed golf league is 'new opp...,(CNN)The new lucrative Saudi Arabia-backed go...,1061,15517,5624,2,37


# CNN news parser class 

In [145]:
class CnnNewsParser:
    def __init__(self, df, id_col='Url'):
        self.df = df
        self.id_col = id_col
        self.df[self.id_col] = self.df[self.id_col].astype("string")
        # drop duplicates
        self.df.drop_duplicates(subset=[self.id_col], inplace=True)
        self.unique_ks = np.unique(df['Section'])
        self.exclude_authors = ['cnn', 'CNN']
        self.parse_func_dict = {
            'Keywords': self.parse_keywords,
            'Author': self.parse_authors,
            'Date published': self.parse_date, 
            'Section': self.parse_section,
            'Category': self.parse_categories
        }
        # check if all columns are in the df
        for k in self.parse_func_dict.keys():
            assert k in self.df.columns, f"{k} is not in the df"
        self.build_graph()

    def build_graph(self):
        self.G = BipartiteGraphDict()
        Knodes_dict = {}
        edges_dict = {}
        parse_infor_dict = {}
        for k in self.parse_func_dict.keys():
            Knodes_dict[k] = []
            edges_dict[k] = []
            parse_infor_dict[k] = self.df[k].map(self.parse_func_dict[k])

        for idx in tqdm(range(self.df.shape[0])):
            document_id = self.df[self.id_col].iloc[idx]
            for k in self.parse_func_dict.keys():
                Knodes_dict[k], edges_dict[k] = graph_extend_node_edge(idx, parse_infor_dict[k], k, Knodes_dict[k], document_id, edges_dict[k])

        self.G.add_data_nodes(set(self.df[self.id_col].tolist()))
        for k in self.parse_func_dict.keys():
            Knodes_dict[k] = set(Knodes_dict[k])
            edges_dict[k] = set(edges_dict[k])
            self.G.add_keyword_nodes(Knodes_dict[k])
            self.G.add_raw_edges(edges_dict[k])
        
        self.Knodes_dict = Knodes_dict
        self.edges_dict = edges_dict

    def parse_keywords(self, key_str):
        exclude_k = self.unique_ks
        def parse_colon(kstr):
            # parse "Paris attacks: What you need to know - CNN", get "Paris attacks"
            key_l = parse_by_comma[-1].split(":")
            if len(key_l) > 1:
                return key_l[0]
            return False
        key_list = []
        key_str = key_str.lower()
        parse_by_comma = key_str.split(", ")
        for k in parse_by_comma:
            if ":" in k:
                tmp = parse_colon(k)
                if tmp: 
                    k = tmp
                    # print(k)
            if k not in exclude_k:
                key_list.append(k.replace(" ", "-"))
        return key_list

    def parse_authors(self, author_str):
        '''
        This function parses the authors string into a list of authors

        args: 
            - author_str: string of authors
        return: list of authors
        '''
        exclude_list = self.exclude_authors
        authors = []
        author_str = author_str.lower()
        space_to = ""
        # remove noises
        # author_str = author_str.replace(" ", space_to)
        author_str = author_str.replace("\n", "")
        parse_by_comma = author_str.split(", ")
        for i, a in enumerate(parse_by_comma):
            if 'by' in a:
                a = a.split('by')[-1]
            # parse by 'and'
            if 'and' in a:
                al = []
                for v in a.split(' and '):
                    tmp = v.replace(" ", space_to)
                    if len(tmp)!=0 and (tmp not in exclude_list): 
                        al.append(tmp)
                a = al[:]
                        
            if type(a) is list:
                authors.extend(a)
            else:
                if len(a.replace(" ", space_to))!=0:
                    if a.replace(" ", space_to) in exclude_list: continue
                    authors.append(a.replace(" ", space_to))
        return authors

    def parse_date(self, time_str):
        return time_str.split(" ")[0]
    
    def parse_section(self, section_str):
        return section_str
    
    def parse_categories(self, cat_str):
        return cat_str

In [146]:
cnn_news = CnnNewsParser(filter_df)

  0%|          | 0/8856 [00:00<?, ?it/s]

In [147]:
cnn_news.edges_dict.keys()

dict_keys(['Keywords', 'Author', 'Date published', 'Section', 'Category'])

In [149]:
cnn_news.edges_dict['Section']

{('https://www.cnn.com/2018/02/11/sport/red-gerard-winter-olympics-first-us-gold-intl/index.html',
  'Section:sport'),
 ('https://www.cnn.com/2014/03/18/world/europe/chechen-rebel-killed-report/index.html',
  'Section:europe'),
 ('https://www.cnn.com/2016/04/07/golf/golf-masters-day-one-spieth-mcilroy/index.html',
  'Section:golf'),
 ('https://www.cnn.com/2021/10/27/sport/roger-goodell-nfl-wft-investigation-spt-intl/index.html',
  'Section:sport'),
 ('https://www.cnn.com/2014/03/15/world/europe/crimea-russia-mcdonalds/index.html',
  'Section:europe'),
 ('https://www.cnn.com/2015/11/06/europe/kiron-university-refugees/index.html',
  'Section:europe'),
 ('https://www.cnn.com/2014/08/07/world/europe/russia-ukraine-crisis/index.html',
  'Section:europe'),
 ('https://www.cnn.com/2018/10/01/golf/ryder-cup-usa-europe-paris-analysis-spt-intl/index.html',
  'Section:golf'),
 ('https://www.cnn.com/2015/08/11/europe/azerbaijan-journalist-killed/index.html',
  'Section:europe'),
 ('https://www.cnn