In [None]:
import json
import os
import pickle
import re
import sys

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

def sort_dict_by_key(d):
    """Return a new dict holding the entries of `d` in ascending key order."""
    ordered_keys = sorted(d)
    return {key: d[key] for key in ordered_keys}

def sort_dict_by_val(d):
    """Return a new dict holding the entries of `d` in descending value order.

    Entries with equal values keep their original relative order.
    """
    ranked_keys = sorted(d, key=d.get, reverse=True)
    return {key: d[key] for key in ranked_keys}

In [None]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context manager closes the handle even if unpickling raises.
with open('../data/filtered_data.pickle', 'rb') as file:
    data = pickle.load(file)

data.head(1)

In [None]:
df = pd.read_csv("../data/CNN_Articels_clean.csv")

print(len(df))

# Keep only the first row for each headline. Reassigning (instead of
# inplace=True) makes the cell's data flow explicit.
df = df.drop_duplicates(subset=['Headline'])

print(len(df))

In [None]:
# Preview the first two rows of df to inspect the available columns.
df.head(2)

In [None]:
def parse_keywords(key_str, exclude_k=None):
    '''
    This function parses a keywords string into a list of normalized keywords

    Each keyword is lower-cased. If a keyword contains a colon, only the text
    in front of the first colon is kept (e.g. "Paris attacks: What you need to
    know - CNN" gives "paris attacks"). Spaces are then replaced by hyphens.

    args:
        - key_str: string of keywords separated by ", "; a non-string value
          (e.g. NaN for a missing cell) yields an empty list
        - exclude_k: keywords to leave out, compared against the lower-cased
          keyword before spaces are replaced by hyphens
    return: list of keywords
    '''
    # Missing values carry no keywords.
    if not isinstance(key_str, str):
        return []
    if exclude_k is None:
        exclude_k = ()

    key_list = []
    for k in key_str.lower().split(", "):
        # Cut at the first colon of *this* keyword; a keyword without a colon
        # is returned unchanged by split.
        k = k.split(":")[0]
        if k not in exclude_k:
            key_list.append(k.replace(" ", "-"))

    return key_list

def parse_authors(author_str):
    '''
    This function parses the authors string into a list of authors

    The string is lower-cased, newlines are removed and it is split on ", ".
    Within each piece, only the text after the last standalone word "by" is
    kept, and the piece is split on the standalone word "and". Spaces are
    removed from every resulting name and empty names are dropped.

    "by" and "and" are matched as whole words so that names which merely
    contain those letters (e.g. "Bobby", "Byron", "Sandra") are left intact.

    args: 
        - author_str: string of authors; a non-string value (e.g. NaN for a
          missing cell) yields an empty list
    return: list of authors
    '''
    # Missing values carry no authors.
    if not isinstance(author_str, str):
        return []

    space_to = ""
    # remove noises
    author_str = author_str.lower().replace("\n", "")

    authors = []
    for piece in author_str.split(", "):
        # Drop a byline prefix such as "by" or "story by".
        piece = re.split(r'\bby\b', piece)[-1]
        # parse by 'and'; also covers a leading "and" left by an Oxford comma
        for name in re.split(r'\band\b', piece):
            name = name.replace(" ", space_to)
            if name:
                authors.append(name)
    return authors

def parse_date(time_str):
    '''
    Return the part of a timestamp string that precedes its first space

    args:
        - time_str: timestamp string, Ex: "2021-07-15 02:46:59"
    return: text before the first space (the whole string if it has no space)
    '''
    head, _sep, _rest = time_str.partition(" ")
    return head

In [None]:
# Flat list of every parsed author across all articles (duplicates kept).
# Iterating the column directly avoids building a Series per row via iterrows().
authors_list = []
for author_str in tqdm(df['Author'], total=len(df)):
    authors_list.extend(parse_authors(author_str))

In [None]:
# df['Keywords'].value_counts()

In [None]:
max_c = 10  # NOTE(review): not read by any cell shown here - confirm before removing
counter = 0  # number of rows processed so far
# Flat list of every parsed keyword across all articles (duplicates kept).
# Iterating the column directly avoids building a Series per row via iterrows().
keywords_list = []
for key_str in tqdm(df['Keywords'], total=len(df)):
    keywords_list.extend(parse_keywords(key_str))
    counter += 1

In [None]:
# Number of distinct keywords in keywords_list.
len(set(keywords_list))

In [None]:
unique_ks = np.unique(df['Section'])

# Build the lookup once: set membership is constant time, whereas testing
# against keywords_list rescans the whole list for every section.
seen_keywords = set(keywords_list)

# Print each section name that never occurs among the parsed keywords.
for k in unique_ks:
    if k not in seen_keywords:
        print(k)

In [None]:
# Number of distinct publication dates (text before the first space).
df['Date published'].map(parse_date).nunique()

In [None]:
# Date part of the first row's 'Date published' value.
parse_date(df['Date published'].iloc[0])

# Use the parse functions to count frequencies for each document

In [None]:
def sum_freq_from_stats(row_parsed_lists, exclude_list=None):
    '''
    This func is used to compute freq for col that contains a list of values. Ex: Authors: [a,b,c]

    Every value is first counted over all rows (a value seen once has count
    1). The freq of a row is then the sum of the counts of its values.

    args:
        - row_parsed_lists: 2d list; should be obtained from Ex: df['Author'].map(parse_authors).
          It is iterated twice, so it must not be a one-shot iterator.
        - exclude_list: any key that you don't want to account for in the
          per-row sums (excluded keys are still counted in the first pass)
    return: list with one summed freq per row, in input order
    '''
    if exclude_list is None:
        exclude_list = []

    stat_dict = {}

    # Loop through it once, get the number of occurrences of each key
    for row_list in tqdm(row_parsed_lists):
        for val in row_list:
            stat_dict[val] = stat_dict.get(val, 0) + 1

    # Loop through a 2nd time, sum the counts of the keys in each row
    freqs = []
    for row_list in tqdm(row_parsed_lists):
        freqs.append(
            sum(stat_dict[val] for val in row_list if val not in exclude_list)
        )
    return freqs

In [None]:
# authors
# Parse every byline, score each document by how often its authors occur
# (the 'cnn' entry is left out of the sums), and attach the score to df.
doc_author_list = df['Author'].map(parse_authors)
author_freqs = sum_freq_from_stats(doc_author_list, exclude_list=['cnn'])
df = df.assign(author_freq=author_freqs)

df.head(3)

In [None]:
# author_stats = sort_dict_by_val(author_stats)

In [None]:
# Category and section
# Occurrence count of every category / section value.
cat_stats = dict(df['Category'].value_counts())
sec_stats = dict(df['Section'].value_counts())

# Look up each row's own category / section count.
cat_freqs = df['Category'].map(cat_stats)
sec_freqs = df['Section'].map(sec_stats)

df = df.assign(cat_freq=cat_freqs, sec_freq=sec_freqs)

df.head(3)

In [None]:
# keywords

# Parse every keyword string, score each document by how often its keywords
# occur (nothing excluded), and attach the score to df.
doc_keys_list = df['Keywords'].map(parse_keywords)
keys_freqs = sum_freq_from_stats(doc_keys_list, exclude_list=[])
df = df.assign(keys_freq=keys_freqs)

df.head(3)

In [None]:
# sort_dict_by_val(keys_stats)

In [None]:
# Date

# Reduce each 'Date published' value to the text before its first space.
doc_date_list = df['Date published'].map(parse_date)

# Count how often each date occurs and look the count up for every row.
date_stats = dict(doc_date_list.value_counts())
date_freqs = doc_date_list.map(date_stats)

# Overwrite the original column with the parsed dates and add the count.
df = df.assign(**{'Date published': doc_date_list, 'date_freq': date_freqs})

df.head(3)