In [1]:
import pandas as pd
import bs4 as bs
import urllib.request
from tqdm import tqdm
import numpy as np
import pickle

# load story metadata

In [17]:
url_root = 'https://www.allsides.com/'
url_page = url_root + 'story/admin?page='
tab_num = 0

# loop over pages
# there are about 100 pages as of 12/29/2020, so we set this number to be well above that
dfs = []
for tab_num in tqdm(range(100)):
    # read in the source
    try:
        source = urllib.request.urlopen(url_page + str(tab_num))
        sp = bs.BeautifulSoup(source, 'lxml')
        table = sp.table
        df = pd.read_html(str(table), encoding='utf-8', header=0)[0] # read table with no links

        # get links
        links = []
        for tag in table.find_all('a'):
            if tag.has_attr('href'):
                link = tag.get('href')
                if '/story' in link:
                    links.append(link)
            else:
                print(f'error! missing a link for {link}')
        df['url_story'] = links
        dfs.append(df)
    except:
        print(f'stopped after {tab_num} pages')
        break
        
df = pd.concat(dfs)
df.to_pickle('../data/df_links.pkl')

100%|██████████| 100/100 [02:58<00:00,  1.79s/it]


In [18]:
df

Unnamed: 0,Title of Headline Roundup,Topic,Date,url_story
0,Zelenskyy Meets Biden in Washington,World,2022-12-21,/story/world-zelenskyy-meets-biden-washington
1,What’s In The $1.7 Trillion Omnibus Spending B...,Politics,2022-12-21,/story/politics-what-s-17-trillion-omnibus-spe...
2,Twitter Aided Pentagon Accounts Used For Forei...,Media Industry,2022-12-21,/story/media-industry-twitter-aided-pentagon-a...
3,McCarthy Threatens Senate Republicans on Budge...,Politics,2022-12-21,/story/politics-mccarthy-threatens-block-senat...
4,What Did the January 6 Committee Accomplish?,Criminal Justice,2022-12-20,/story/criminal-justice-what-did-january-6-com...
...,...,...,...,...
45,Senate Blocks Trump Recess Appointments,Politics,2017-08-06,/story/senate-blocks-trump-recess-appointments
46,Trump Defends McMaster,Defense and Security,2017-08-05,/story/trump-defends-mcmaster
47,Sessions to Investigate Leaks,Justice,2017-08-04,/story/sessions-investigate-leaks
48,Mueller Assembles Grand Jury,Politics,2017-08-04,/story/mueller-assembles-grand-jury


# load news title / sources by following links

In [16]:
# add info from link
def get_info_from_url_story(url_story):
    '''add info rom url on a a story
    '''
    story = urllib.request.urlopen(url_story)
    sp_story = bs.BeautifulSoup(story, 'html.parser')

    final_results_single = []
    
    # loop over main three articles
    for div in sp_story.find_all('a', {'class': 'news-title'}):
        title = div.text
        url = div.get('href')

        news_source = div.parent.find('div', {'class':'news-source'}).text
        leaning = div.parent.find('img', {'typeof': 'foaf:Image'}).get('title').replace("AllSides Media Bias Rating: ", '')

        news_text = div.parent.find('div', {'class': 'news-body'}).find('div', {'class': 'body-contents'}).text.strip()
        final_results_single.append({
            'title': title,
            'url': url,
            'source': news_source,
            'leaning': leaning,
            'text': news_text
        })
            
    # loop for other articles
    for div in sp_story.find_all('div', {'class': 'news-title'}):
        title = div.text.strip()
        url = div.parent['href']
        news_source = div.parent.parent.find('div',{'class':'news-source'}).text
        leaning = div.parent.parent.find('img', {'typeof': 'foaf:Image'}).get('title').replace('AllSides Media Bias Rating: ', '')
        final_results_single.append({
            'title': title,
            'url': url,
            'source': news_source,
            'leaning': leaning,
            'text': ''
        })
    return final_results_single

def get_stories(df):
    '''Add list for all stories
    '''
    final_results = []
    for i in tqdm(range(df.shape[0])):
        url_story = url_root + df.iloc[i]['url_story']
        final_results_single = get_info_from_url_story(url_story)
        final_results.append(final_results_single)
    return final_results
    
df_stories = get_stories(df)
df_stories = pd.concat([pd.DataFrame(df_story) for df_story in df_stories])

100%|██████████| 50/50 [01:08<00:00,  1.36s/it]


In [None]:
for i in tqdm(range(df_stories.shape[0])):
    story = urllib.request.urlopen(df_stories.iloc[i]['url'])
    sp_story = bs.BeautifulSoup(story, 'html.parser')
    #df_stories.iloc[i].text = sp_story.find('div', {'class':'article-description'}).text.strip()
    try:
        df_stories.at[i, 'raw_url'] = sp_story.find('div', {'class':'read-more-story'}).find('a').get('href')
    except:
        continue

In [92]:
df_stories

Unnamed: 0,title,url,source,leaning,text,raw_url,full_text
0,Twitter Aided the Pentagon in Its Covert Onlin...,https://www.allsides.com/news/2022-12-21-0927/...,The Intercept,Left,Twitter executives have claimed for years that...,,
1,Twitter aided Pentagon influence operations: r...,https://www.allsides.com/news/2022-12-21-0926/...,NewsNation,Center,"A new installment of the “Twitter Files,” by r...",,
2,Twitter boosted Pentagon propaganda efforts in...,https://www.allsides.com/news/2022-12-21-0925/...,New York Post (News),Lean Right,Twitter has allowed the Defense Department to ...,,
3,She was an ABC News producer. She also was a c...,https://www.allsides.com/news/2022-12-21-0454/...,NPR (Online News),Lean Left,Television news producer Kristen Hentschel was...,,
4,"The 22 Debates That Made Us Rage, Roll Our Eye...",https://www.allsides.com/news/2022-12-20-0359/...,New York Times (Opinion),Left,OPINION Debating is what we do here at Times O...,,
...,...,...,...,...,...,...,...
2395,Kristi Noem Bans TikTok Use On South Dakota St...,https://www.allsides.com/news/2022-11-30-0555/...,The Daily Caller,Right,Republican Gov. Kristi Noem of South Dakota is...,,
2396,Uyghur Activists Want Biden Admin To Press Chi...,https://www.allsides.com/news/2022-11-29-1423/...,Washington Free Beacon,Right,Uyghur activists are demanding that the Biden ...,,
2397,Is the Biden Family’s China Scandal Warping Bi...,https://www.allsides.com/news/2022-11-29-1331/...,Andrew C. McCarthy,Right,OPINION Is there anything more infuriating tha...,,
2398,Warning from US embassy in China signals lockd...,https://www.allsides.com/news/2022-11-28-1323/...,Fox News (Online News),Right,The U.S. embassy in China released a statement...,,


In [45]:
for i in tqdm(range(df_stories.shape[0])):
    df_stories.iloc[i]['text'] = df_stories.iloc[i]['text'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

100%|██████████| 2400/2400 [00:00<00:00, 11185.27it/s]


In [12]:
df_stories.to_pickle('../data/df_stories.pkl')
df_stories.to_csv('../data/temp.csv', index=False, encoding='utf_8_sig')

In [4]:
with open('../data/df_stories.pkl', 'rb') as f:
    df_stories = pickle.load(f)

In [5]:
df_stories.to_csv('../data/temp.csv', index=True, encoding='utf_8_sig')