# Independent Research, INFO 4900
## Wally Chang wsc46

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import requests
import bs4
import textblob
import pathlib
from bs4 import BeautifulSoup
import time
import os
import glob
from datetime import datetime

In [3]:
def clean_newsletters_MB(newsletters_df):
    """Clean a DataFrame of scraped Morning Brew newsletters in place.

    Expects a 'title' column of strings and a 'paragraphs' column holding a
    list of strings per row. The frame is modified in place:

    * in every paragraph, the text from 'Share Morning Brew with your
      friends' onward is cut off;
    * the ' | Morning Brew' suffix is stripped from each title;
    * paragraphs containing 'PRESENTED BY', 'TOGETHER WITH', 'SPONSORED BY'
      or a newline are dropped, and the remaining paragraphs are joined
      into a single space-separated string;
    * a constant 'newsletter' column with the value 'MB' is inserted as the
      first column.

    Call it once per freshly built frame: it is not idempotent, because the
    final insert fails when a 'newsletter' column already exists.

    Parameters
    ----------
    newsletters_df : pandas.DataFrame
        Frame to clean; mutated in place.

    Returns
    -------
    None
    """
    truncate_keywords = ['Share Morning Brew with your friends']

    def remove_text_after_keyword(paragraph, keyword):
        """Return a new list with each text cut at the first occurrence of `keyword`."""
        # str.split(keyword)[0] is the whole text when the keyword is absent,
        # so no separate membership test is needed.
        return [text.split(keyword)[0] for text in paragraph]

    def truncate_paragraph(paragraph):
        """Apply every truncation keyword to one row's list of paragraphs."""
        for keyword in truncate_keywords:
            paragraph = remove_text_after_keyword(paragraph, keyword)
        return paragraph

    # Series.apply needs a callable. The previous code handed it a generator
    # of lambdas (one per keyword), which is not callable and raised.
    newsletters_df['paragraphs'] = newsletters_df['paragraphs'].apply(truncate_paragraph)

    newsletters_df['title'] = newsletters_df['title'].str.replace(r'\s*\| Morning Brew', '', regex=True)

    # Drop sponsor blocks and multi-line fragments, then flatten to one string.
    filter_keywords = ['PRESENTED BY', 'TOGETHER WITH', 'SPONSORED BY', '\n']
    newsletters_df['paragraphs'] = newsletters_df['paragraphs'].apply(
        lambda x: ' '.join(
            paragraph for paragraph in x
            if all(keyword not in paragraph for keyword in filter_keywords)
        )
    )

    # insert column label for the newsletter type
    newsletters_df.insert(0, 'newsletter', 'MB')

In [4]:
# Morning Brew daily-issue pages to scrape. Every entry is a full URL under
# https://www.morningbrew.com/daily/issues/ and the list is passed to
# html_parser() in the scraping cell further down.
url_list = [
    'https://www.morningbrew.com/daily/issues/duty-free',
    'https://www.morningbrew.com/daily/issues/spidey-senses',
    'https://www.morningbrew.com/daily/issues/the-outbreak-of-war',
    'https://www.morningbrew.com/daily/issues/regicidal-chatbot',
    'https://www.morningbrew.com/daily/issues/greased-alaska',
    'https://www.morningbrew.com/daily/issues/makin-waffles',
    'https://www.morningbrew.com/daily/issues/not-tom-hanks',
    'https://www.morningbrew.com/daily/issues/all-about-sbf',
    'https://www.morningbrew.com/daily/issues/fear-the-sphere',
    'https://www.morningbrew.com/daily/issues/Tupac',
    'https://www.morningbrew.com/daily/issues/ryde-or-die',
    'https://www.morningbrew.com/daily/issues/influencers-not-welcome',
    'https://www.morningbrew.com/daily/issues/monopoly-money',
    'https://www.morningbrew.com/daily/issues/clomp',
    'https://www.morningbrew.com/daily/issues/all-the-write-moves',
    'https://www.morningbrew.com/daily/issues/bets-are-off',
    'https://www.morningbrew.com/daily/issues/choo-choo',
    'https://www.morningbrew.com/daily/issues/shut-it-down',
    'https://www.morningbrew.com/daily/issues/house-of-mouse',
    'https://www.morningbrew.com/daily/issues/lost-and-found',
    'https://www.morningbrew.com/daily/issues/tesla-wins',
    'https://www.morningbrew.com/daily/issues/big-returns',
    'https://www.morningbrew.com/daily/issues/fury-road',
    'https://www.morningbrew.com/daily/issues/grass-is-greener',
    'https://www.morningbrew.com/daily/issues/sneezes',
    'https://www.morningbrew.com/daily/issues/taking-the-train',
    'https://www.morningbrew.com/daily/issues/tbd2',
    'https://www.morningbrew.com/daily/issues/extreme-room-makeover',
    'https://www.morningbrew.com/daily/issues/the-great-gloom',
    'https://www.morningbrew.com/daily/issues/mind-the-gap',
    'https://www.morningbrew.com/daily/issues/airbnb-crackdown',
    'https://www.morningbrew.com/daily/issues/desert-escapes',
    'https://www.morningbrew.com/daily/issues/back-to-school',
    'https://www.morningbrew.com/daily/issues/mcbroken',
    'https://www.morningbrew.com/daily/issues/landfall-',
    'https://www.morningbrew.com/daily/issues/close-encounters',
    'https://www.morningbrew.com/daily/issues/throwing-eggs',
    'https://www.morningbrew.com/daily/issues/getting-trippy',
    'https://www.morningbrew.com/daily/issues/work-drama',
    'https://www.morningbrew.com/daily/issues/mugged',
    'https://www.morningbrew.com/daily/issues/to-the-moon-india',
    'https://www.morningbrew.com/daily/issues/tbd23',
    'https://www.morningbrew.com/daily/issues/mall-madness',
    'https://www.morningbrew.com/daily/issues/hurriquake',
    'https://www.morningbrew.com/daily/issues/traffic-jams',
    'https://www.morningbrew.com/daily/issues/the-wig',
    'https://www.morningbrew.com/daily/issues/songs-of-summer',
    'https://www.morningbrew.com/daily/issues/winners-and-losers',
    'https://www.morningbrew.com/daily/issues/blindsided',
    'https://www.morningbrew.com/daily/issues/tbd1',
    'https://www.morningbrew.com/daily/issues/disney-100',
    'https://www.morningbrew.com/daily/issues/tbdf',
    'https://www.morningbrew.com/daily/issues/mom-in-space',
    'https://www.morningbrew.com/daily/issues/usd1t-in-credit-card-debt',
    'https://www.morningbrew.com/daily/issues/strike-city',
    'https://www.morningbrew.com/daily/issues/ncaa-shakeup',
    'https://www.morningbrew.com/daily/issues/love-in-the-time-of-ai',
    'https://www.morningbrew.com/daily/issues/don-t-go',
    'https://www.morningbrew.com/daily/issues/prediction-retraction',
    'https://www.morningbrew.com/daily/issues/back-from-the-beyond',
    'https://www.morningbrew.com/daily/issues/sandals-and-stocks',
    'https://www.morningbrew.com/daily/issues/narrative-violation',
    'https://www.morningbrew.com/daily/issues/no-spring-chickens',
    'https://www.morningbrew.com/daily/issues/hottest-month-ever',
    'https://www.morningbrew.com/daily/issues/insider-trading-in-the-sky',
    'https://www.morningbrew.com/daily/issues/truce',
    'https://www.morningbrew.com/daily/issues/digital-renaissance',
    'https://www.morningbrew.com/daily/issues/x-everything',
    'https://www.morningbrew.com/daily/issues/dumbed-down',
    'https://www.morningbrew.com/daily/issues/barbenheimer-is-here',
    'https://www.morningbrew.com/daily/issues/ice-cream-so-good',
    'https://www.morningbrew.com/daily/issues/daydreaming',
    'https://www.morningbrew.com/daily/issues/lightning-price-strike',
    'https://www.morningbrew.com/daily/issues/florida-man',
    'https://www.morningbrew.com/daily/issues/i-like-big-banks',
    'https://www.morningbrew.com/daily/issues/crickets',
    'https://www.morningbrew.com/daily/issues/new-epoch',
    'https://www.morningbrew.com/daily/issues/money-now',
    'https://www.morningbrew.com/daily/issues/sf-unicorns',
    'https://www.morningbrew.com/daily/issues/that-s-hot',
    'https://www.morningbrew.com/daily/issues/threads-blasts-off',
    'https://www.morningbrew.com/daily/issues/clothes-for-rent',
    'https://www.morningbrew.com/daily/issues/leave-your-legacy',
    'https://www.morningbrew.com/daily/issues/fireworks-get-fired',
    'https://www.morningbrew.com/daily/issues/no-forgiveness',
    'https://www.morningbrew.com/daily/issues/taming-aspartame',
    'https://www.morningbrew.com/daily/issues/members-only',
    'https://www.morningbrew.com/daily/issues/ryan-takes-the-wheel',
    'https://www.morningbrew.com/daily/issues/jell-o-champs',
    'https://www.morningbrew.com/daily/issues/what-just-happened',
    'https://www.morningbrew.com/daily/issues/tbd',
    'https://www.morningbrew.com/daily/issues/the-search-is-over',
    'https://www.morningbrew.com/daily/issues/tricking-and-trapping',
    'https://www.morningbrew.com/daily/issues/sign-of-life',
    'https://www.morningbrew.com/daily/issues/pub-problems',
    'https://www.morningbrew.com/daily/issues/grimacing',
    'https://www.morningbrew.com/daily/issues/selling-faces',
    'https://www.morningbrew.com/daily/issues/dethroned-beers',
    'https://www.morningbrew.com/daily/issues/new-beatles-track',
    'https://www.morningbrew.com/daily/issues/many-pennies',
    'https://www.morningbrew.com/daily/issues/reddit-revolt',
    'https://www.morningbrew.com/daily/issues/supercharged',
    'https://www.morningbrew.com/daily/issues/we-want-you-back',
    'https://www.morningbrew.com/daily/issues/smoke-signals',
    'https://www.morningbrew.com/daily/issues/shock-golf-merger',
    'https://www.morningbrew.com/daily/issues/mixed-reality',
    'https://www.morningbrew.com/daily/issues/sonic-boom',
    'https://www.morningbrew.com/daily/issues/so-hot-right-now',
    'https://www.morningbrew.com/daily/issues/we-are-so-back',
    'https://www.morningbrew.com/daily/issues/not-all-rainbows',
    'https://www.morningbrew.com/daily/issues/extinction-event',
    'https://www.morningbrew.com/daily/issues/raise-the-roof',
    'https://www.morningbrew.com/daily/issues/big-deal',
    'https://www.morningbrew.com/daily/issues/the-wedding-issue',
    'https://www.morningbrew.com/daily/issues/they-found-shaq',
    'https://www.morningbrew.com/daily/issues/abercrombie-is-back',
    'https://www.morningbrew.com/daily/issues/shady',
    'https://www.morningbrew.com/daily/issues/upping-the-river',
    'https://www.morningbrew.com/daily/issues/crunch-time',
    'https://www.morningbrew.com/daily/issues/potentially-explosive',
    'https://www.morningbrew.com/daily/issues/buffapenolte-ranch',
    'https://www.morningbrew.com/daily/issues/royal-hunt',
    'https://www.morningbrew.com/daily/issues/t-co-t-esday',
    'https://www.morningbrew.com/daily/issues/big-dreams',
    'https://www.morningbrew.com/daily/issues/rockin-the-suburbs',
    'https://www.morningbrew.com/daily/issues/new-chief-twit',
    'https://www.morningbrew.com/daily/issues/satisfaction',
    'https://www.morningbrew.com/daily/issues/crimes-of-dishonesty',
    'https://www.morningbrew.com/daily/issues/pulling-the-strings',
    'https://www.morningbrew.com/daily/issues/not-so-trusting',
    'https://www.morningbrew.com/daily/issues/the-world-according-to-buffett',
    'https://www.morningbrew.com/daily/issues/lucky-in-kentucky',
    'https://www.morningbrew.com/daily/issues/king-me',
    'https://www.morningbrew.com/daily/issues/round-ten',
    'https://www.morningbrew.com/daily/issues/study-buddy',
    'https://www.morningbrew.com/daily/issues/crystal-ball',
    'https://www.morningbrew.com/daily/issues/first-republic-fails',
    'https://www.morningbrew.com/daily/issues/fill-the-beyond',
    'https://www.morningbrew.com/daily/issues/jerry-and-judy',
    'https://www.morningbrew.com/daily/issues/disney-goes-big',
    'https://www.morningbrew.com/daily/issues/ai-2024',
    'https://www.morningbrew.com/daily/issues/two-stars',
    'https://www.morningbrew.com/daily/issues/bed-bath-and-bankrupt',
    'https://www.morningbrew.com/daily/issues/it-s-melting',
    'https://www.morningbrew.com/daily/issues/wake-up-in-vegas',
    'https://www.morningbrew.com/daily/issues/not-so-cool-ceos',
    'https://www.morningbrew.com/daily/issues/it-s-a-deal',
    'https://www.morningbrew.com/daily/issues/viral-beats',
    'https://www.morningbrew.com/daily/issues/defamation-drama',
    'https://www.morningbrew.com/daily/issues/first-bet',
    'https://www.morningbrew.com/daily/issues/og',
    'https://www.morningbrew.com/daily/issues/eighth-inning-cold-one',
    'https://www.morningbrew.com/daily/issues/eurotrip',
    'https://www.morningbrew.com/daily/issues/party-s-over',
    'https://www.morningbrew.com/daily/issues/plugging-a-leak',
    'https://www.morningbrew.com/daily/issues/picking-up-shifts',
    'https://www.morningbrew.com/daily/issues/free-trips',
    'https://www.morningbrew.com/daily/issues/burrito-brawl',
    'https://www.morningbrew.com/daily/issues/breaking-the-seal',
    'https://www.morningbrew.com/daily/issues/lunar-lineup',
    'https://www.morningbrew.com/daily/issues/ultimate-tag-team',
    'https://www.morningbrew.com/daily/issues/the-boot',
    'https://www.morningbrew.com/daily/issues/it-happened',
    'https://www.morningbrew.com/daily/issues/pause-plz',
    'https://www.morningbrew.com/daily/issues/mammoth-meatballs',
    'https://www.morningbrew.com/daily/issues/east-west-split',
    'https://www.morningbrew.com/daily/issues/i-m-him',
    'https://www.morningbrew.com/daily/issues/irrational',
    'https://www.morningbrew.com/daily/issues/no-likes',
    'https://www.morningbrew.com/daily/issues/final-season',
    'https://www.morningbrew.com/daily/issues/tortoise-and-the-hare',
    'https://www.morningbrew.com/daily/issues/wiped-out',
    'https://www.morningbrew.com/daily/issues/time-s-up',
    'https://www.morningbrew.com/daily/issues/cracker-stackers',
    'https://www.morningbrew.com/daily/issues/foiled',
    'https://www.morningbrew.com/daily/issues/back-on-edge',
    'https://www.morningbrew.com/daily/issues/legal-showdown-boneless-wing-wing',
    'https://www.morningbrew.com/daily/issues/the-b-word',
    'https://www.morningbrew.com/daily/issues/made-whole',
    'https://www.morningbrew.com/daily/issues/silicon-valley-bankrun',
    'https://www.morningbrew.com/daily/issues/bank-rout',
    'https://www.morningbrew.com/daily/issues/canceled-sneakers',
    'https://www.morningbrew.com/daily/issues/getting-revenge',
    'https://www.morningbrew.com/daily/issues/steve-the-snowplow',
    'https://www.morningbrew.com/daily/issues/doctor-who',
    'https://www.morningbrew.com/daily/issues/going-live',
    'https://www.morningbrew.com/daily/issues/snow-hey-oh',
    'https://www.morningbrew.com/daily/issues/cheap-shots',
    'https://www.morningbrew.com/daily/issues/second-city',
    'https://www.morningbrew.com/daily/issues/forgiveness',
    'https://www.morningbrew.com/daily/issues/1-5x-mode',
    'https://www.morningbrew.com/daily/issues/only-one-meaning',
    'https://www.morningbrew.com/daily/issues/1-year-later',
    'https://www.morningbrew.com/daily/issues/short-supply',
    'https://www.morningbrew.com/daily/issues/break-the-internet',
    'https://www.morningbrew.com/daily/issues/top-secret',
    'https://www.morningbrew.com/daily/issues/anything-you-can-do',
    'https://www.morningbrew.com/daily/issues/unhinged',
    'https://www.morningbrew.com/daily/issues/recalled',
    'https://www.morningbrew.com/daily/issues/clean-up',
    'https://www.morningbrew.com/daily/issues/the-singles-tax',
    'https://www.morningbrew.com/daily/issues/influencer-fatigue',
    'https://www.morningbrew.com/daily/issues/super-monday',
    'https://www.morningbrew.com/daily/issues/0-stars',
    'https://www.morningbrew.com/daily/issues/what-the-balloon-knew',
    'https://www.morningbrew.com/daily/issues/jinx',
    'https://www.morningbrew.com/daily/issues/bing-s-back',
    'https://www.morningbrew.com/daily/issues/booze-bowl',
    'https://www.morningbrew.com/daily/issues/popped',
    'https://www.morningbrew.com/daily/issues/big-jobs',
    'https://www.morningbrew.com/daily/issues/greetings-ents',
    'https://www.morningbrew.com/daily/issues/brady-s-nest-egg',
    'https://www.morningbrew.com/daily/issues/price-war',
    'https://www.morningbrew.com/daily/issues/wall-street-anniversary',
    'https://www.morningbrew.com/daily/issues/pay-cuts-up-top',
    'https://www.morningbrew.com/daily/issues/pretty-pretty-good',
    'https://www.morningbrew.com/daily/issues/common-threads',
    'https://www.morningbrew.com/daily/issues/and-the-nominees-are',
    'https://www.morningbrew.com/daily/issues/candy-culture',
    'https://www.morningbrew.com/daily/issues/lunar-new-year-tragedy',
    'https://www.morningbrew.com/daily/issues/look-up',
    'https://www.morningbrew.com/daily/issues/extraordinary-measures',
    'https://www.morningbrew.com/daily/issues/twitter-garage-sale',
    'https://www.morningbrew.com/daily/issues/tiktok-u',
    'https://www.morningbrew.com/daily/issues/lasers-vs-lightning',
    'https://www.morningbrew.com/daily/issues/caught-you',
    'https://www.morningbrew.com/daily/issues/unidentified',
    'https://www.morningbrew.com/daily/issues/economic-prophets',
    'https://www.morningbrew.com/daily/issues/new-caviar',
    'https://www.morningbrew.com/daily/issues/dual-emergencies',
    'https://www.morningbrew.com/daily/issues/world-s-best',
    'https://www.morningbrew.com/daily/issues/echoes-of-jan-6',
    'https://www.morningbrew.com/daily/issues/romance-and-lies',
    'https://www.morningbrew.com/daily/issues/the-limit-does-exist',
    'https://www.morningbrew.com/daily/issues/gadgets-galore',
    'https://www.morningbrew.com/daily/issues/stalled',
    'https://www.morningbrew.com/daily/issues/prayers-for-hamlin',
    'https://www.morningbrew.com/daily/issues/the-games-edition',
    'https://www.morningbrew.com/daily/issues/photos-of-the-year',
    'https://www.morningbrew.com/daily/issues/biggest-news-story',
    'https://www.morningbrew.com/daily/issues/quitting-quietly',
    'https://www.morningbrew.com/daily/issues/so-metal',
    'https://www.morningbrew.com/daily/issues/most-consequential',
    'https://www.morningbrew.com/daily/issues/brrr',
    'https://www.morningbrew.com/daily/issues/makeover',
    'https://www.morningbrew.com/daily/issues/brief-visit',
    'https://www.morningbrew.com/daily/issues/celebration',
    'https://www.morningbrew.com/daily/issues/epic-fines',
    'https://www.morningbrew.com/daily/issues/what-is-musk-doing',
    'https://www.morningbrew.com/daily/issues/for-glory',
    'https://www.morningbrew.com/daily/issues/what-took-so-long',
    'https://www.morningbrew.com/daily/issues/stonks-response',
    'https://www.morningbrew.com/daily/issues/chilling-out',
    'https://www.morningbrew.com/daily/issues/arrested',
    'https://www.morningbrew.com/daily/issues/splashdown',
    'https://www.morningbrew.com/daily/issues/down-bad',
    'https://www.morningbrew.com/daily/issues/coming-home',
    'https://www.morningbrew.com/daily/issues/judgment-day',
    'https://www.morningbrew.com/daily/issues/exposed',
    'https://www.morningbrew.com/daily/issues/get-real',
    'https://www.morningbrew.com/daily/issues/low-ceiling',
    'https://www.morningbrew.com/daily/issues/stealth',
    'https://www.morningbrew.com/daily/issues/beep-beep',
    'https://www.morningbrew.com/daily/issues/serious-bread',
    'https://www.morningbrew.com/daily/issues/squad-up',
    'https://www.morningbrew.com/daily/issues/subtle-protests',
    'https://www.morningbrew.com/daily/issues/boiling-over',
    'https://www.morningbrew.com/daily/issues/rush-hour',
    'https://www.morningbrew.com/daily/issues/the-big-reefer',
    'https://www.morningbrew.com/daily/issues/gatekeepers',
    'https://www.morningbrew.com/daily/issues/swift-justice',
    'https://www.morningbrew.com/daily/issues/worse-than-enron',
    'https://www.morningbrew.com/daily/issues/public-enemy',
    'https://www.morningbrew.com/daily/issues/sos',
    'https://www.morningbrew.com/daily/issues/8-billion',
    'https://www.morningbrew.com/daily/issues/they-got-50',
    'https://www.morningbrew.com/daily/issues/genius',
    'https://www.morningbrew.com/daily/issues/it-s-a-party',
    'https://www.morningbrew.com/daily/issues/unfixable',
    'https://www.morningbrew.com/daily/issues/democratic-surprise',
    'https://www.morningbrew.com/daily/issues/midterm-madness',
    'https://www.morningbrew.com/daily/issues/unleaded',
    'https://www.morningbrew.com/daily/issues/what-even-is-time',
    'https://www.morningbrew.com/daily/issues/reset',
    'https://www.morningbrew.com/daily/issues/netflix-takes-on-blockbuster',
    'https://www.morningbrew.com/daily/issues/breathe',
    'https://www.morningbrew.com/daily/issues/blue-check',
    'https://www.morningbrew.com/daily/issues/admissions-shakeup',
    'https://www.morningbrew.com/daily/issues/152-year-high',
    'https://www.morningbrew.com/daily/issues/chief-twit',
    'https://www.morningbrew.com/daily/issues/not-so-big-tech',
    'https://www.morningbrew.com/daily/issues/unrecyclable',
    'https://www.morningbrew.com/daily/issues/unclogged',
    'https://www.morningbrew.com/daily/issues/midterm-prep',
    'https://www.morningbrew.com/daily/issues/contradictions',
    'https://www.morningbrew.com/daily/issues/45-days',
    'https://www.morningbrew.com/daily/issues/avogeddon',
    'https://www.morningbrew.com/daily/issues/The-BQs',
    'https://www.morningbrew.com/daily/issues/uncancelable',
    'https://www.morningbrew.com/daily/issues/trouble-on-the-mississippi',
    'https://www.morningbrew.com/daily/issues/sacrifice',
    'https://www.morningbrew.com/daily/issues/netflix-with-ads',
    'https://www.morningbrew.com/daily/issues/pay-up',
    'https://www.morningbrew.com/daily/issues/dunkin-revolt',
    'https://www.morningbrew.com/daily/issues/off-camera'
]

# Code for Loading in HTML from the Web

In [5]:
def date_style(tag):
    """Tag predicate: True only for a <td> whose inline style is exactly the date-cell style.

    The style attribute is compared as a whole string, trailing space
    included, so any other spacing or ordering of the CSS does not match.
    """
    date_cell_style = 'font-family: Arial, Helvetica, sans-serif; font-size: 12px;color:#000000; '
    if tag.name != 'td':
        return False
    return tag.get('style') == date_cell_style

def bullet_points_style(tag):
    """Tag predicate: True only for an <li> whose inline style is exactly the bullet-point style.

    The style attribute must equal 'line-height:22px;margin-bottom:10px'
    character for character.
    """
    bullet_style = 'line-height:22px;margin-bottom:10px'
    return tag.get('style') == bullet_style if tag.name == 'li' else False

# Module-level accumulators filled by html_parser():
#   all_raw_url - raw HTML text of every accepted response
#   all_data    - one dict per issue with 'title', 'date' and 'paragraphs'
all_raw_url, all_data = [], []

def html_parser(url_list, delay=0, max_retries=10, timeout=30):
    """Fetch each newsletter URL and record its title, dates and paragraphs.

    Each URL is requested and parsed with BeautifulSoup. A response whose
    <title> is missing, or is exactly 'Morning Brew', is not accepted: the
    URL is requested again after `delay` seconds, up to `max_retries` extra
    attempts. A URL that never yields an accepted page is reported with a
    printed message and skipped, so one bad URL cannot stall the whole run.

    Side effects (module-level lists):
      * `all_raw_url` gets the raw response text of each accepted page,
        followed by a newline and ' zulu';
      * `all_data` gets one dict per accepted page with the keys 'title',
        'date' and 'paragraphs'.

    Parameters
    ----------
    url_list : iterable of str
        Pages to download.
    delay : int or float, default 0
        Seconds to sleep before retrying a page that was not accepted.
    max_retries : int, default 10
        Number of retries after the first attempt for each URL.
    timeout : int or float, default 30
        Timeout in seconds passed to `requests.get`.

    Returns
    -------
    None

    Notes
    -----
    Exceptions raised by `requests.get` (including timeouts) are not caught
    and propagate to the caller.
    """
    for url in url_list:
        soup = None
        title = None

        # One initial attempt plus up to `max_retries` retries.
        for attempt in range(max_retries + 1):
            response = requests.get(url, timeout=timeout)
            candidate = BeautifulSoup(response.text, 'html.parser')

            # get title; `.title` is None when the page has no <title> element
            title_tag = candidate.title
            candidate_title = title_tag.text.strip() if title_tag is not None else None

            if candidate_title is not None and candidate_title != 'Morning Brew':
                # append raw url responses
                all_raw_url.append(response.text + '\n zulu')
                soup, title = candidate, candidate_title
                break

            if attempt < max_retries:
                time.sleep(delay)  # delay before retrying

        if soup is None:
            print('Skipping', url, '- no issue page after', max_retries + 1, 'attempts')
            continue

        # get dates
        dates_element = soup.find_all(date_style)
        date_texts = [date.text.strip() for date in dates_element]
        # get paragraphs
        paragraphs = soup.find_all(['p', bullet_points_style])
        paragraph_texts = [p.text.strip() for p in paragraphs]

        # dictionary with data for each url
        url_data = {
            'title': title,
            'date': date_texts,
            'paragraphs': paragraph_texts
        }

        # append data to that dictionary
        all_data.append(url_data)

In [6]:
# Scrape every page in url_list, sleeping 2 seconds before each retry.
html_parser(url_list=url_list, delay=2)

In [7]:
# Optional: export the raw Morning Brew HTML responses to a text file (uncomment to run)
# with open("morning_brew_json.txt", "w") as output:
#     for raw_response in all_raw_url:
#         output.write(raw_response)

In [8]:
# One row per scraped issue; columns come from the record keys
# ('title', 'date', 'paragraphs').
newsletters_df = pd.DataFrame(data=all_data)

In [9]:
# clean the newsletter dataframe (the function mutates newsletters_df in place)
clean_newsletters_MB(newsletters_df)

# preview the cleaned dataframe
newsletters_df.head()