In [1]:
#import required libraries
import requests
from bs4 import BeautifulSoup
import os

# MSNBC Web-Scraping

## Step 1: Get the data from the home page of the transcripts website
- get a list of the dates across the entire dataset
- create a list and just iterate through the list and append the dataset to get 

Source on web-scraping: https://realpython.com/beautiful-soup-web-scraper-python/#step-3-parse-html-code-with-beautiful-soup 


## Get to the website and convert to a soup object

In [2]:
#step 1: store the URL
rachel_maddow_URL = "https://www.msnbc.com/transcripts/show/rachel-maddow-show"

#step 2: get to the landing page of the website
landing_page = requests.get(rachel_maddow_URL)

#step 3: convert page into html
soup = BeautifulSoup(landing_page.content, "html.parser")

#below is a potential alternative additional link to look at/start with
# all show transcripts from 3/1/2020 to 3/31/2022
#https://www.msnbc.com/transcripts/show/rachel-maddow-show?dateRange=2020-03-01+TO+2022-03-31

## Get the Transript Feed (A collection of individual transcript cards)

In [3]:
#<div class="transcripts-page">
#<div class="transcripts-feed">

#step 4: extract the transcript feed page part of the soup
transcript_feed = soup.find(class_="transcripts-feed")

#print out example of the html output that is contained in transcript feed
print(transcript_feed.prettify()[:500])


<div class="transcripts-feed">
 <div class="transcript-card">
  <div class="transcript-card__air-date">
   Mar 16 2022
  </div>
  <div class="transcript-card__details">
   <a class="transcript-card__show-name" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">
    The Rachel Maddow Show
   </a>
   <a class="transcript-card__headline" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">


## Extract Individual Transcript Cards 

In [4]:
#step 5: get all of the transcript cards -- creates an object that is iterable to look at
transcript_cards = transcript_feed.find_all("div", class_ ="transcript-card")

In [6]:
#step 6: display the results of the above
for card in transcript_cards[:2]:
    print(card, end="\n"*2)

<div class="transcript-card"><div class="transcript-card__air-date">Mar 16 2022</div><div class="transcript-card__details"><a class="transcript-card__show-name" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">The Rachel Maddow Show</a><a class="transcript-card__headline" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">Transcript: The Rachel Maddow Show, 3/15/22</a><span class="transcript-card__guests"><span class="">Guests: Serhiy Leshchenko, Amy Klobuchar, Alexander Goncharov</span></span></div></div>

<div class="transcript-card"><div class="transcript-card__air-date">Mar 15 2022</div><div class="transcript-card__details"><a class="transcript-card__show-name" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-14-22-n1292021">The Rachel Maddow Show</a><a class="transcript-card__headline" href="https://www.msnbc.com/transcripts/ra

## Extract Air Date, Transcript URL, & Guests From Each Transcript Card

In [30]:
#create the nested dictionary to hold the results (transcript url, guest, and text of the show)
transcript_info = {} #key = air date, value is a dictionary with the (transcript url, guest, and text of the show)

#iterate through the transcript cards & extract information
for card in transcript_cards[:1]:
    air_date = card.find("div", class_="transcript-card__air-date").text
    transcript_url = card.find("a", class_="transcript-card__show-name")["href"] #get the link for the website
    guests = card.find("span", class_="transcript-card__guests").text[8:].split(",")#removes a "guests:" tag from in front of the list of guests 

    transcript_info[air_date] = {"transcript_url": transcript_url,
                                 "guests": guests }
    
print(transcript_info)

{'Mar 16 2022': {'transcript_url': 'https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030', 'guests': ['Serhiy Leshchenko', ' Amy Klobuchar', ' Alexander Goncharov']}}


## Now, iterate through all of the transcript info to add in the actual text of the transcripts

In [44]:
# #create a list of the air dates
# air_dates = list(transcript_info.keys())
# #print(air_dates[0])

# transcript_info[air_dates[0]]["transcript_url"]

for air_date in list(transcript_info.keys()):

    #get needed information:
    show_url = transcript_info[air_date]["transcript_url"]
    print(show_url)
    

    #go to the url
    show_page = requests.get(show_url)

    #convert the page into a soup object
    show_soup =  BeautifulSoup(show_page.content, "html.parser")


    #get the content of the transcript = <div class="article-body__content">

    #extract the transcript feed page part of the soup
    transcript_text = show_soup.find(class_="article-body__content").text

    #add the transcript text to the dictionary
    transcript_info[air_date]["transcript_text"] = transcript_text

    

    



https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030


# Export the text of the transcript file

In [58]:
#export the file
for air_date in transcript_info:

    #extract the text from the transcripts
    text = transcript_info[air_date]["transcript_text"]

    #create a file name + output file path
    list_air_date = air_date.split()
    
    
    filename = list_air_date[0] + "_" + list_air_date[1] + "_" + list_air_date[2] + "_" + "rachel" + "_" + "maddow" + "_" + "transcript.txt"
    repo_path = os.path.dirname(os.getcwd())
    output_path = os.path.join(repo_path, "data", "01-raw", "rachel_maddow", filename)

    print(output_path)

    #export the file
    with open(output_path,"w") as f:
        f.write(text)
    

/Users/tobijegede/Documents/GitHub/opinion-news-nlp/data/01-raw/rachel_maddow/Mar_16_2022_rachel_maddow_transcript.txt


In [12]:
'''
Key Components of HTML output:
Each entry = a "transcript card"
Note: the transcript feed is made up of many transcript cards

Transcript Card (<div class="transcript-card">)
    Air Date: <div class="transcript-card__air-date">
    Transcript Card Details:  <div class="transcript-card__details">
        Show Name: <a class="transcript-card__show-name" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">The Rachel Maddow Show</a>
        Headline: <a class="transcript-card__headline" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">Transcript: The Rachel Maddow Show, 3/15/22</a> 
        Guests: <span class="transcript-card__guests"><span class="">Guests: Serhiy Leshchenko, Amy Klobuchar, Alexander Goncharov</span></span></div></div>


'''


'\nKey Components of HTML output:\nEach entry = a "transcript card"\nNote: the transcript feed is made up of many transcript cards\n\nTranscript Card (<div class="transcript-card">)\n    Air Date: <div class="transcript-card__air-date">\n    Transcript Card Details:  <div class="transcript-card__details">\n        Show Name: <a class="transcript-card__show-name" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">The Rachel Maddow Show</a>\n        Headline: <a class="transcript-card__headline" href="https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030">Transcript: The Rachel Maddow Show, 3/15/22</a> \n        Guests: <span class="transcript-card__guests"><span class="">Guests: Serhiy Leshchenko, Amy Klobuchar, Alexander Goncharov</span></span></div></div>\n\n\n\n\n'

# Now, iterate through all of the pages of information 

Note: create a function that conslidates all of the previous steps that you went through the above code to get to this point


    

In [9]:
'''
-- there are 25 pages worth of data in the specified date range

Example url (WITH SPECIFIED DATE RANGE): https://www.msnbc.com/transcripts/show/rachel-maddow-show?page=2&dateRange=2020-03-01+TO+2022-03-31 

starting URL: https://www.msnbc.com/transcripts/show/rachel-maddow-show?dateRange=2020-03-01+TO+2022-03-31 

all other pages:
https://www.msnbc.com/transcripts/show/rachel-maddow-show?page=2&dateRange=2020-03-01+TO+2022-03-31

total_num_pages = 25

for i in range(total_num_pages):
    page_num = i + 1 
    if page_num == 1:
     URL = https://www.msnbc.com/transcripts/show/rachel-maddow-show?dateRange=2020-03-01+TO+2022-03-31 
    
    #insert function here

    #iteratively export the files to the folder

    else: 
    URL = "https://www.msnbc.com/transcripts/show/rachel-maddow-show?page="+ page_num +  "&dateRange=2020-03-01+TO+2022-03-31"


'''

total_num_pages = 25

for i in range(total_num_pages):
    page_num = i + 1 

    if page_num == 1:
        URL = "https://www.msnbc.com/transcripts/show/rachel-maddow-show?dateRange=2020-03-01+TO+2022-03-31"
    #insert function here

    #iteratively export the files to the folder

    else: 
        URL = "https://www.msnbc.com/transcripts/show/rachel-maddow-show?page="+ str(page_num) +  "&dateRange=2020-03-01+TO+2022-03-31"

   # print(URL)

In [None]:
'''
Example URLS:
https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-15-22-n1292030 
https://www.msnbc.com/transcripts/rachel-maddow-show/transcript-rachel-maddow-show-3-14-22-n1292021


'''

## Export Results as txt files with labels from dictionary names to the data folder


In [None]:
## Export Results as txt files with labels from dictionary names to the data folder
    #try getting this one page as an example
