<a href="https://colab.research.google.com/github/uedah1999/GFScraping/blob/main/nds_clip_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documentation and usage
This Colab notebook was written by Hiromichi Ueda '21 (DataSquad) in February 2021.
It was last executed in February 2021 with Python 3.8.3, Selenium 3.141.0, and Pandas 1.2.0.

The script reads the .eml files of News Data Service emails stored in a folder, writes the clip names and URLs they contain to a CSV file, and then downloads each broadcast clip as an mp4 file into the assigned output folder.

For the basics of running a Colab notebook, see the [official introduction](https://colab.research.google.com/notebooks/intro.ipynb#recent=true).

# Install required modules and mount drive data

In [None]:
# Install the browser-automation dependencies used by the clip downloader:
#   1. the chromium-chromedriver apt package,
#   2. a copy of the chromedriver binary in /usr/bin
#      (presumably so Selenium can locate it on PATH -- confirm),
#   3. the Selenium Python package (unpinned, so pip installs the latest release).
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the emails
# and write the CSV / mp4 files there.
# force_remount=True is passed; per the google.colab API this redoes the
# mount even when the Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Change the working directory to the project folder inside the mounted Drive.
# Edit this line to navigate to the desired location.
# NOTE: the path is relative, so it is resolved from the current directory
# (presumably /content in Colab -- confirm). The relative paths configured
# later ('./NDSEmail', './Local Video') are resolved from the directory set here.
%cd drive/MyDrive/COMPS/

# Body of code

In [None]:
import pandas as pd
import time
import requests
import email
import os
from email.header import decode_header
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

In [None]:
# originally created by https://qiita.com/denzow/items/a42d344fa343cd80cf86 (blog post in Japanese)
class MailParser(object):
    """
    Parse an eml file whose path is given.

    After construction the instance exposes:
      subject, to_address, cc_address, from_address:
          decoded header values ("" when the header is absent)
      body:
          str, the decoded payloads of all non-attachment parts concatenated
      attach_file_list:
          list of {"name": file_name, "data": decoded payload} dictionaries,
          one per part that carries a file name
    """

    def __init__(self, mail_file_path):
        """
        read and parse the eml file at mail_file_path

        raises OSError (e.g. FileNotFoundError) if the file cannot be opened
        """
        self.mail_file_path = mail_file_path
        # obtain email.message.Message instance from eml file
        with open(mail_file_path, 'rb') as email_file:
            self.email_message = email.message_from_bytes(email_file.read())
        self.subject = None
        self.to_address = None
        self.cc_address = None
        self.from_address = None
        self.body = ""
        # list of dictionaries for attachments
        # {name: file_name, data: data}
        self.attach_file_list = []
        self._parse()

    def get_attr_data(self):
        """
        obtain email data as one formatted, human-readable string
        (headers, body, and comma-separated attachment file names)
        """
        result = """\
        FROM: {}
        TO: {}
        CC: {}
        SUBJECT: {}
        -----------------------
        BODY:
        {}
        -----------------------
        ATTACH_FILE_NAME:
        {}
        """.format(
            self.from_address,
            self.to_address,
            self.cc_address,
            self.subject,
            self.body,
            ",".join(x["name"] for x in self.attach_file_list)
        )
        return result

    @staticmethod
    def _decode_bytes(raw, charset):
        """
        turn a payload or header fragment into str without raising

        raw: bytes, str or None (None yields "", str is returned unchanged)
        charset: charset name, or None/"" to fall back to UTF-8
        Undecodable bytes are replaced rather than raising, and an unknown
        charset label also falls back to UTF-8.
        """
        if raw is None:
            return ""
        if isinstance(raw, str):
            return raw
        try:
            return raw.decode(charset or "utf-8", errors="replace")
        except LookupError:
            # charset label not known to Python (e.g. "unknown-8bit")
            return raw.decode("utf-8", errors="replace")

    def _parse(self):
        """
        parse eml file
        called by __init__
        """
        self.subject = self._get_decoded_header("Subject")
        self.to_address = self._get_decoded_header("To")
        self.cc_address = self._get_decoded_header("Cc")
        self.from_address = self._get_decoded_header("From")

        # parse body
        for part in self.email_message.walk():
            # if ContentType is multipart, skip
            if part.get_content_maintype() == 'multipart':
                continue
            # obtain attachment file name
            attach_fname = part.get_filename()
            if attach_fname:
                # if there is an attachment name, obtain the file name and data
                self.attach_file_list.append({
                    "name": attach_fname,
                    "data": part.get_payload(decode=True)
                })
                continue
            # no attachment name: treat the part as body text.
            # get_content_charset() returns None when no charset is declared;
            # it must not be passed through str(), which would produce the
            # truthy (and invalid) codec name "None".
            self.body += self._decode_bytes(
                part.get_payload(decode=True),
                part.get_content_charset()
            )

    def _get_decoded_header(self, key_name):
        """
        obtain decoded header
        returns "" if the header is absent
        """
        # if no corresponding item, return empty string
        raw_obj = self.email_message.get(key_name)
        if raw_obj is None:
            return ""
        # decode_header yields (fragment, encoding) pairs; fragments may be
        # bytes or already str, and encoding is None when unspecified
        # (in which case UTF-8 is used)
        return "".join(
            self._decode_bytes(fragment, encoding)
            for fragment, encoding in decode_header(raw_obj)
        )

In [None]:
# returns whether the email contains the url, the program name and url if they exist
def get_program_name_url(nds_eml_path):
    """
    Extract the clip name and clip URL from one News Data Service email.

    nds_eml_path: path of the .eml file to parse

    Returns a tuple (is_clip_email, program_name, program_url):
      - (False, '', '') when the part of the subject before the first comma
        is not 'Media Archive Clip Processed';
      - otherwise (True, name, url), where either string is '' if the
        corresponding line was not found in the body.
    """
    result = MailParser(nds_eml_path)
    if result.subject.split(',')[0] != 'Media Archive Clip Processed':
        return False, '', ''

    program_name = ''
    program_url = ''
    # the body is HTML; each piece of information sits on its own <br>-separated line
    for line in result.body.split('<br>'):
        if line.startswith('PROGRAM NAME'):
            # line containing clip name: drop the 12-character label plus the
            # 2 characters that follow it
            program_name = line[14:]
        elif line.startswith('You can view'):
            # line containing the hyperlink: the URL is the first double-quoted
            # value. Guard against a line without quotes instead of raising
            # IndexError and aborting the whole scrape.
            pieces = line.split('"')
            if len(pieces) > 1:
                program_url = pieces[1]
    return True, program_name, program_url

In [None]:
def nds_email_scraper(email_dir, csv_file_path):
    """
    Build the clip list CSV from the emails stored in a folder.

    email_dir: folder containing the email (.eml) files
    csv_file_path: path of the CSV file to write (overwritten if it exists)

    Every regular file in email_dir is passed to get_program_name_url; each
    email recognised as a clip notification becomes one row with the columns
    'name', 'url' and 'downloaded' (always False at this stage).
    Progress is printed every 50 files.
    """
    programs_list = []
    print('scraping emails')
    count = 0
    # sorted() makes the row order of the CSV reproducible between runs
    for filename in sorted(os.listdir(email_dir)):
        eml_file_path = os.path.join(email_dir, filename)
        # os.listdir also returns sub-directories (e.g. the .ipynb_checkpoints
        # folder Colab/Jupyter may create); opening one as an email would raise
        if not os.path.isfile(eml_file_path):
            continue
        processed_eml, prog_name, prog_url = get_program_name_url(eml_file_path)
        if processed_eml:
            programs_list.append([prog_name, prog_url, False])
        count += 1
        if count % 50 == 0:
            print('scraped {} emails'.format(count))
    df = pd.DataFrame(programs_list, columns=['name', 'url', 'downloaded'])
    df.to_csv(csv_file_path, index=False)
    print('emails have been scraped')

In [None]:
def nds_clip_loader(csv_file_path, mp4_dir):
    """
    Download every clip listed in the CSV that is not yet marked as downloaded.

    csv_file_path: CSV with the columns 'name', 'url' and 'downloaded'
                   (as written by nds_email_scraper); updated in place
    mp4_dir: root output folder; each clip is saved as
             '<mp4_dir>/<first 4 characters of name> Video/<name>.mp4'

    For each row one status character is printed:
      N  clip was already marked as downloaded (row is skipped)
      S  clip downloaded successfully
      F  download failed (the browser is restarted before the next row)
    The CSV is saved every 50 rows and again when the function exits, even if
    it exits because of an error or a keyboard interrupt.
    """
    # options to make webdriver run in the background
    # taken from https://medium.com/@darektidwell1980/using-selenium-with-google-colaboratory-ca4a4f21021f
    op = webdriver.ChromeOptions()
    op.add_argument('-headless')
    op.add_argument('-no-sandbox')
    op.add_argument('-disable-dev-shm-usage')

    eml_df = pd.read_csv(csv_file_path)

    # keyword-only construction works on both Selenium 3.141 and Selenium 4;
    # the driver executable defaults to 'chromedriver' found on PATH
    driver = webdriver.Chrome(options=op)
    print('start downloading clips')
    try:
        for index, row in eml_df.iterrows():
            if row['downloaded']:
                # clip has been downloaded already: no need to load the page
                print('N', end='')
            else:
                try:
                    prog_name = row['name']
                    prog_station = prog_name[:4]
                    driver.get(row['url'])
                    # give the page time to render the download link
                    time.sleep(10)
                    # find_element('xpath', ...) is available in Selenium 3 and 4,
                    # unlike find_element_by_xpath which newer Selenium 4 releases removed
                    mp4_link = driver.find_element(
                        'xpath', '//*[@id="DownloadText"]/a'
                    ).get_attribute('href')

                    # change the filepath appropriately to meet the purpose
                    target_dir = '{}/{} Video'.format(mp4_dir, prog_station)
                    os.makedirs(target_dir, exist_ok=True)
                    target_path = '{}/{}.mp4'.format(target_dir, prog_name)
                    # download to a temporary name first so an interrupted
                    # transfer never leaves a truncated .mp4 behind
                    partial_path = target_path + '.part'
                    with requests.get(mp4_link, stream=True, timeout=60) as r:
                        # do not save an HTTP error page as if it were the clip
                        r.raise_for_status()
                        with open(partial_path, 'wb') as f:
                            for chunk in r.iter_content(chunk_size=1024*1024):
                                if chunk:
                                    f.write(chunk)
                    os.replace(partial_path, target_path)
                    eml_df.loc[index, 'downloaded'] = True
                    print('S', end='') # clip downloaded
                except Exception:
                    # Exception (not a bare except) so Ctrl-C still stops the run
                    print('F', end='') # clip not downloaded
                    # start from a fresh browser in case the session is broken
                    driver.quit()
                    driver = webdriver.Chrome(options=op)

            if (index+1)%50 == 0:
                # periodic checkpoint of the download status
                eml_df.to_csv(csv_file_path, index=False)
                print('')
    finally:
        # persist progress first, then release the browser
        eml_df.to_csv(csv_file_path, index=False)
        driver.quit()
    print('\ndownloading complete')

# Run the main function

In [None]:
# change the file and folder path appropriately
# email_dir:     folder whose files are parsed by nds_email_scraper
# mp4_dir:       root folder for the downloaded clips; nds_clip_loader writes
#                '<mp4_dir>/<first 4 characters of clip name> Video/<clip name>.mp4'
# csv_file_path: CSV written by nds_email_scraper and read/updated by nds_clip_loader
email_dir = './NDSEmail'
mp4_dir = './Local Video'
csv_file_path = './Local Video/nds_clips.csv'

In [None]:
# scrape emails: parse the emails in email_dir and write the
# name / url / downloaded table to csv_file_path (overwriting it)
nds_email_scraper(email_dir, csv_file_path)

In [None]:
# download mp4 clips from each hyperlink listed in csv_file_path into mp4_dir;
# the 'downloaded' column of the CSV is updated as clips are saved
nds_clip_loader(csv_file_path, mp4_dir)

In [None]:
# reload the CSV and count how many rows have each value of the
# 'downloaded' column (displayed as the cell output)
df = pd.read_csv(csv_file_path)
df['downloaded'].value_counts()

In [None]:
# keep only the rows not yet marked as downloaded (uses df loaded in the
# previous cell) and save them to a separate CSV; index=False is not passed,
# so the DataFrame index is written as the first column
df_unscraped = df[~df['downloaded']]
df_unscraped.to_csv('./Local Video/nds_unscraped_clips.csv')