In [1]:
import re
import requests
import time

from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
import pandas as pd
from tqdm import tqdm as tqdm
from bs4 import BeautifulSoup

In [7]:
import os
import datetime

# where is data stored?
data_dir = '../data/'

# intermediates
tribune_file = os.path.join(data_dir, 'tribune.tsv')
sinclair_file = os.path.join(data_dir, 'sinclair.tsv')
nexstar_file = os.path.join(data_dir, 'nexstar.tsv')
meredith_file = os.path.join(data_dir, 'meredith.tsv')
hearst_file = os.path.join(data_dir, 'hearst.tsv')
stationindex_file = os.path.join(data_dir, 'station_index.tsv')
usnpl_file = os.path.join(data_dir, 'usnpl.tsv')

# this is where user entries go!
custom_station_file = os.path.join(data_dir, 'custom_additions.json')

# this is the output!
local_news_dataset_file  = os.path.join(data_dir, 'local_news_dataset_2018.csv') 

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

# variables
today = datetime.datetime.now()
version = 0



# for normalizing station info.
owner_mapping = {
    'Meredith Corporation' : 'Meredith',
    'Sinclair Broadcast Group' : 'Sinclair',
    'Nexstar Media Group' : 'Nexstar',
    'Hearst Television' : 'Hearst'
}

station_index_mapping = {
    'owner' : 'broadcaster'
}

national = [
    'comettv.com',
    'tbn.org',
    'iontelevision.com',
    'tct-net.org',
    'sbgi.net',
    'daystar.com',
]

look_up = {' Honolulu' : 'HI',
 ' Kalamazoo. MI' : 'MI',
' San Antonio' : 'TX'}

col_standard = {
    'station' : 'name',
    'twitter_name' : 'twitter',
    'geography' : 'state',
    'broadcaster' : 'owner'
}

cols_standard_nexstar = {
    'Web Site' : 'website',
    'Station' : 'station',
    'Affiliation' : 'network'
} 

cols_nexstar = ['station', 'website', 'city', 'state', 'broadcaster', 'source']

cols = ['name', 'state', 'website', 'twitter', 'youtube', 'facebook', 'owner', 'medium', 'source', 'collection_date']
cols_final = ['name', 'state', 'website', 'domain', 'twitter', 'youtube', 'facebook', 'owner', 'medium', 'source', 'collection_date']

# to align nexstar websites to station names
nexstar_alignment = {

    'krqe.com' : [
        'KRQE',
        'KBIM',
        'KREZ',
    ],

    'kwbq.com' : [
        'KWBQ',
        'KASY',
        'KRWB'
    ] ,

    'kark.com' : [
        'KARK',
        'KARZ'
    ],

    'fox16.com' : [
        'KLRT'
    ],

    'cwarkansas.com' : [
        'KASN '
    ],

    'woodtv.com' : [
        'WOOD',
    ],

    'wotv4women.com' : [
        'WOTV',
        'WXSP-CD'

    ],
    
    'wkbn.com' : [
        'WKBN'
    ],
    
    'wytv.com' : [
        'WYTV',
        'WYFX-LD'
    ]  
}

# for USNPL
states = '''ak	  al	  ar	  az	  ca	  co	  ct	  dc	  de	  fl	  ga	  hi	  ia	  id	  il	  in	  ks   ky	  la	  ma	  md	  me	  mi	  mn	  mo	  ms	  mt	  nc	  nd	  ne	  nh	  nj	  nm	  nv	  ny	  oh	  ok	  or	  pa	  ri	  sc	  sd	  tn	  tx	  ut	  va	  vt	  wa	  wi	  wv	  wy	'''
states = [s.strip() for s in states.split('  ')]

# for stationindex
city_state = {
    'New York' : 'NY',
    'Los Angeles' : 'CA',
    'Chicago' : 'IL',
    'Philadelphia' : 'PA',
    'Dallas' : 'TX',
    'Washington, D.C.' : 'DC',
    'Houston' : "TX",
    'Seattle' : 'WA',
    'South Florida' : 'FL',
    'Denver' : 'CO',
    'Cleveland': 'OH',
    'Sacramento' : 'CA',
    'San Diego' : 'CA',
    'St. Louis' : 'MO',
    'Portland' : 'OR',
    'Indianapolis' : 'IN',
    'Hartford' :'CT',
    'Kansas City' :'MO',
    'Salt Lake City' : 'UT',
    'Milwaukee' : 'WI',
    'Waterbury' : 'CT',
    'Grand Rapids' : 'MI',
    'Oklahoma City': 'OK',
    'Harrisburg' : 'VA',
    'Norfolk' : 'VA',
    'Greensboro/High Point/Winston-Salem' : 'NC',
    'Memphis' : 'TN',
    'New Orleans' : 'LA',
    'Wilkes-Barre/Scranton' : 'PA',
    'Richmond' : 'VA',
    'Des Moines' : 'IL',
    'Huntsville' : 'AL',
    'Moline, IL / Davenport, IA' : "IL/IA",
    'Fort Smith' : "AK",
    'America' : 'National'
}

not_actually_local = [
    'variety.com', 'investors.com', 'hollywoodreporter.com', 'bizjournals.com'
]

In [47]:
# Edited USNPL Function

def download_usnpl():
    '''
    usnpl has metadata about many newspapers in different states.
    '''
    
    sites = []
    
#    for state in states:
    for state in states:
        url = 'https://www.usnpl.com/search/state?state={}'.format(state)
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'lxml')
        
        main_table = soup.find('table', class_='table table-sm')
        
        if main_table:
            rows = main_table.find_all('tr')
            # Remove non-data rows
            rows = [row for row in rows if 'table-dark' not in row.get('class', [])]
            current_city = ""
            for row in rows:
                city_element = row.find('h4', class_='result_city')
                if city_element:
                    current_city = city_element.text.strip()
                    continue
                # Extract data From the row
                data_points = row.find_all('td')
                if len(data_points) >= 6:
                    newspaper_name = data_points[0].find('a').text.strip() if data_points[0].find('a') else ''
                    usnpl_page = data_points[0].find('a')['href'] if data_points[0].find('a') else ''
                    website = data_points[1].find('a')['href'] if data_points[1].find('a') else ''
                    twitter = data_points[2].find('a')['href'] if data_points[2].find('a') else ''
                    facebook = data_points[3].find('a')['href'] if data_points[3].find('a') else ''
                    instagram = data_points[4].find('a')['href'] if data_points[4].find('a') else ''
                    youtube = data_points[5].find('a')['href'] if data_points[5].find('a') else ''
                else:
                    continue

                # Extract Data From the Newspaper Page
                sub_url = f"https://www.usnpl.com/search/{usnpl_page}"
                r = requests.get(sub_url, headers=headers)
                sub_soup = BeautifulSoup(r.content, 'lxml')
                sub_table = sub_soup.find_all('tr')
                address_element = sub_table[1]
                address_parts = [part.strip() for part in address_element.stripped_strings]
                address = ' '.join(address_parts)
                editor = sub_soup.find('strong', text='Editor:').find_next_sibling(text=True).strip()
                phone = sub_soup.find('strong', text='Phone:').find_next_sibling(text=True).strip()

                # Parsed Object
                parsed_object = {
                    "State": state,
                    "City": current_city,
                    "Name": newspaper_name,
                    "Website": website,
                    "Twitter": twitter,
                    "Facebook": facebook,
                    "Instagram": instagram,
                    "Youtube": youtube,
                    "Address": address,
                    "Editor": editor,
                    "Phone": phone
                }
                
                # Add to the list
                sites.append(parsed_object)

    df = pd.DataFrame(sites)
    df['Website'] = df['Website'].str.rstrip('/')
    df['source'] = 'usnpl.com'
    df['collection_date'] = today
    
    if os.path.exists(usnpl_file):
        # appending to old
        df_ = pd.read_csv(usnpl_file, sep='\t')
        df = df[~df['Name'].isin(df_['Name'])]
        df = df_.append(df) 
    
    df.to_csv(usnpl_file, index=False, sep='\t')

    print(df)
    
# download_usnpl()

  editor = sub_soup.find('strong', text='Editor:').find_next_sibling(text=True).strip()
  phone = sub_soup.find('strong', text='Phone:').find_next_sibling(text=True).strip()


Bay Minette, AL
[<td colspan="6">
</td>]
Columbiana, AL
[<td colspan="6">
</td>]
Fayette, AL
[<td colspan="6">
</td>]
Gulf Shores, AL
[<td colspan="6">
</td>]
Lanett, AL
[<td colspan="6">
</td>]
Opp, AL
[<td colspan="6">
</td>]
Talladega, AL
[<td colspan="6">
</td>]
Clovis, NM
[<td colspan="6">
</td>]
Las Vegas, NM
[<td colspan="6">
</td>]
Socorro, NM
[<td colspan="6">
</td>]
    State                       City                     Name  \
0      AL            Albertville, AL   Sand Mountain Reporter   
1      AL         Alexander City, AL   Alexander City Outlook   
2      AL              Andalusia, AL      Andalusia Star-News   
3      AL               Anniston, AL            Anniston Star   
4      AL                   Arab, AL             Arab Tribune   
..    ...                        ...                      ...   
131    NM                Socorro, NM    El Defensor Chieftain   
132    NM               Timberon, NM  Timberon Mountain Times   
133    NM  Truth or Consequences, NM

In [35]:
def download_hearst():
    '''
    Downloads metadata about Hearst newspapers and broadcasting channels.

    The final DataFrame includes details such as website, name, address, phone, Twitter, Facebook, LinkedIn, 
    Instagram, station name (for broadcasting channels), broadcaster (set as "Hearst"), source (set as 
    "https://www.hearst.com/"), and collection date.

    Note: The function requires the requests, BeautifulSoup, and pandas libraries.

    Parameters:
    None

    Returns:
    None
    '''
    
    # Parse the broadcasting channels
    def parse_channel_html(channel_html):
        '''Parses bs4 html to create a dictionary (row in the dataset)'''
        website_tag = channel_html.find('a')

        # Sometime there are brand-cards that don't have any metadata attached
        if website_tag is not None:
            website = website_tag.get('href')
        else:
            return None
        
        # Extract station name from alt-text
        img_container = soup.find('div', class_='brand-logo-caption-with-text')
        image_element = img_container.find('img')
        station = alt_text = image_element['alt']

        context = dict(
            website = website,
            station = station,
            name = station,
            phone = "",
            address = "",
            twitter = "",
            facebook = "",
            linkedin = "",
            instagram = ""
        )

        return context
    
    
    # Parse the newspaper pages
    def parse_newspaper_html(newspaper_html):        
        '''Parses bs4 html to create a dictionary (row in the dataset)'''
        
        href = newspaper_html.find('a').get('href')
        sub_r = requests.get(f'https://www.hearst.com{href}', headers=headers)
        sub_soup = BeautifulSoup(sub_r.content, 'lxml')
        
        # Extract newspaper information
        data_section = sub_soup.find('section', id='content')
        name = data_section.find("h1").text.strip()
        
        main_column = data_section.find('div', id='layout-column_column-1')
        column_divs = main_column.find_all('div', recursive=False)

        contact_info = column_divs[2].find('div', class_="brand-contact-info")
        website = contact_info.find('p', class_="brand-address").find('a').get('href')
        
        address_info = column_divs[2].find('div', class_='address-container')
        address_list = [p.text.strip() for p in address_info.find_all('p')]
        phone = address_list[-1]
        address = ' '.join(address_list[:-1])
            
        social_info = column_divs[2].find('ul', class_="brand-icons")
        twitter = ''
        facebook = ''
        linkedin = ''
        instagram = ''
        for link in social_info.find_all('a'):
            img_alt = link.find('img')['alt']
            href = link['href']
            if 'twitter' in img_alt.lower():
                twitter = href
            elif 'facebook' in img_alt.lower():
                facebook = href
            elif 'linkedin' in img_alt.lower():
                linkedin = href
            elif 'instagram' in img_alt.lower():
                instagram = href
        
        column_divs[2]

        context = dict(
            website = website,
            name = name,
            address = address,
            phone = phone,
            twitter = twitter,
            facebook = facebook,
            linkedin = linkedin,
            instagram = instagram,
            station = ""
        )

        return context
    
    # -- -- -- -- -- -- -- -- -- -- -- -- --
    
    print("Downloading Hearst")
    broadcasting_url = "https://www.hearst.com/broadcasting"
    newspaper_url = "https://www.hearst.com/newspapers"
    
    # Get broadcasting data
    r = requests.get(broadcasting_url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    parent_div = soup.find('div', class_='brand-card')
    channels = parent_div.find_all('div', recursive=False)
    channel_metadata = []
    for channel in channels:
        channel_meta = parse_channel_html(channel)
        if channel_meta is not None:
            channel_metadata.append(channel_meta)
    
    # get newspaper data
    r = requests.get(newspaper_url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    parent_div = soup.find('div', class_='brand-card')
    newspapers = parent_div.find_all('div', recursive=False)
    newspaper_metadata = []
    for newspaper in newspapers:
        newspaper_meta = parse_newspaper_html(newspaper)
        newspaper_metadata.append(newspaper_meta)  
    
    broadcast_df = pd.DataFrame(channel_metadata)
    newspaper_df = pd.DataFrame(newspaper_metadata)
    
    df = pd.concat([broadcast_df, newspaper_df])
    
    df['broadcaster'] = 'Hearst'
    df['source'] = 'https://www.hearst.com/'
    df['collection_date'] = today
    
    if os.path.exists(hearst_file):
        # appending to old
        df_ = pd.read_csv(hearst_file, sep='\t')
        df = df[~df['station'].isin(df_['station'])]
        df = df_.append(df) 
    
    df.to_csv(hearst_file, index=False, sep='\t')
    
download_hearst()

Downloading Hearst

---address_list---

['380 Main Street', 'Beaumont, TX 77701', '(409) 833-3311']

---address_list---

['301 Merritt 7, Suite 1', 'Norwalk, CT 06851', '(203) 842-2500']

---address_list---

['117 North Second Street', 'Edwardsville, IL 62025', '(618) 656-4700']

---address_list---

['301 Merritt 7, Suite 1', 'Norwalk, CT 06851', '(203) 842-2500']

---address_list---

['4747 Southwest Freeway', 'Houston, TX 77027', '(713) 220-7171']

---address_list---

['211 North Heisterman Street', 'Bad Axe, MI 48413', '(989) 269-6461']

---address_list---

['235 W. State Street', 'Jacksonville, IL 62650', '(217) 245-6121']

---address_list---

['111 Esperanza Drive', 'Laredo, TX 78041', '(956) 728-2500']

---address_list---

['75 Maple Street', 'Manistee, MI 49660', '(231) 723-3592']

---address_list---

['301 Merritt 7, Suite 1', 'Norwalk, CT 06851', '(203) 842-2500']

---address_list---

['219 East Main Street', 'Midland, MI 48640', '(989) 835-7171']

---address_list---

['201 E.

NameError: name 'null' is not defined

In [44]:
# Get Gray TV companies https://gray.tv/companies 

def extract_gray():

    # Load the JSON data from a local file
    with open('./data/gray_tv_additions.json') as file:
        data = json.load(file)
    
    # Extract relevant fields from the JSON data
    columns = ['title', 'city', 'state', 'website']
    extracted_data = [{col: item[col] for col in columns} for item in data]

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(extracted_data)

    df['broadcaster'] = 'Gray TV'
    df['source'] = 'https://gray.tv/'
    df['collection_date'] = today
    
#     if os.path.exists(gray_file):
#         # appending to old
#         df_ = pd.read_csv(gray_file, sep='\t')
#         df = df[~df['title'].isin(df_['title'])]
#         df = df_.append(df) 

    # Print the resulting DataFrame
    print(df)

extract_gray()

[{'id': '25', 'title': 'KTVF', 'arcName': 'ktvf', 'lat': '64.811149880', 'lng': '-147.703297223', 'img': '', 'city': 'Fairbanks', 'state': 'AK', 'affiliate': 'NBC/ CBS/ MY', 'dmasize': '202', 'website': 'www.webcenter11.com', 'timezone': 'ALASKA', 'zip': '99701', 'logo': 'KTVF.png', 'owner': 'gray', 'popup': '1', 'bkgColor': 'light', 'hidden': None, 'arc_site_name': 'ktvf'}, {'id': '24', 'title': 'KTUU', 'arcName': 'ktuu', 'lat': '61.185217340', 'lng': '-149.873886888', 'img': '', 'city': 'Anchorage', 'state': 'AK', 'affiliate': 'NBC/ TRUECRIME', 'dmasize': '147', 'website': 'www.alaskasnewssource.com', 'timezone': 'ALASKA', 'zip': '99503', 'logo': 'KTUU.png', 'owner': 'gray', 'popup': '1', 'bkgColor': 'light', 'hidden': None, 'arc_site_name': 'ktuu'}, {'id': '113', 'title': 'KMOT', 'arcName': 'kfyr', 'lat': '48.215191900', 'lng': '-101.318528000', 'img': '', 'city': 'Minot', 'state': 'ND', 'affiliate': 'NBC/FOX/ME', 'dmasize': '146', 'website': 'www.kfyrtv.com', 'timezone': 'CENTRAL',

In [45]:
datetime.datetime.now()

datetime.datetime(2023, 5, 17, 10, 56, 6, 89876)