In [1]:
import re
import requests
import time

from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
import pandas as pd
from tqdm import tqdm as tqdm
from bs4 import BeautifulSoup

In [34]:
import os
import datetime

# where is data stored?
data_dir = '../data/'

# intermediates
tribune_file = os.path.join(data_dir, 'tribune.tsv')
sinclair_file = os.path.join(data_dir, 'sinclair.tsv')
nexstar_file = os.path.join(data_dir, 'nexstar.tsv')
meredith_file = os.path.join(data_dir, 'meredith.tsv')
hearst_file = os.path.join(data_dir, 'hearst.tsv')
stationindex_file = os.path.join(data_dir, 'station_index.tsv')
usnpl_file = os.path.join(data_dir, 'usnpl.tsv')

# this is where user entries go!
custom_station_file = os.path.join(data_dir, 'custom_additions.json')

# this is the output!
local_news_dataset_file  = os.path.join(data_dir, 'local_news_dataset_2018.csv') 

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

# variables
today = datetime.datetime.now()
version = 0



# for normalizing station info.
owner_mapping = {
    'Meredith Corporation' : 'Meredith',
    'Sinclair Broadcast Group' : 'Sinclair',
    'Nexstar Media Group' : 'Nexstar',
    'Hearst Television' : 'Hearst'
}

station_index_mapping = {
    'owner' : 'broadcaster'
}

national = [
    'comettv.com',
    'tbn.org',
    'iontelevision.com',
    'tct-net.org',
    'sbgi.net',
    'daystar.com',
]

look_up = {' Honolulu' : 'HI',
 ' Kalamazoo. MI' : 'MI',
' San Antonio' : 'TX'}

col_standard = {
    'station' : 'name',
    'twitter_name' : 'twitter',
    'geography' : 'state',
    'broadcaster' : 'owner'
}

cols_standard_nexstar = {
    'Web Site' : 'website',
    'Station' : 'station',
    'Affiliation' : 'network'
} 

cols_nexstar = ['station', 'website', 'city', 'state', 'broadcaster', 'source']

cols = ['name', 'state', 'website', 'twitter', 'youtube', 'facebook', 'owner', 'medium', 'source', 'collection_date']
cols_final = ['name', 'state', 'website', 'domain', 'twitter', 'youtube', 'facebook', 'owner', 'medium', 'source', 'collection_date']

# to align nexstar websites to station names
nexstar_alignment = {

    'krqe.com' : [
        'KRQE',
        'KBIM',
        'KREZ',
    ],

    'kwbq.com' : [
        'KWBQ',
        'KASY',
        'KRWB'
    ] ,

    'kark.com' : [
        'KARK',
        'KARZ'
    ],

    'fox16.com' : [
        'KLRT'
    ],

    'cwarkansas.com' : [
        'KASN '
    ],

    'woodtv.com' : [
        'WOOD',
    ],

    'wotv4women.com' : [
        'WOTV',
        'WXSP-CD'

    ],
    
    'wkbn.com' : [
        'WKBN'
    ],
    
    'wytv.com' : [
        'WYTV',
        'WYFX-LD'
    ]  
}

# for USNPL
states = '''ak	  al	  ar	  az	  ca	  co	  ct	  dc	  de	  fl	  ga	  hi	  ia	  id	  il	  in	  ks   ky	  la	  ma	  md	  me	  mi	  mn	  mo	  ms	  mt	  nc	  nd	  ne	  nh	  nj	  nm	  nv	  ny	  oh	  ok	  or	  pa	  ri	  sc	  sd	  tn	  tx	  ut	  va	  vt	  wa	  wi	  wv	  wy	'''
states = [s.strip() for s in states.split('  ')]

# for stationindex
city_state = {
    'New York' : 'NY',
    'Los Angeles' : 'CA',
    'Chicago' : 'IL',
    'Philadelphia' : 'PA',
    'Dallas' : 'TX',
    'Washington, D.C.' : 'DC',
    'Houston' : "TX",
    'Seattle' : 'WA',
    'South Florida' : 'FL',
    'Denver' : 'CO',
    'Cleveland': 'OH',
    'Sacramento' : 'CA',
    'San Diego' : 'CA',
    'St. Louis' : 'MO',
    'Portland' : 'OR',
    'Indianapolis' : 'IN',
    'Hartford' :'CT',
    'Kansas City' :'MO',
    'Salt Lake City' : 'UT',
    'Milwaukee' : 'WI',
    'Waterbury' : 'CT',
    'Grand Rapids' : 'MI',
    'Oklahoma City': 'OK',
    'Harrisburg' : 'VA',
    'Norfolk' : 'VA',
    'Greensboro/High Point/Winston-Salem' : 'NC',
    'Memphis' : 'TN',
    'New Orleans' : 'LA',
    'Wilkes-Barre/Scranton' : 'PA',
    'Richmond' : 'VA',
    'Des Moines' : 'IL',
    'Huntsville' : 'AL',
    'Moline, IL / Davenport, IA' : "IL/IA",
    'Fort Smith' : "AK",
    'America' : 'National'
}

not_actually_local = [
    'variety.com', 'investors.com', 'hollywoodreporter.com', 'bizjournals.com'
]

In [47]:
test_states = ["AL", "NM"]

def download_usnpl():
    '''
    usnpl has metadata about many newspapers in different states.
    '''
    
    sites = []
    
#    for state in states:
    for state in test_states:
        url = 'https://www.usnpl.com/search/state?state={}'.format(state)
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'lxml')
        
        main_table = soup.find('table', class_='table table-sm')
        
        if main_table:
            rows = main_table.find_all('tr')
            # Remove non-data rows
            rows = [row for row in rows if 'table-dark' not in row.get('class', [])]
            current_city = ""
            for row in rows:
                city_element = row.find('h4', class_='result_city')
                if city_element:
                    current_city = city_element.text.strip()
                    continue
                # Extract data From the row
                data_points = row.find_all('td')
                if len(data_points) >= 6:
                    newspaper_name = data_points[0].find('a').text.strip() if data_points[0].find('a') else ''
                    usnpl_page = data_points[0].find('a')['href'] if data_points[0].find('a') else ''
                    website = data_points[1].find('a')['href'] if data_points[1].find('a') else ''
                    twitter = data_points[2].find('a')['href'] if data_points[2].find('a') else ''
                    facebook = data_points[3].find('a')['href'] if data_points[3].find('a') else ''
                    instagram = data_points[4].find('a')['href'] if data_points[4].find('a') else ''
                    youtube = data_points[5].find('a')['href'] if data_points[5].find('a') else ''
                else:
                    print(current_city)
                    print(data_points)
                    continue

                # Extract Data From the Newspaper Page
                sub_url = f"https://www.usnpl.com/search/{usnpl_page}"
                r = requests.get(sub_url, headers=headers)
                sub_soup = BeautifulSoup(r.content, 'lxml')
                sub_table = sub_soup.find_all('tr')
                address_element = sub_table[1]
                address_parts = [part.strip() for part in address_element.stripped_strings]
                address = ' '.join(address_parts)
                editor = sub_soup.find('strong', text='Editor:').find_next_sibling(text=True).strip()
                phone = sub_soup.find('strong', text='Phone:').find_next_sibling(text=True).strip()

                # Parsed Object
                parsed_object = {
                    "State": state,
                    "City": current_city,
                    "Name": newspaper_name,
                    "Website": website,
                    "Twitter": twitter,
                    "Facebook": facebook,
                    "Instagram": instagram,
                    "Youtube": youtube,
                    "Address": address,
                    "Editor": editor,
                    "Phone": phone
                }
                
                # Add to the list
                sites.append(parsed_object)

    df = pd.DataFrame(sites)
    df['Website'] = df['Website'].str.rstrip('/')
    df['source'] = 'usnpl.com'
    df['collection_date'] = today
    
#     if os.path.exists(usnpl_file):
#         # appending to old
#         df_ = pd.read_csv(usnpl_file, sep='\t')
#         df = df[~df['Name'].isin(df_['Name'])]
#         df = df_.append(df) 
    
#     df.to_csv(usnpl_file, index=False, sep='\t')

    print(df)
    
download_usnpl()

  editor = sub_soup.find('strong', text='Editor:').find_next_sibling(text=True).strip()
  phone = sub_soup.find('strong', text='Phone:').find_next_sibling(text=True).strip()


Bay Minette, AL
[<td colspan="6">
</td>]
Columbiana, AL
[<td colspan="6">
</td>]
Fayette, AL
[<td colspan="6">
</td>]
Gulf Shores, AL
[<td colspan="6">
</td>]
Lanett, AL
[<td colspan="6">
</td>]
Opp, AL
[<td colspan="6">
</td>]
Talladega, AL
[<td colspan="6">
</td>]
Clovis, NM
[<td colspan="6">
</td>]
Las Vegas, NM
[<td colspan="6">
</td>]
Socorro, NM
[<td colspan="6">
</td>]
    State                       City                     Name  \
0      AL            Albertville, AL   Sand Mountain Reporter   
1      AL         Alexander City, AL   Alexander City Outlook   
2      AL              Andalusia, AL      Andalusia Star-News   
3      AL               Anniston, AL            Anniston Star   
4      AL                   Arab, AL             Arab Tribune   
..    ...                        ...                      ...   
131    NM                Socorro, NM    El Defensor Chieftain   
132    NM               Timberon, NM  Timberon Mountain Times   
133    NM  Truth or Consequences, NM