In [None]:
import os
import time
from io import StringIO

import nest_asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession, AsyncHTMLSession
nest_asyncio.apply()

# 1. List of S&P 100 companies

In [None]:
# Download the Wikipedia article that lists the S&P 100 constituents.
wikiurl = "https://en.wikipedia.org/wiki/S%26P_100"
# Not referenced by the visible cells; kept in case another cell relies on it.
table_class = "wikitable sortable jquery-tablesorter"
# requests applies no timeout by default, so without one a stalled connection
# would hang the cell indefinitely.
response = requests.get(wikiurl, timeout=30)
# Stop here on a 4xx/5xx answer instead of handing an error page to the parser.
response.raise_for_status()
response.status_code

In [None]:
# Parse the downloaded article, locate the constituents table and save it.
soup = BeautifulSoup(response.text, 'html.parser')
sandp100 = soup.find_all('table')
# Recent pandas versions deprecate passing literal HTML to read_html; wrapping
# the markup in StringIO is the supported way to parse an in-memory document.
tables = pd.read_html(StringIO(str(sandp100)))
# Pick the table by content rather than by position: the later cells only need
# the 'Symbol' column, and a positional index breaks whenever the article gains
# or loses a table.
df = next((table for table in tables if 'Symbol' in table.columns), None)
if df is None:
    raise ValueError("No table with a 'Symbol' column found at {}".format(wikiurl))
df.to_csv('sandp100.csv', index=False)
df.head()

# 2. Get links to all transcripts for each S&P 100 company
## Use Seeking Alpha's transcript listing page to collect the links to every transcript for each company, then save them to CSV.

In [None]:
# Download the transcript listing page of every company in `df` and store the
# raw HTML as html/post_urls/<SYMBOL>.html for the parsing cell that follows.
session = HTMLSession()
list_of_post_urls = []  # not used in this cell; kept in case other cells expect it
listing_dir = 'html/post_urls/'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(listing_dir, exist_ok=True)
for i, row in df.iterrows():
    print(i, row['Symbol'])
    url = 'https://seekingalpha.com/symbol/{}/earnings/transcripts'.format(row['Symbol'])
    r = session.get(url, timeout=30)
    if r.ok:
        fn = 'html/post_urls/{}.html'.format(row['Symbol'])
        with open(fn, 'wb') as f:
            f.write(r.content)
    else:
        # Do not store an error page under the symbol's name: it would look
        # like a valid listing that simply contains no transcripts.
        print('  skipped, HTTP status', r.status_code)
    # Pause between requests to avoid hammering the site.
    time.sleep(5)

In [None]:
# Build `d`, a mapping  title -> [absolute post URL, raw date text, symbol],
# from every saved listing page in html/post_urls/.
path = 'html/post_urls/'
base_url = 'https://seekingalpha.com'
# Both spellings are matched because the original filter looked for both.
transcript_markers = ('Earnings Call Transcript', 'Earning Call Transcript')
post_html_files = os.listdir(path)
d = {}
for fn in post_html_files:
    if not fn.endswith('.html'):
        continue
    # Strip only the extension so symbols that contain dots stay intact.
    symbol = fn[:-len('.html')]
    # The pages were written as raw bytes; reading them back as bytes lets
    # BeautifulSoup detect the encoding instead of relying on the OS default.
    with open(path + fn, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')
    posts = soup.find_all('article')
    for p in posts:
        link = p.find('a')
        date_tag = p.find('span')
        # Skip articles that lack a link or a date instead of crashing on None.
        if link is None or date_tag is None or not link.get('href'):
            continue
        title = link.text
        if any(x in title for x in transcript_markers):
            # Drop the query string so the stored URL is the bare post address.
            abs_url = base_url + link['href'].split('?')[0]
            dt = date_tag.text
            d[title] = [abs_url, dt, symbol]

In [None]:
# Turn the `d` mapping (title -> [post_url, date text, symbol]) into a tidy
# DataFrame with a parsed 'date' column.
# Passing the column names up front keeps the frame well-formed even when `d`
# is empty (no listing page yielded a transcript).
post_urls = pd.DataFrame(
    [[title, *fields] for title, fields in d.items()],
    columns=['title', 'post_url', 'date', 'symbol'],
)
date_text = (
    post_urls['date']
    .str[5:]  # drops the first five characters - presumably a weekday prefix such as "Wed, "; TODO confirm
    # regex=False makes '.' a literal dot regardless of the pandas version's
    # default for `regex`.
    .str.replace('.', '', regex=False)
    .str.replace(',', '', regex=False)
)
post_urls['date'] = pd.to_datetime(date_text, format='%b %d %Y')
post_urls.head()

In [None]:
# Write the transcript index to disk (without the row index); the next section
# reloads it from this file.
post_urls.to_csv('post_urls.csv', index=False)

# 3. Filter for transcripts since 2021 

In [None]:
# Reload the saved transcript index; parse_dates turns the 'date' column back
# into datetimes so it can be filtered by year below.
post_urls = pd.read_csv('post_urls.csv', parse_dates=['date'])

In [None]:
# First calendar year (inclusive) of transcripts to keep; change to widen or
# narrow the scope.
START_YEAR = 2021
# .copy() makes the subset an independent frame, so modifying it later never
# triggers pandas' SettingWithCopyWarning or touches `post_urls`.
posts_in_scope = post_urls.loc[post_urls['date'].dt.year >= START_YEAR].copy()
posts_in_scope.head()

In [None]:
# Number of transcript pages selected for download in the next section.
len(posts_in_scope)

# 4. Asynchronously query each transcript page for every company (~1100 pages)

In [None]:
def _post_filename(symbol, date, title):
    """Return the file name under which a transcript page is stored.

    The name has the form ``<symbol>_<YYYY-MM-DD>_<title>_.html``. Path
    separators in the title are replaced with '-' because they would otherwise
    be interpreted as directories by ``open``.
    """
    safe_title = title.replace('/', '-').replace('\\', '-')
    return '_'.join([symbol, date.strftime('%Y-%m-%d'), safe_title, '.html'])


async def get_html(title, post_url, symbol, date):
    """Fetch one transcript page and tag the response with its metadata.

    Uses the module-level ``asession`` (an ``AsyncHTMLSession``) created below.

    Args:
        title: Post title; stored on the response as ``r.title``.
        post_url: Absolute URL of the transcript page to request.
        symbol: Ticker symbol; stored as ``r.symbol``.
        date: Publication date; stored as ``r.date``.

    Returns:
        The response object with ``title``, ``symbol`` and ``date`` attributes
        attached, so the caller can name the output file without a lookup.
    """
    r = await asession.get(post_url)
    r.title = title
    r.symbol = symbol
    r.date = date
    return r


fp = 'html/posts/'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(fp, exist_ok=True)
# One session for the whole run: creating a fresh AsyncHTMLSession per symbol
# allocated a new session and thread pool each time without ever releasing them.
asession = AsyncHTMLSession()
for sym in posts_in_scope['symbol'].unique():
    print(sym)
    masked = posts_in_scope[posts_in_scope['symbol'] == sym]
    urls = masked.to_dict(orient='records')
    # AsyncHTMLSession.run expects zero-argument callables that return
    # coroutines; the default argument binds each record at definition time.
    result = asession.run(*[lambda rec=rec: get_html(rec['title'],
                                                     rec['post_url'],
                                                     rec['symbol'],
                                                     rec['date']) for rec in urls])
    for r in result:
        fn = _post_filename(r.symbol, r.date, r.title)
        with open(fp + fn, 'wb') as f:
            f.write(r.content)
    # Pause between companies to avoid hammering the site.
    time.sleep(5)

In [None]:
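# Sequential (non-async) alternative to the cell above, kept for reference:
# it fetches one page at a time and skips pages that are already on disk.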
# session = HTMLSession()
# for i, row in posts_in_scope.iterrows():
#     print(row['title'])
#     fn = row['title']+'.html'
#     fp = 'html/posts/'
#     if fn not in os.listdir(fp):
#         r = session.get(row['post_url'])
#         with open(fp+fn, 'wb') as f:
#             f.write(r.content)
#         time.sleep(5)

# 5. Parse out transcript text and save to text files

In [None]:
# Extract the transcript text from every saved post page in html/posts/ and
# write it to transcripts/<name>.txt.
path = 'html/posts/'
# open() does not create missing directories, so make sure the target exists.
os.makedirs('transcripts/', exist_ok=True)
post_files = os.listdir(path)
d = {}  # not used in this cell; kept in case later cells expect a fresh dict
for fn in post_files:
    if not fn.endswith('.html'):
        continue
    # The pages were written as raw bytes; reading them back as bytes lets
    # BeautifulSoup detect the encoding instead of relying on the OS default.
    with open(path + fn, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')
    divs = soup.find_all('div', class_='hp_h6')
    if not divs:
        # A page without the transcript container would otherwise abort the
        # whole loop with an IndexError; report it and move on.
        print('no transcript text found, skipping:', fn)
        continue
    text = divs[0].text
    # Files saved by the download cell end in '_.html'; strip the extension and
    # then the trailing underscore, so names without that underscore are not
    # truncated by a fixed-width slice.
    stem = fn[:-len('.html')]
    if stem.endswith('_'):
        stem = stem[:-1]
    # Explicit UTF-8 so non-ASCII characters in transcripts never fail on
    # platforms whose default encoding is narrower.
    with open('transcripts/{}.txt'.format(stem), 'w', encoding='utf-8') as f:
        f.write(text)