# Corporate quarterly earnings call analysis

# Data Scraping for NLP
- The code below scrapes earnings call transcripts from the Seeking Alpha website, where data on publicly listed companies is available.
- The selenium webdriver and BeautifulSoup libraries are used to scrape the data from the website.

## Libraries

In [1]:
# Standard Libraries
import pandas as pd
from datetime import datetime,time
import numpy as np
import matplotlib.pyplot as plt

# Scraping Libraries
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import re

# NLP Libraries
from textblob import TextBlob
import textstat

## Functions

In [2]:
def getSentimentScore(text):
    """Return the TextBlob sentiment polarity of ``text``.

    ``text`` is coerced with ``str()`` first, so non-string values
    (numbers, ``None``, NaN cells) are scored on their string form.
    """
    blob = TextBlob(str(text))
    return blob.sentiment.polarity
    
def getSubjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``.

    ``text`` is coerced with ``str()`` first, so non-string values
    (numbers, ``None``, NaN cells) are scored on their string form.
    """
    blob = TextBlob(str(text))
    return blob.sentiment.subjectivity

def getSentiments(score):
    """Map a numeric sentiment score to a label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero,
    and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

def open_browser(alt_user_name = 'Thank you for your website'):
    """Start a Chrome session configured for scraping and return the driver.

    Parameters
    ----------
    alt_user_name : str
        Value sent as the browser's ``user-agent`` string.

    Returns
    -------
    The ``webdriver.Chrome`` instance. The caller is responsible for
    shutting it down when finished.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # Chrome expects the window size as "width,height".
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument(f'user-agent={alt_user_name}')
    # The options must be handed to the driver; previously they were built
    # but never used, so none of the settings above took effect.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
    return driver

### As part of our project, we are targeting the top 50 companies of the S&P 500 index
### The list of the top 50 companies of the S&P 500 index is taken from: https://dailypik.com/top-50-companies-sp-500/

## CompanyName : Symbol

Apple Inc.:AAPL | Microsoft Corp.:MSFT | Amazon.com Inc.:AMZN | Facebook Inc. Class A:FB | Alphabet Inc. Class A:GOOGL | Alphabet Inc. Class C:GOOG | Tesla:TSLA | Berkshire Hathaway Inc. Class B:BRK.B | JPMorgan Chase & Co.:JPM | Johnson & Johnson:JNJ | NVIDIA Corp.:NVDA | Visa Inc. Class A:V | PayPal Holdings Inc.:PYPL | Walt Disney Company:DIS | Procter & Gamble Co.:PG | UnitedHealth Group Inc.:UNH | Home Depot Inc.:HD | MasterCard Inc. Class A:MA | Bank of America Corp.:BAC | Intel Corp.:INTC | Netflix Inc.:NFLX | Comcast Corp. Class A:CMCSA | Verizon Communications Inc.:VZ | Adobe Inc.:ADBE | salesforce.com Inc.:CRM | Abbott Laboratories:ABT | Exxon Mobil Corp.:XOM | AT&T Inc.:T | Walmart Inc.:WMT | Thermo Fisher Scientific Inc.:TMO | Cisco Systems Inc.:CSCO | Broadcom Inc.:AVGO | Pfizer Inc.:PFE | Coca-Cola Company:KO | Merck & Co. Inc.:MRK | AbbVie Inc.:ABBV | PepsiCo Inc.:PEP | Chevron Corp.:CVX | NIKE Inc. Class B:NKE | Accenture Plc Class A:ACN | Eli Lilly and Company:LLY | Texas Instruments Inc.:TXN | QUALCOMM Inc.:QCOM | Medtronic Plc:MDT | McDonald's Corp.:MCD | NextEra Energy Inc.:NEE | Costco Wholesale Corp.:COST | Danaher Corp.:DHR | Wells Fargo & Company:WFC | Union Pacific Corp.:UNP
### These companies are sorted by market capitalization

In [3]:
def _split_at_qa(p_elements, marker = 'Question-and-Answer Session'):
    """Split paragraphs into (prepared remarks, Q&A) at the first ``marker``.

    When the marker is absent, every paragraph is treated as prepared
    remarks and the Q&A part is empty.
    """
    try:
        item_num = p_elements.index(marker)
    except ValueError:
        return p_elements, []
    # The paragraph right before the marker is left out of the speech
    # (as in the original slicing); max() stops the slice from going
    # negative when the marker is the very first paragraph.
    return p_elements[:max(item_num - 1, 0)], p_elements[item_num:]


def scrap_transcripts(urls,browser):
    """Load each transcript URL and compute sentiment/readability metrics.

    Parameters
    ----------
    urls : iterable of str
        Transcript page URLs to open.
    browser :
        Selenium driver used to load each page (``get`` / ``page_source``).

    Returns
    -------
    list of dict
        One record per page that contained at least one ``<p>`` element,
        with keys ``qtr_year``, ``title``, ``ticker``, ``speech``,
        ``speech_sentiment_score``, ``speech_subjectivity``,
        ``speech_complexity``, ``speech_overall_sentiments``, ``Q_and_A``,
        ``QA_sentiment_score``, ``QA_subjectivity``, ``QA_complexity``,
        ``QA_overall_sentiments`` and ``URL``.
    """
    print('scraping transcripts')
    list_of_dicts   = []
    for url in urls:
        print('Opening url:',url)
        browser.get(url)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup                    = BeautifulSoup(browser.page_source, 'html.parser')
        p_elements              = [item.text for item in soup.find_all('p')]
        if not p_elements:
            # Nothing to analyse on this page; skip it instead of raising
            # IndexError on p_elements[0].
            print('no paragraphs found, skipping:',url)
            sleep(15)
            continue
        title                   = p_elements[0]
        print('scraping - ',title)
        sleep(15)  # NOTE(review): presumably throttles requests between pages - confirm

        # Find the separation between the main speech(es) and the Q&A section.
        # Computed fresh for every page so a transcript without a Q&A marker
        # never reuses the previous transcript's text.
        pre_QA_title, post_QA_title = _split_at_qa(p_elements)

        speech = ' '.join(pre_QA_title)
        QA     = ' '.join(post_QA_title)
        speech_score = getSentimentScore(speech)
        QA_score = getSentimentScore(QA)
        current_dict = {
            'qtr_year'                  : re.findall(r"Q\d{1,5}\s\d{4}",title),
            'title'                     : title,
            # Text between the first ':' and the last ')' of the title,
            # e.g. 'AAPL' from '(NASDAQ:AAPL)'.
            'ticker'                    : title[title.find(":")+len(":"):title.rfind(")")],
            'speech'                    : speech,
            'speech_sentiment_score'    : speech_score,
            'speech_subjectivity'       : getSubjectivity(speech),
            'speech_complexity'         : textstat.gunning_fog(speech),
            'speech_overall_sentiments' : getSentiments(speech_score),
            'Q_and_A'                   : QA,
            'QA_sentiment_score'        : QA_score,
            'QA_subjectivity'           : getSubjectivity(QA),
            'QA_complexity'             : textstat.gunning_fog(QA),
            'QA_overall_sentiments'     : getSentiments(QA_score),
            'URL'                       : url
        }
        list_of_dicts.append(current_dict)
    return list_of_dicts

In [5]:
# Scrape the transcript URLs for each ticker, then scrape every transcript.
from selenium.webdriver.common.by import By

#comp_list = ['AAPL','MSFT','AMZN','FB','GOOGL','GOOG','TSLA','BRK.B','JPM','JNJ','NVDA','PYPL','DIS','PG','UNH','HD','MA','BAC','INTC','NFLX','CMCSA','VZ','ADBE','CRM','ABT','XOM','T','WMT','TMO','CSCO','AVGO','PFE','KO','MRK','ABBV','PEP','CVX','NKE','ACN','LLY','TXN','QCOM','MDT','MCD','NEE','COST','DHR','WFC','UNP']
#done: 'AAPL','MSFT','AMZN','FB','GOOGL','GOOG','TSLA','BRK.B','JPM','JNJ','NVDA','PYPL','DIS','PG'
comp_list = ['UNH','HD','MA','BAC','INTC','NFLX','CMCSA','VZ','ADBE','CRM','ABT','XOM','T','WMT','TMO','CSCO','AVGO','PFE','KO','MRK','ABBV','PEP','CVX','NKE','ACN','LLY','TXN','QCOM','MDT','MCD','NEE','COST','DHR','WFC','UNP']
#b_url='https://seekingalpha.com/earnings/earnings-call-transcripts/'
base_url1 = 'https://seekingalpha.com/search?q='
base_url2 = '&type=keyword&tab=transcripts'

# Column order for the exported sheets. The last column is named 'URL' to
# match the key produced by scrap_transcripts (it used to be 'url_source',
# which left an always-empty column next to the real one).
columns = ['qtr_year','title','ticker', 'speech',
           'speech_sentiment_score','speech_subjectivity', 'speech_complexity', 'speech_overall_sentiments',
           'Q_and_A','QA_sentiment_score','QA_subjectivity','QA_complexity','QA_overall_sentiments','URL']

browser = open_browser()
main_df = pd.DataFrame()
frames = []  # one DataFrame per ticker, combined into main_df at the end
try:
    #get URLs to be scraped - Approx 10 URLs/page
    for sym in comp_list:
        # NOTE(review): this is a keyword search, so results can include other
        # companies (the captured output shows FBK transcripts returned for
        # 'FB'); consider filtering on the scraped ticker.
        current_ts_list = base_url1+sym+base_url2
        print('URL to get latest 10 transcripts of ',sym, ' : ' ,current_ts_list)
        browser.get(current_ts_list)
        sleep(15)
        urls = []
        for el in browser.find_elements(By.CLASS_NAME, 'item-link'):
            chld_element = el.find_elements(By.TAG_NAME, 'a')
            if chld_element:  # skip result items that carry no link
                urls.append(chld_element[0].get_attribute('href'))
        print('no of urls:', len(urls),' for :',sym)
        data = scrap_transcripts(urls,browser)

        #convert data to dataframe (DataFrame.append was removed in pandas 2.0)
        df = pd.DataFrame(data, columns = columns)
        file = 'G:\\Project\\Git_Repos\\NLP-Earning-call-analysis\\Data\\NLP_conf_call_data_'+sym+'.xlsx'
        df.to_excel(file)
        frames.append(df)
finally:
    # quit() ends the whole driver session, and runs even when the loop is
    # interrupted or raises.
    browser.quit()
    # Keep whatever was scraped so far rather than only the last ticker.
    if frames:
        main_df = pd.concat(frames, ignore_index = True)

#write to excel - export data
#path = f_n = 'G:\\Project\\Git_Repos\\NLP-Earning-call-analysis\\Data\\NLP_conf_call_data.xlsx'
#main_df.to_excel("../Data/NLP_conf_call_data.xlsx")

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389


 


[WDM] - Driver [C:\Users\Vaibhav\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


URL to get latest 10 transcripts of  AAPL  :  https://seekingalpha.com/search?q=AAPL&type=keyword&tab=transcripts
no of urls: 10  for : AAPL
scraping transcripts
Opening url: https://seekingalpha.com/article/4401478-apple-inc-aapl-ceo-tim-cook-on-q1-2021-results-earnings-call-transcript
scraping -  Apple Inc. (NASDAQ:AAPL) Q1 2021 Results Conference Call January 27, 2021  5:00 PM ET
Opening url: https://seekingalpha.com/article/4382943-apple-inc-aapl-ceo-tim-cook-on-q4-2020-results-earnings-call-transcript
scraping -  Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conference Call October 29, 2020  5:00 PM ET
Opening url: https://seekingalpha.com/article/4362707-apple-inc-aapl-ceo-tim-cook-on-q3-2020-results-earnings-call-transcript
scraping -  Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Conference Call July 30, 2020  5:00 PM ET
Opening url: https://seekingalpha.com/article/4341792-apple-inc-aapl-ceo-tim-cook-on-q2-2020-results-earnings-call-transcript
scraping -  Apple Inc. (NASDAQ:AAPL) Q2 20

scraping -  FB Financial Corporation (NYSE:FBK) Q4 2020 Earnings Conference Call January 26, 2021  9:00 AM ET
Opening url: https://seekingalpha.com/article/4382959-facebooks-fb-ceo-mark-zuckerberg-on-q3-2020-results-earnings-call-transcript
scraping -  Facebook, Inc. (NASDAQ:FB) Q3 2020 Earnings Conference Call October 29, 2020  6:00 PM ET
Opening url: https://seekingalpha.com/article/4362679-facebook-inc-fb-ceo-mark-zuckerberg-on-q2-2020-results-earnings-call-transcript
scraping -  Facebook, Inc. (NASDAQ:FB) Q2 2020 Results Conference Call July 30, 2020  6:00 PM ET
Opening url: https://seekingalpha.com/article/4381880-fb-financial-corporation-fbk-ceo-chris-holmes-on-q3-2020-results-earnings-call-transcript
scraping -  FB Financial Corporation (NYSE:FBK) Q3 2020 Results Earnings Conference Call October 27, 2020  9:00 AM ET
Opening url: https://seekingalpha.com/article/4341309-facebook-inc-fb-ceo-mark-zuckerberg-on-q1-2020-results-earnings-call-transcript
scraping -  Facebook, Inc. (NAS

scraping -  Tesla, Inc. (NASDAQ:TSLA) Q3 2019 Earnings Conference Call October 23, 2019  6:30 PM ET
Opening url: https://seekingalpha.com/article/4193497-tesla-tsla-q2-2018-results-earnings-call-transcript
scraping -  Tesla, Inc. (NASDAQ:TSLA) Q2 2018 Earnings Call August  1, 2018  5:30 PM ET
Opening url: https://seekingalpha.com/article/4277307-tesla-inc-tsla-ceo-elon-musk-on-q2-2019-results-earnings-call-transcript
scraping -  Tesla, Inc. (NASDAQ:TSLA) Q2 2019 Earnings Conference Call July 24, 2019  6:30 PM ET
Opening url: https://seekingalpha.com/article/4169027-tesla-tsla-q1-2018-results-earnings-call-transcript
scraping -  Tesla, Inc. (NASDAQ:TSLA) Q1 2018 Earnings Call May  2, 2018  5:30 PM ET
Opening url: https://seekingalpha.com/article/4144365-tesla-tsla-q4-2017-results-earnings-call-transcript
scraping -  Tesla, Inc. (NASDAQ:TSLA) Q4 2017 Earnings Call February  7, 2018  5:30 PM ET
URL to get latest 10 transcripts of  BRK.B  :  https://seekingalpha.com/search?q=BRK.B&type=key

URL to get latest 10 transcripts of  PYPL  :  https://seekingalpha.com/search?q=PYPL&type=keyword&tab=transcripts
no of urls: 10  for : PYPL
scraping transcripts
Opening url: https://seekingalpha.com/article/4412635-paypal-holdings-inc-pypl-ceo-dan-schulman-presents-wolfe-research-virtual-fintech-forum
scraping -  PayPal Holdings, Inc. (NASDAQ:PYPL) Wolfe Research Virtual Fintech Forum March  9, 2021 12:00 PM ET
Opening url: https://seekingalpha.com/article/4410853-paypal-holdings-inc-s-pypl-management-presents-morgan-stanley-technology-media-and-telecom
scraping -  PayPal Holdings, Inc. (NASDAQ:PYPL) Morgan Stanley Technology, Media and Telecom Conference March  2, 2021 11:45 AM ET
Opening url: https://seekingalpha.com/article/4403221-paypal-holdings-inc-pypl-ceo-dan-schulman-on-q4-2020-results-earnings-call-transcript
scraping -  PayPal Holdings, Inc. (NASDAQ:PYPL) Q4 2020 Results Conference Call February  3, 2021  5:00 PM ET
Opening url: https://seekingalpha.com/article/4384306-payp

KeyboardInterrupt: 

In [4]:
import re,os
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime,date,timedelta

In [59]:
# Read the transcript workbook from its hard-coded local path into df.
# NOTE(review): presumably this is the per-ticker exports merged into one
# file - confirm how NLP_conf_call_data.xlsx is produced.
df = pd.read_excel("G:\\Project\\Git_Repos\\NLP-Earning-call-analysis\\Data\\NLP_conf_call_data.xlsx")

In [60]:
#Extract date from title
# Matches e.g. "January 27, 2021" or "Aug  1, 2018" (full or abbreviated month).
pattern = r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},\s+\d{4}'


def _parse_release_date(title):
    """Return the first 'Month D, YYYY' date found in ``title`` as 'YYYY-MM-DD'.

    Raises ValueError, naming the offending title, when no date is present
    or the matched text cannot be parsed.
    """
    match = re.search(pattern, str(title))
    if match is None:
        raise ValueError(f'no release date found in title: {title!r}')
    # The pattern accepts both full and abbreviated month names, so try both
    # strptime directives instead of only %B.
    for fmt in ('%B %d, %Y', '%b %d, %Y'):
        try:
            return datetime.strptime(match.group(0), fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    raise ValueError(f'unparseable release date {match.group(0)!r} in title: {title!r}')


# Work on a copy so df stays untouched and this cell can be re-run
# (insert() raises if the column already exists on the same object).
df1 = df.copy()
release_dt = [_parse_release_date(title) for title in df1['title']]
df1.insert(2,'release_dt',release_dt)

In [61]:
# Daily close-to-close returns (in %) around each release date:
#   prev_day_ret%    - the trading day before the release
#   release_day_ret% - the release day itself
#   next_day_ret%    - the trading day after the release
# This lets us check for big moves before or after the earnings call.
# Rows where a return cannot be computed (no price data, release date not a
# trading day, missing neighbouring day) get NaN.

# Calendar days of history fetched on each side of the release date; wide
# enough that weekends and holidays still leave two trading days before and
# one after.
LOOKBACK_DAYS = 10
LOOKAHEAD_DAYS = 10


def _pct_return(closes, from_pos, to_pos):
    """Percent change between two positions of ``closes``, or NaN if out of range."""
    if from_pos < 0 or to_pos >= len(closes):
        return np.nan
    base = closes.iloc[from_pos]
    return round(100*((closes.iloc[to_pos]-base)/base),2)


prev_day_ret=[]
release_day_ret=[]
next_day_ret = []
for ticker, release in zip(df1['ticker'], df1['release_dt']):
    # Reset for every row so a miss never reuses the previous row's values.
    ret1 = ret2 = ret3 = np.nan
    release_day = datetime.strptime(release,'%Y-%m-%d')
    start_dt = (release_day-timedelta(LOOKBACK_DAYS)).strftime('%Y-%m-%d')
    end_dt = (release_day+timedelta(LOOKAHEAD_DAYS)).strftime('%Y-%m-%d')
    # Yahoo Finance writes share classes with a dash (BRK-B, not BRK.B).
    stock_data = yf.Ticker(str(ticker).replace('.','-'))
    stock_data_yf = stock_data.history(start = start_dt,end = end_dt)
    if stock_data_yf.empty:
        print(ticker,'--',start_dt,'--',end_dt)
    else:
        closes = stock_data_yf['Close']
        # Format the index directly; this also works for timezone-aware indexes.
        trading_days = list(stock_data_yf.index.strftime('%Y-%m-%d'))
        if release in trading_days:
            j = trading_days.index(release)
            ret1 = _pct_return(closes, j-2, j-1)
            ret2 = _pct_return(closes, j-1, j)
            ret3 = _pct_return(closes, j, j+1)
    prev_day_ret.append(ret1)
    release_day_ret.append(ret2)
    next_day_ret.append(ret3)
df1.insert(5,'prev_day_ret%',prev_day_ret)
df1.insert(6,'release_day_ret%',release_day_ret)
df1.insert(7,'next_day_ret%',next_day_ret)

In [65]:
# Write df1 to updated.xlsx at a hard-coded local path.
df1.to_excel("G:\\Project\\Git_Repos\\NLP-Earning-call-analysis\\Data\\updated.xlsx")