# ECONOMIC LINKAGES: A Text-Based Approach (in development)
### Create unique company peer-groups using topic modelling and clustering algorithms on text from quarterly earnings calls
- Input data from company conference calls (fool.com) and long business descriptions (IEX)
- Technical hat-tip to [machinelearningplus.com](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/) for providing comprehensive instruction for python's gensim library

### Imports

In [1]:
import pandas as pd
import numpy as np

from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim import similarities
from gensim.parsing.porter import PorterStemmer

from utils_s3 import get_etf_holdings, list_keys

import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore')

### Intiailize Raw Conference Call Data

#### Read Raw Input Data

In [64]:
# get list of quarterly calls (events) generated from download_transcripts.py
calls_raw = pd.read_csv('./extracts/foolcalls_extract_20200814.csv')

#### Filter Data
Only keep the _**presentation section**_ from calls from _**2019**_ for the _**largest 1000 companies**_ of the Russell 3000 (i.e. R1000)

In [65]:
# only calls from 2019
calls_2019 = calls_raw.loc[calls_raw['fiscal_period_year'] == 2019, ]

# remove statement_types that are either unknown or operator (i.e. keeping P, Q, A)
calls_2019_P = calls_2019.loc[calls_raw['statement_type'].isin(['P'])]

# only analyze 500 largest stocks from the R3000
russell_1000 = get_etf_holdings('IWV', '2020-07-31').sort_values('weight',ascending=False).head(1000)
calls_2019_P_R1000 = calls_2019_P.merge(russell_1000,on='ticker',how='inner')

#### Clean & Structure

In [68]:
# remove rows with missing text (nan)
calls_2019_P_R1000 = calls_2019_P_R1000.dropna(subset=['text'])

# join text by ticker (i.e. combine individual statements from the same call and combine calls from quarters/years)
calls = calls_2019_P_R1000.loc[:, ['ticker','text']].groupby(['ticker'])['text'].apply(lambda x: ''.join(x)).reset_index()

# assign one company name per ticker because they aren't consistent (e.g. Apple Inc vs Apple, Inc)
ticker_info = calls_2019_P_R1000[['ticker','company_name','weight','sector']].groupby('ticker').head(1)

# merge back to final data frame
calls = calls.merge(ticker_info,on='ticker',how='inner')
calls['ticker_name'] = calls['ticker'] + ': ' + calls['company_name']

## Pre-Processing

In [69]:
# tokenize and remove punctuation
calls['text'] = calls['text'].apply(lambda x: simple_preprocess(x, min_len=2, max_len=15, deacc=True))

# Build the bigram model
bigram = models.Phrases(calls['text'], min_count=5, threshold=50) # higher threshold fewer phrases.
bigram_model = models.phrases.Phraser(bigram)
calls['text'] = calls['text'].apply(lambda x: bigram_model[x])

# remove stopwords
calls['text'] = calls['text'].apply(lambda x: [remove_stopwords(w) for w in x])

# stemming (porter)
p = PorterStemmer()
calls['text'] = calls['text'].apply(lambda x: p.stem_documents(x))

# create dictionary object
dictionary = corpora.Dictionary(calls['text'])

# filter extremes
dictionary.filter_extremes(no_below=5, no_above=0.20)

# bag-of-words transformation
corpus = [dictionary.doc2bow(text) for text in calls['text']]

# tfidf transformation
tfidf = models.TfidfModel(corpus)  # fit model
corpus_tfidf = tfidf[corpus]  # apply model


## Methodology 1: LSI Model Transformation

In [70]:
# lsi transformation
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)  # initialize an LSI transformation
corpus_lsi = lsi_model[corpus_tfidf]  # apply model
# for t in lsi_model.show_topics(): # print topics
#    print(t)

### Document Similarities (LSI)

In [71]:
# cosine similarity matrix with lsi
index_lsi = similarities.MatrixSimilarity(corpus_lsi)
sims_lsi = index_lsi[corpus_lsi]

##### Construct & Format Outputs for Analysis

In [92]:
# get top 10 peers for each ticker
# n = len(sims_lsi)
# for i, s in enumerate(sims_lsi):
#     print(calls['ticker_infoticker_name'].iloc[i])
#     print(calls['ticker_name'].iloc[s.argsort()[::-1][:n][1:10]])



# get top peers for each ticker
peers_lsi = pd.DataFrame()
for i, s in enumerate(sims_lsi):
    peers_idx = np.where(s > 0.50)[0].tolist()
    this_df = pd.DataFrame({'company': [calls['ticker_name'].iloc[i]]*len(peers_idx),
                            'weight': [calls['weight'].iloc[i]]*len(peers_idx),
                            'sector': [calls['sector'].iloc[i]]*len(peers_idx),
                            'peer': calls['ticker_name'].iloc[peers_idx],
                            'value': s[peers_idx]})
    peers_lsi = pd.concat([peers_lsi, this_df])

#### Explore Company-Level Peers (LSI)

In [94]:
peers_lsi = peers_lsi.dropna()
peers_lsi.loc[peers_lsi.company.str.contains('MSFT'),].sort_values('value',ascending=False)[1:]

Unnamed: 0,company,weight,sector,peer,value
279,MSFT: Microsoft,0.0475,Information Technology,EPAM: EPAM Systems Inc,0.694204
214,MSFT: Microsoft,0.0475,Information Technology,CTSH: Cognizant Technology Solutions Corp,0.692845
150,MSFT: Microsoft,0.0475,Information Technology,CDW: CDW,0.682461
301,MSFT: Microsoft,0.0475,Information Technology,EXLS: ExlService Holdings Inc,0.678646
577,MSFT: Microsoft,0.0475,Information Technology,NCR: NCR Corp,0.626939
39,MSFT: Microsoft,0.0475,Information Technology,AKAM: Akamai Technologies Inc,0.626913
381,MSFT: Microsoft,0.0475,Information Technology,HBI: Hanesbrands Inc,0.622895
399,MSFT: Microsoft,0.0475,Information Technology,HPE: Hewlett Packard Enterprise Company,0.615448
438,MSFT: Microsoft,0.0475,Information Technology,IQV: IQVIA Holdings Inc.,0.606855
538,MSFT: Microsoft,0.0475,Information Technology,MKC: McCormick,0.604284


#### Export to Cytoscape

In [82]:
peers_lsi.to_csv('output/peers_lsi.csv',sep='|',index=False)

#### Preliminary Network Output

![test](files/output/peers_lsi.png)

## Methodology 2: LDA Topic Modelling

In [None]:
# lda transformation
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
corpus_lda = lda[corpus] # apply lda model to corpus
lda.print_topics()

# data visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
vis

### Document Similarities (LDA)

In [None]:
# cosine similarity matrix with lda
index_lda = similarities.MatrixSimilarity(corpus_lda)
sims_lda = index_lda[corpus_lda]

# get top 10 peers for each ticker
n = len(sims_lda)
for i, s in enumerate(sims_lda):
    print(calls['ticker_name'].iloc[i])
    print(calls['ticker_name'].iloc[s.argsort()[::-1][:n][0:10]])

# get top peers for each ticker
peers_lda = pd.DataFrame()
for i, s in enumerate(sims_lda):
    peers_idx = np.where(s > 0.95)[0].tolist()
    this_df = pd.DataFrame({'company': [calls['ticker_name'].iloc[i]]*len(peers_idx),
                            'peer': calls['ticker_name'].iloc[peers_idx],
                            'value': s[peers_idx]})
    peers_lda = pd.concat([peers_lda, this_df])

#### Explore Company-Level Peers (LDA)

In [None]:
peers_lda.loc[peers_lda.company.str.contains('AMZN'),].sort_values('value',ascending=False)[1:]