# Introduction to Web scraping

In [1]:
!pip install requests
!pip install bs4

import requests
from bs4 import BeautifulSoup

#The library requests allows us to web scrape content
#BeautifulSoup parses html and makes navigating it much easier
r = requests.get("https://en.wikipedia.org/wiki/S%26P_100")
soup = BeautifulSoup(r.content, 'html.parser')

You should consider upgrading via the '/opt/conda/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/conda/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
#The prettify function allows for easier viewing of the data
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   S&amp;P 100 - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"e5786aba-0f37-4e1e-ad72-15d171e1746b","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"S\u0026P_100","wgTitle":"S\u0026P 100","wgCurRevisionId":1092242052,"wgRevisionId":1092242052,"wgArticleId":2658424,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","Articles with short description","Short description matches Wikidata","1983 introductions","Am

In [3]:
#We can look for a certain element using find
#It will return the first one
title = soup.find('title')
print(title)

<title>S&amp;P 100 - Wikipedia</title>


In [4]:
#The find_all function finds all applicable elements
tables = soup.find_all('table')
print(len(tables))
print()
print()
print(tables[0].prettify())

4


<table class="infobox vcard">
 <caption class="infobox-title fn n org">
  S&amp;P 100
 </caption>
 <tbody>
  <tr>
   <th class="infobox-label" scope="row">
    Foundation
   </th>
   <td class="infobox-data">
    June 15, 1983
    <span class="noprint">
     ; 39 years ago
    </span>
    <span style="display:none">
     (
     <span class="bday dtstart published updated">
      1983-06-15
     </span>
     )
    </span>
    <sup class="reference" id="cite_ref-S&amp;P500_1-0">
     <a href="#cite_note-S&amp;P500-1">
      [1]
     </a>
    </sup>
   </td>
  </tr>
  <tr>
   <th class="infobox-label" scope="row">
    Operator
   </th>
   <td class="infobox-data">
    <a href="/wiki/S%26P_Dow_Jones_Indices" title="S&amp;P Dow Jones Indices">
     S&amp;P Dow Jones Indices
    </a>
    <sup class="reference" id="cite_ref-S&amp;P500Details_2-0">
     <a href="#cite_note-S&amp;P500Details-2">
      [2]
     </a>
    </sup>
   </td>
  </tr>
  <tr>
   <th class="infobox-label" scope="row">

In [5]:
#We can also search by class in addition to element
#For example, let's get only sortable tables
sortable_tables = soup.findAll("table", {"class": "sortable"})
print(len(sortable_tables))
print()
print()
print(sortable_tables[0].prettify())

2


<table class="wikitable sortable" id="constituents">
 <tbody>
  <tr>
   <th>
    Symbol
   </th>
   <th>
    Name
   </th>
   <th>
    Sector
   </th>
  </tr>
  <tr>
   <td>
    AAPL
   </td>
   <td>
    <a href="/wiki/Apple_Inc." title="Apple Inc.">
     Apple
    </a>
   </td>
   <td>
    Information Technology
   </td>
  </tr>
  <tr>
   <td>
    ABBV
   </td>
   <td>
    <a href="/wiki/AbbVie" title="AbbVie">
     AbbVie
    </a>
   </td>
   <td>
    Health Care
   </td>
  </tr>
  <tr>
   <td>
    ABT
   </td>
   <td>
    <a href="/wiki/Abbott_Laboratories" title="Abbott Laboratories">
     Abbott
    </a>
   </td>
   <td>
    Health Care
   </td>
  </tr>
  <tr>
   <td>
    ACN
   </td>
   <td>
    <a href="/wiki/Accenture" title="Accenture">
     Accenture
    </a>
   </td>
   <td>
    Information Technology
   </td>
  </tr>
  <tr>
   <td>
    ADBE
   </td>
   <td>
    <a href="/wiki/Adobe_Inc." title="Adobe Inc.">
     Adobe
    </a>
   </td>
   <td>
    Information Technology

In [6]:
#We can see certain attributes of an element easily
example_table = sortable_tables[0]
#For example, the class(es) through attrs
print(example_table.attrs)
print()
print(example_table.attrs["class"])

{'class': ['wikitable', 'sortable'], 'id': 'constituents'}

['wikitable', 'sortable']


In [7]:
#Or just the text through the text attribute
print(example_table.text)



Symbol

Name

Sector


AAPL

Apple

Information Technology


ABBV

AbbVie

Health Care


ABT

Abbott

Health Care


ACN

Accenture

Information Technology


ADBE

Adobe

Information Technology


AIG

AIG

Financials


AMGN

Amgen

Health Care


AMT

American Tower

Real Estate


AMZN

Amazon

Consumer Discretionary


AVGO

Broadcom

Information Technology


AXP

American Express

Financials


BA

Boeing

Industrials


BAC

Bank of America

Financials


BK

BNY Mellon

Financials


BKNG

Booking Holdings

Consumer Discretionary


BLK

BlackRock

Financials


BMY

Bristol Myers Squibb

Health Care


BRK.B

Berkshire Hathaway

Financials


C

Citigroup

Financials


CAT

Caterpillar

Industrials


CHTR

Charter Communications

Communication Services


CL

Colgate-Palmolive

Consumer Staples


CMCSA

Comcast

Communication Services


COF

Capital One

Financials


COP

ConocoPhillips

Energy


COST

Costco

Consumer Staples


CRM

Salesforce

Information Technology


CSCO

Cisco

Informa

In [8]:
#Let's see if there is an easy way to identify which table is the list of the S&P 100 by class
sortable_table_classes = [table.attrs["class"] for table in sortable_tables]
for class_list in sortable_table_classes:
    print(class_list)
#They are the same, we will need to do something else

['wikitable', 'sortable']
['wikitable', 'sortable']


In [15]:
#Let's say we are certain the word Apple Inc. will only be in the table
apple_text_elements = soup.findAll(text='Apple')
print(apple_text_elements)
assert len(apple_text_elements) == 1
print()
print("Only one element with the text Apple Inc.")

['Apple']

Only one element with the text Apple Inc.


In [16]:
#It might look like a string, but if you look at the type you will see it is not
print(type(apple_text_elements[0]))

<class 'bs4.element.NavigableString'>


In [17]:
apple_element = apple_text_elements[0]
#The parent attribute gives you the element which contains the current element
print(apple_element.parent)

<a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>


In [18]:
#Getting the parent of the hyperlink, we are now in the <td> tag
print(apple_element.parent.parent)

<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>


In [19]:
#Then the row
print(apple_element.parent.parent.parent)

<tr>
<td>AAPL
</td>
<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>
<td>Information Technology
</td></tr>


In [20]:
#Then finally the body of the table
print(apple_element.parent.parent.parent.parent)

<tbody><tr>
<th>Symbol
</th>
<th>Name
</th>
<th>Sector
</th></tr>
<tr>
<td>AAPL
</td>
<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>ABBV
</td>
<td><a href="/wiki/AbbVie" title="AbbVie">AbbVie</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>ABT
</td>
<td><a href="/wiki/Abbott_Laboratories" title="Abbott Laboratories">Abbott</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>ACN
</td>
<td><a href="/wiki/Accenture" title="Accenture">Accenture</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>ADBE
</td>
<td><a href="/wiki/Adobe_Inc." title="Adobe Inc.">Adobe</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>AIG
</td>
<td><a class="mw-redirect" href="/wiki/AIG" title="AIG">AIG</a>
</td>
<td>Financials
</td></tr>
<tr>
<td>AMGN
</td>
<td><a href="/wiki/Amgen" title="Amgen">Amgen</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>AMT
</td>
<td><a href="/wiki/American_Tower" title="American Tower">American Tower</a>
</td>
<td>

In [21]:
#You can also query for a certain parent with a tag like below
print(apple_element.find_parent('tbody'))

<tbody><tr>
<th>Symbol
</th>
<th>Name
</th>
<th>Sector
</th></tr>
<tr>
<td>AAPL
</td>
<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>ABBV
</td>
<td><a href="/wiki/AbbVie" title="AbbVie">AbbVie</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>ABT
</td>
<td><a href="/wiki/Abbott_Laboratories" title="Abbott Laboratories">Abbott</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>ACN
</td>
<td><a href="/wiki/Accenture" title="Accenture">Accenture</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>ADBE
</td>
<td><a href="/wiki/Adobe_Inc." title="Adobe Inc.">Adobe</a>
</td>
<td>Information Technology
</td></tr>
<tr>
<td>AIG
</td>
<td><a class="mw-redirect" href="/wiki/AIG" title="AIG">AIG</a>
</td>
<td>Financials
</td></tr>
<tr>
<td>AMGN
</td>
<td><a href="/wiki/Amgen" title="Amgen">Amgen</a>
</td>
<td>Health Care
</td></tr>
<tr>
<td>AMT
</td>
<td><a href="/wiki/American_Tower" title="American Tower">American Tower</a>
</td>
<td>

In [22]:
#With this table, we can find all rows first
table = apple_element.find_parent('tbody')
table_rows = table.find_all("tr")
print(len(table_rows))
print()
print(table_rows[0])
print()
print(table_rows[1])

102

<tr>
<th>Symbol
</th>
<th>Name
</th>
<th>Sector
</th></tr>

<tr>
<td>AAPL
</td>
<td><a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>
</td>
<td>Information Technology
</td></tr>


In [23]:
#We could iterate through each row and grab the second element
#But it is just as clean to go straight for hyperlinks by selecting only the a elements
links = table.find_all('a')
print(len(links))
print(links[0])

100
<a href="/wiki/Apple_Inc." title="Apple Inc.">Apple</a>


In [24]:
#The attributes of each link holds the linked page
example_link = links[0]
print(example_link.attrs)
print()
print(example_link.attrs["href"])

{'href': '/wiki/Apple_Inc.', 'title': 'Apple Inc.'}

/wiki/Apple_Inc.


In [25]:
#Let's convert the links list to be the actual links
links = [link.attrs["href"] for link in links]
print(links)

['/wiki/Apple_Inc.', '/wiki/AbbVie', '/wiki/Abbott_Laboratories', '/wiki/Accenture', '/wiki/Adobe_Inc.', '/wiki/AIG', '/wiki/Amgen', '/wiki/American_Tower', '/wiki/Amazon_(company)', '/wiki/Broadcom_Inc.', '/wiki/American_Express', '/wiki/Boeing', '/wiki/Bank_of_America', '/wiki/BNY_Mellon', '/wiki/Booking_Holdings', '/wiki/BlackRock', '/wiki/Bristol_Myers_Squibb', '/wiki/Berkshire_Hathaway', '/wiki/Citigroup', '/wiki/Caterpillar_Inc.', '/wiki/Charter_Communications', '/wiki/Colgate-Palmolive', '/wiki/Comcast', '/wiki/Capital_One', '/wiki/ConocoPhillips', '/wiki/Costco', '/wiki/Salesforce', '/wiki/Cisco', '/wiki/CVS_Health', '/wiki/Chevron_Corporation', '/wiki/DuPont', '/wiki/Danaher_Corporation', '/wiki/The_Walt_Disney_Company', '/wiki/Dow_Inc.', '/wiki/Duke_Energy', '/wiki/Emerson_Electric', '/wiki/Exelon', '/wiki/Ford_Motor_Company', '/wiki/FedEx', '/wiki/General_Dynamics', '/wiki/General_Electric', '/wiki/Gilead_Sciences', '/wiki/General_Motors', '/wiki/Alphabet_Inc.', '/wiki/Goldm

In [26]:
#Because of the potential for there to be two share classes, we should check duplicates
#np.unique() returns two objects when return_counts = True, the unique values and the counts for them
import numpy as np
links_unique, link_counts = np.unique(links, return_counts=True)
print(links_unique)
print(link_counts)

['/wiki/3M' '/wiki/AIG' '/wiki/AT%26T' '/wiki/AbbVie'
 '/wiki/Abbott_Laboratories' '/wiki/Accenture' '/wiki/Adobe_Inc.'
 '/wiki/Alphabet_Inc.' '/wiki/Altria' '/wiki/Amazon_(company)'
 '/wiki/American_Express' '/wiki/American_Tower' '/wiki/Amgen'
 '/wiki/Apple_Inc.' '/wiki/BNY_Mellon' '/wiki/Bank_of_America'
 '/wiki/Berkshire_Hathaway' '/wiki/BlackRock' '/wiki/Boeing'
 '/wiki/Booking_Holdings' '/wiki/Bristol_Myers_Squibb'
 '/wiki/Broadcom_Inc.' '/wiki/CVS_Health' '/wiki/Capital_One'
 '/wiki/Caterpillar_Inc.' '/wiki/Charles_Schwab_Corporation'
 '/wiki/Charter_Communications' '/wiki/Chevron_Corporation' '/wiki/Cisco'
 '/wiki/Citigroup' '/wiki/Colgate-Palmolive' '/wiki/Comcast'
 '/wiki/ConocoPhillips' '/wiki/Costco' '/wiki/Danaher_Corporation'
 '/wiki/Dow_Inc.' '/wiki/DuPont' '/wiki/Duke_Energy'
 '/wiki/Eli_Lilly_and_Company' '/wiki/Emerson_Electric' '/wiki/Exelon'
 '/wiki/ExxonMobil' '/wiki/FedEx' '/wiki/Ford_Motor_Company'
 '/wiki/General_Dynamics' '/wiki/General_Electric' '/wiki/General

In [27]:
#We can turn the counts array into True/False based on being greated than 1....
i = link_counts > 1
print(i)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]


In [28]:
#And then index the unique links to see which links are duplicated
print(links_unique[i])
#Google and Fox have two share classes

[]


In [29]:
#Let's add in the part of the link that comes before each of these
links_unique = ["https://en.wikipedia.org/" + link for link in links_unique]
print(links_unique)

['https://en.wikipedia.org//wiki/3M', 'https://en.wikipedia.org//wiki/AIG', 'https://en.wikipedia.org//wiki/AT%26T', 'https://en.wikipedia.org//wiki/AbbVie', 'https://en.wikipedia.org//wiki/Abbott_Laboratories', 'https://en.wikipedia.org//wiki/Accenture', 'https://en.wikipedia.org//wiki/Adobe_Inc.', 'https://en.wikipedia.org//wiki/Alphabet_Inc.', 'https://en.wikipedia.org//wiki/Altria', 'https://en.wikipedia.org//wiki/Amazon_(company)', 'https://en.wikipedia.org//wiki/American_Express', 'https://en.wikipedia.org//wiki/American_Tower', 'https://en.wikipedia.org//wiki/Amgen', 'https://en.wikipedia.org//wiki/Apple_Inc.', 'https://en.wikipedia.org//wiki/BNY_Mellon', 'https://en.wikipedia.org//wiki/Bank_of_America', 'https://en.wikipedia.org//wiki/Berkshire_Hathaway', 'https://en.wikipedia.org//wiki/BlackRock', 'https://en.wikipedia.org//wiki/Boeing', 'https://en.wikipedia.org//wiki/Booking_Holdings', 'https://en.wikipedia.org//wiki/Bristol_Myers_Squibb', 'https://en.wikipedia.org//wiki/Bro

In [30]:
#Now that we have unique links, we can move on to pulling text from each page for our comparison of text context
#For example, let's grab the first page
r = requests.get(links_unique[0])
soup = BeautifulSoup(r.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   3M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"5ee1c712-d32d-4fde-a907-b1a376031116","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"3M","wgTitle":"3M","wgCurRevisionId":1100686341,"wgRevisionId":1100686341,"wgArticleId":7664801,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","CS1 Dutch-language sources (nl)","Articles with short description","Short description is different from Wikidata","Use Amer

In [31]:
#Let's grab all text data which is in the paragraphs of the page
paragraphs = soup.find_all("p")
paragraphs = [paragraph.text for paragraph in paragraphs]
#Notice the first paragraph is blank
print(paragraphs[0])
print(paragraphs[1])



The 3M Company (originally Minnesota Mining and Manufacturing Company) is an American multinational conglomerate corporation operating in the fields of industry, worker safety, U.S. health care, and consumer goods.[5] The company produces over 60,000 products under several brands,[6] including adhesives, abrasives, laminates, passive fire protection, personal protective equipment, window films, paint protection films, dental and orthodontic products, electrical and electronic connecting and insulating materials, medical products, car-care products,[7] electronic circuits, healthcare software and optical films.[8] It is based in Maplewood, a suburb of Saint Paul, Minnesota.[9]



In [32]:
#We can join all paragraph elements with a delimiter between them like so:
print("-".join(["Text 1", "Text 2"]))

Text 1-Text 2


In [33]:
#Let's do this with all the paragraph data but use a space to separate
paragraphs = " ".join(paragraphs)
print(paragraphs)


 The 3M Company (originally Minnesota Mining and Manufacturing Company) is an American multinational conglomerate corporation operating in the fields of industry, worker safety, U.S. health care, and consumer goods.[5] The company produces over 60,000 products under several brands,[6] including adhesives, abrasives, laminates, passive fire protection, personal protective equipment, window films, paint protection films, dental and orthodontic products, electrical and electronic connecting and insulating materials, medical products, car-care products,[7] electronic circuits, healthcare software and optical films.[8] It is based in Maplewood, a suburb of Saint Paul, Minnesota.[9]
 3M made $35.4 billion in total sales in 2021, and ranked number 102 in the Fortune 500 list of the largest United States corporations by total revenue.[10] As of 2021[update], the company had approximately 95,000 employees, and had operations in more than 70 countries.[4]
 Five businessmen founded the Minnesota

# Applied Text Data Processing

In [34]:
#Those numbers with brackets (annotations) are annoying, time to get rid of them
#A quick intro to regular expressions
import re
test_string = """This is our test string to find out how regular expressions work. [1]
We will be seeing what kinds of basic searches and functionality we can do. [2]
Case sensitivity may matter in ouR analysis. [33]"""

In [35]:
#Find all will return all matching strings
#In this case we will get back just the exact string because it is what we are looking for
print(re.findall("ou", test_string))

['ou', 'ou', 'ou']


In [36]:
#By using [a-z], we signal the next piece can be any lower case letter a-z
print(re.findall("ou[a-z]", test_string))

['our', 'out']


In [37]:
#Or using [a-r] we limit to letters a through r
print(re.findall("ou[a-r]", test_string))

['our']


In [38]:
#It is case sensitive, so we will see that A-Z gives us capitals
print(re.findall("ou[a-zA-Z]", test_string))

['our', 'out', 'ouR']


In [39]:
#The split function splits text by the searched text
print(re.split("ou[a-zA-Z]", test_string))

['This is ', ' test string to find ', ' how regular expressions work. [1]\nWe will be seeing what kinds of basic searches and functionality we can do. [2]\nCase sensitivity may matter in ', ' analysis. [33]']


In [40]:
#Sub replaces our text where we give a pattern followed by the replacement text followed by the string we are searching
print(re.sub("ou[a-zA-Z]", "[REDACTED]",test_string))

This is [REDACTED] test string to find [REDACTED] how regular expressions work. [1]
We will be seeing what kinds of basic searches and functionality we can do. [2]
Case sensitivity may matter in [REDACTED] analysis. [33]


In [41]:
#Searching for numbers is done like so
#Notice the 33 was split
print(re.findall("[1-9]",test_string))

['1', '2', '3', '3']


In [42]:
#Replacing he numbers
print(re.sub("[1-9]","X",test_string))

This is our test string to find out how regular expressions work. [X]
We will be seeing what kinds of basic searches and functionality we can do. [X]
Case sensitivity may matter in ouR analysis. [XX]


In [43]:
#Adding + after the brackets will say pick wherever there is that occurence one or more times
print(re.findall("[1-9]+",test_string))

['1', '2', '33']


In [44]:
#To find brackets in text, we have to use \[ and \]
print(re.findall("\[",test_string))
print(re.findall("\]",test_string))

['[', '[', '[']
[']', ']', ']']


In [45]:
#Finally, we can add the three together to find occurences where there are brackets with numbers inside
print(re.findall("\[[1-9]+\]",test_string))

['[1]', '[2]', '[33]']


In [46]:
print(re.sub("\[[1-9]+\]","",test_string))

This is our test string to find out how regular expressions work. 
We will be seeing what kinds of basic searches and functionality we can do. 
Case sensitivity may matter in ouR analysis. 


In [47]:
#Test with the actual data
print(paragraphs[:400])
print()
print(re.sub("\[[1-9]+\]","",paragraphs[:400]))


 The 3M Company (originally Minnesota Mining and Manufacturing Company) is an American multinational conglomerate corporation operating in the fields of industry, worker safety, U.S. health care, and consumer goods.[5] The company produces over 60,000 products under several brands,[6] including adhesives, abrasives, laminates, passive fire protection, personal protective equipment, window films, 


 The 3M Company (originally Minnesota Mining and Manufacturing Company) is an American multinational conglomerate corporation operating in the fields of industry, worker safety, U.S. health care, and consumer goods. The company produces over 60,000 products under several brands, including adhesives, abrasives, laminates, passive fire protection, personal protective equipment, window films, 


In [48]:
paragraphs = re.sub("\[[1-9]+\]","",paragraphs)

In [49]:
import nltk
#nltk is the library we will use for text analysis
stemmer = nltk.stem.SnowballStemmer('english')

In [50]:
#The stemmer breaks down words into roots
stemmer.stem("run running")

'run run'

In [51]:
paragraphs = stemmer.stem(paragraphs)

In [52]:
#To get counts, we can use the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

test_text = "Text that we want to analyze with text analysis."
test_text = stemmer.stem(test_text)
print(test_text)
print()
#The text is not readable from the initial fit_transform because it is in sparse matrix form
counts = vectorizer.fit_transform([test_text])
print(counts)

text that we want to analyze with text analysis.

  (0, 2)	2
  (0, 3)	1
  (0, 6)	1
  (0, 5)	1
  (0, 4)	1
  (0, 1)	1
  (0, 7)	1
  (0, 0)	1


In [53]:
#To array gives us the counts but not the words
print(counts.toarray())

[[1 1 2 1 1 1 1 1]]


In [54]:
#The function get_feature_names returns the names
print(vectorizer.get_feature_names())

['analysis', 'analyze', 'text', 'that', 'to', 'want', 'we', 'with']


In [55]:
import pandas as pd
#Convert the array to a series
print(pd.Series(counts.toarray()[0], index = vectorizer.get_feature_names()))

analysis    1
analyze     1
text        2
that        1
to          1
want        1
we          1
with        1
dtype: int64


In [56]:
#Stop words gets rid of the common words like a or the in the anglish language that don't add much to analysis
vectorizer = CountVectorizer(stop_words='english')
test_text = "Text that we want to analyze with text analysis."
test_text = stemmer.stem(test_text)
counts = vectorizer.fit_transform([test_text])
print(pd.Series(counts.toarray()[0], index = vectorizer.get_feature_names()))

analysis    1
analyze     1
text        2
want        1
dtype: int64


In [57]:
#Let's turn this into a function
def get_company_text(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    paragraphs = soup.find_all("p")
    paragraphs = [paragraph.text for paragraph in paragraphs]
    paragraphs = " ".join(paragraphs)
    pattern = "\[[0-9]+\]"
    paragraphs = re.sub(pattern, "", paragraphs)
    stemmer = nltk.stem.SnowballStemmer('english')
    paragraphs = stemmer.stem(paragraphs)
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform([paragraphs])
    counts = pd.Series(counts.toarray()[0],index=vectorizer.get_feature_names())
    return counts
counts = get_company_text(links_unique[0])
print(counts.sort_values(ascending=False))

3m              75
company         29
products        12
million         12
minnesota       11
                ..
manufactured     1
manner           1
management       1
making           1
harry            1
Length: 851, dtype: int64


In [58]:
#And extend to multiple companies
def get_company_text_multiple(urls):
    text = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        paragraphs = soup.find_all("p")
        paragraphs = [paragraph.text for paragraph in paragraphs]
        paragraphs = " ".join(paragraphs)
        pattern = "\[[0-9]+\]"
        paragraphs = re.sub(pattern, "", paragraphs)
        stemmer = nltk.stem.SnowballStemmer('english')
        paragraphs = stemmer.stem(paragraphs)
        text.append(paragraphs)
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(text)
    counts = pd.DataFrame(counts.toarray(),columns=vectorizer.get_feature_names()).transpose()
    return counts
counts = get_company_text_multiple(links_unique[0:2])
print(counts)

             0   1
000          9   0
100          1   1
100th        1   0
102          1   0
109          1   0
...         ..  ..
years        2   6
york         0  11
zaffino      0   5
zealand      0   1
zwijndrecht  1   0

[1696 rows x 2 columns]


In [59]:
#Let's find the index
index = [x[31:] for x in links_unique]
print(index)

['3M', 'AIG', 'AT%26T', 'AbbVie', 'Abbott_Laboratories', 'Accenture', 'Adobe_Inc.', 'Alphabet_Inc.', 'Altria', 'Amazon_(company)', 'American_Express', 'American_Tower', 'Amgen', 'Apple_Inc.', 'BNY_Mellon', 'Bank_of_America', 'Berkshire_Hathaway', 'BlackRock', 'Boeing', 'Booking_Holdings', 'Bristol_Myers_Squibb', 'Broadcom_Inc.', 'CVS_Health', 'Capital_One', 'Caterpillar_Inc.', 'Charles_Schwab_Corporation', 'Charter_Communications', 'Chevron_Corporation', 'Cisco', 'Citigroup', 'Colgate-Palmolive', 'Comcast', 'ConocoPhillips', 'Costco', 'Danaher_Corporation', 'Dow_Inc.', 'DuPont', 'Duke_Energy', 'Eli_Lilly_and_Company', 'Emerson_Electric', 'Exelon', 'ExxonMobil', 'FedEx', 'Ford_Motor_Company', 'General_Dynamics', 'General_Electric', 'General_Motors', 'Gilead_Sciences', 'Goldman_Sachs', 'Honeywell', 'IBM', 'Intel', 'JPMorgan_Chase', 'Johnson_%26_Johnson', 'Kraft_Heinz', 'Linde_plc', 'Lockheed_Martin', 'Lowe%27s', 'Mastercard', 'McDonald%27s', 'Medtronic', 'Merck_%26_Co.', 'MetLife', 'Meta

In [60]:
def get_company_text_multiple(urls):
    text = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        paragraphs = soup.find_all("p")
        paragraphs = [paragraph.text for paragraph in paragraphs]
        paragraphs = " ".join(paragraphs)
        pattern = "\[[0-9]+\]"
        paragraphs = re.sub(pattern, "", paragraphs)
        stemmer = nltk.stem.SnowballStemmer('english')
        paragraphs = stemmer.stem(paragraphs)
        text.append(paragraphs)
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(text)
    counts = pd.DataFrame(counts.toarray(),columns=vectorizer.get_feature_names()).transpose()
    return counts
counts = get_company_text_multiple(links_unique)
counts.columns = index
print(counts)

          3M  AIG  AT%26T  AbbVie  Abbott_Laboratories  Accenture  Adobe_Inc.  \
00         0    0       0       0                    0          0           0   
000        9    0       5       2                    1          4           8   
000791     0    0       0       0                    0          0           0   
000th      0    0       0       0                    0          0           0   
005        0    0       0       0                    0          0           0   
...       ..  ...     ...     ...                  ...        ...         ...   
токобанк   0    0       0       0                    0          0           0   
富士康        0    0       0       0                    0          0           0   
小刚学长       0    0       0       0                    0          0           0   
沃尔玛        0    0       0       0                    0          0           0   
西友         0    0       0       0                    0          0           0   

          Alphabet_Inc.  Al

In [61]:
#Save down the data for future use
counts.to_csv("Company Words.csv")

In [62]:
#Find the most common words
print(counts.sum(axis=1).sort_values(ascending=False).head(100))

company       4286
million       1621
billion       1512
new           1422
announced     1342
              ... 
revenue        328
public         325
division       322
operations     322
pay            321
Length: 100, dtype: int64


In [72]:
#Convert to frequency
word_frequency = counts.copy()
word_frequency = word_frequency / word_frequency.sum()
print(word_frequency)
word_frequency.columns

                3M  AIG    AT%26T    AbbVie  Abbott_Laboratories  Accenture  \
00        0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
000       0.006061  0.0  0.001995  0.002381             0.000532    0.00382   
000791    0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
000th     0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
005       0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
...            ...  ...       ...       ...                  ...        ...   
токобанк  0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
富士康       0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
小刚学长      0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
沃尔玛       0.000000  0.0  0.000000  0.000000             0.000000    0.00000   
西友        0.000000  0.0  0.000000  0.000000             0.000000    0.00000   

          Adobe_Inc.  Alphabet_Inc.    Altria  Amaz

Index(['3M', 'AIG', 'AT%26T', 'AbbVie', 'Abbott_Laboratories', 'Accenture',
       'Adobe_Inc.', 'Alphabet_Inc.', 'Altria', 'Amazon_(company)',
       'American_Express', 'American_Tower', 'Amgen', 'Apple_Inc.',
       'BNY_Mellon', 'Bank_of_America', 'Berkshire_Hathaway', 'BlackRock',
       'Boeing', 'Booking_Holdings', 'Bristol_Myers_Squibb', 'Broadcom_Inc.',
       'CVS_Health', 'Capital_One', 'Caterpillar_Inc.',
       'Charles_Schwab_Corporation', 'Charter_Communications',
       'Chevron_Corporation', 'Cisco', 'Citigroup', 'Colgate-Palmolive',
       'Comcast', 'ConocoPhillips', 'Costco', 'Danaher_Corporation',
       'Dow_Inc.', 'DuPont', 'Duke_Energy', 'Eli_Lilly_and_Company',
       'Emerson_Electric', 'Exelon', 'ExxonMobil', 'FedEx',
       'Ford_Motor_Company', 'General_Dynamics', 'General_Electric',
       'General_Motors', 'Gilead_Sciences', 'Goldman_Sachs', 'Honeywell',
       'IBM', 'Intel', 'JPMorgan_Chase', 'Johnson_%26_Johnson', 'Kraft_Heinz',
       'Linde_plc', 'Lo

# Company Distances and Industry Distances

In [75]:
#We can use euclidean distance to see how far away two companies are in terms of words 
def findDist(company1,company2):
    return sum((company1-company2)**2)**.5
#print(findDist(word_frequency['The_Bank_of_New_York_Mellon'],word_frequency['JPMorgan_Chase_%26_Co.']))
print(findDist(word_frequency['The_Coca-Cola_Company'],word_frequency['JPMorgan_Chase']))
#print(findDist(word_frequency['Facebook'],word_frequency['The_Bank_of_New_York_Mellon']))
print(findDist(word_frequency['FedEx'],word_frequency['The_Coca-Cola_Company']))

0.09361727253111736
0.11299820434872253


In [76]:
#We can use itertools to find the combinations
from  itertools import combinations
combinations = list(combinations(word_frequency.columns,2))
print(combinations)

[('3M', 'AIG'), ('3M', 'AT%26T'), ('3M', 'AbbVie'), ('3M', 'Abbott_Laboratories'), ('3M', 'Accenture'), ('3M', 'Adobe_Inc.'), ('3M', 'Alphabet_Inc.'), ('3M', 'Altria'), ('3M', 'Amazon_(company)'), ('3M', 'American_Express'), ('3M', 'American_Tower'), ('3M', 'Amgen'), ('3M', 'Apple_Inc.'), ('3M', 'BNY_Mellon'), ('3M', 'Bank_of_America'), ('3M', 'Berkshire_Hathaway'), ('3M', 'BlackRock'), ('3M', 'Boeing'), ('3M', 'Booking_Holdings'), ('3M', 'Bristol_Myers_Squibb'), ('3M', 'Broadcom_Inc.'), ('3M', 'CVS_Health'), ('3M', 'Capital_One'), ('3M', 'Caterpillar_Inc.'), ('3M', 'Charles_Schwab_Corporation'), ('3M', 'Charter_Communications'), ('3M', 'Chevron_Corporation'), ('3M', 'Cisco'), ('3M', 'Citigroup'), ('3M', 'Colgate-Palmolive'), ('3M', 'Comcast'), ('3M', 'ConocoPhillips'), ('3M', 'Costco'), ('3M', 'Danaher_Corporation'), ('3M', 'Dow_Inc.'), ('3M', 'DuPont'), ('3M', 'Duke_Energy'), ('3M', 'Eli_Lilly_and_Company'), ('3M', 'Emerson_Electric'), ('3M', 'Exelon'), ('3M', 'ExxonMobil'), ('3M', '

In [77]:
#Create the distance dataframe
distance = pd.DataFrame(combinations)
distance.columns = ["Company 1", "Company 2"]
#Create the distance for each combination
distance["Distance"] = distance.apply(lambda x: findDist(word_frequency[x["Company 1"]], word_frequency[x["Company 2"]]), axis=1)
print(distance)

                     Company 1            Company 2  Distance
0                           3M                  AIG  0.090671
1                           3M               AT%26T  0.069391
2                           3M               AbbVie  0.085404
3                           3M  Abbott_Laboratories  0.080801
4                           3M            Accenture  0.093700
...                        ...                  ...       ...
4945                 Visa_Inc.              Walmart  0.084946
4946                 Visa_Inc.          Wells_Fargo  0.097793
4947  Walgreens_Boots_Alliance              Walmart  0.085187
4948  Walgreens_Boots_Alliance          Wells_Fargo  0.103078
4949                   Walmart          Wells_Fargo  0.095554

[4950 rows x 3 columns]


In [78]:
#Find the closest distances
print(distance.sort_values(by="Distance"))

       Company 1                  Company 2  Distance
270       AT%26T         Procter_%26_Gamble  0.049968
223       AT%26T                  Citigroup  0.054381
2487   Citigroup             JPMorgan_Chase  0.054569
246       AT%26T             JPMorgan_Chase  0.059215
2511   Citigroup         Procter_%26_Gamble  0.059339
...          ...                        ...       ...
2900    Dow_Inc.             Morgan_Stanley  0.266923
2905    Dow_Inc.         Oracle_Corporation  0.267533
2922    Dow_Inc.             The_Home_Depot  0.267731
607   Adobe_Inc.                   Dow_Inc.  0.268298
2926    Dow_Inc.  Union_Pacific_Corporation  0.298321

[4950 rows x 3 columns]


In [79]:
#Let's reverse the columns and append so that we have both sides mirrored
temp = distance.copy()
temp.columns = ["Company 2", "Company 1", "Distance"]
distance = pd.concat([distance, temp], sort=False)
print(distance)

        Company 1                 Company 2  Distance
0              3M                       AIG  0.090671
1              3M                    AT%26T  0.069391
2              3M                    AbbVie  0.085404
3              3M       Abbott_Laboratories  0.080801
4              3M                 Accenture  0.093700
...           ...                       ...       ...
4945      Walmart                 Visa_Inc.  0.084946
4946  Wells_Fargo                 Visa_Inc.  0.097793
4947      Walmart  Walgreens_Boots_Alliance  0.085187
4948  Wells_Fargo  Walgreens_Boots_Alliance  0.103078
4949  Wells_Fargo                   Walmart  0.095554

[9900 rows x 3 columns]


In [80]:
#Let's see how Wells Fargo looks in terms of distance to other companies
print(distance[distance["Company 1"] == "Wells_Fargo"].sort_values(by="Distance"))

        Company 1                  Company 2  Distance
2534  Wells_Fargo                  Citigroup  0.080503
293   Wells_Fargo                     AT%26T  0.082387
3868  Wells_Fargo             JPMorgan_Chase  0.082884
4696  Wells_Fargo         Procter_%26_Gamble  0.086201
1379  Wells_Fargo                 BNY_Mellon  0.088772
...           ...                        ...       ...
4894  Wells_Fargo             The_Home_Depot  0.115610
671   Wells_Fargo                 Adobe_Inc.  0.117071
1121  Wells_Fargo             American_Tower  0.130811
4928  Wells_Fargo  Union_Pacific_Corporation  0.198566
2933  Wells_Fargo                   Dow_Inc.  0.262151

[99 rows x 3 columns]


In [81]:
#Let's grab industry classifications, first the infobox needs to be found
r = requests.get(links_unique[0])
soup = BeautifulSoup(r.content, 'html.parser')
infobox = soup.find("table", {"class": "infobox"})
infobox

<table class="infobox vcard"><caption class="infobox-title fn org" style="font-size: 125%;">3M Company</caption><tbody><tr><td class="infobox-image logo" colspan="2"><a class="image" href="/wiki/File:3M_wordmark.svg"><img alt="3M wordmark.svg" data-file-height="158" data-file-width="300" decoding="async" height="92" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/15/3M_wordmark.svg/175px-3M_wordmark.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/15/3M_wordmark.svg/263px-3M_wordmark.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/15/3M_wordmark.svg/350px-3M_wordmark.svg.png 2x" width="175"/></a></td></tr><tr><td class="infobox-image logo" colspan="2"><a class="image" href="/wiki/File:3-M_Building_Maplewood_MN1.jpg"><img alt="3-M Building Maplewood MN1.jpg" data-file-height="2979" data-file-width="4644" decoding="async" height="160" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c9/3-M_Building_Maplewood_MN1.jpg/250px-3-M_Building_Maplewood_

In [82]:
#Within the text box we need to find the table header with text industry
print(infobox.find("th", text = "Industry"))

<th class="infobox-label" scope="row" style="padding-right: 0.5em;">Industry</th>


In [83]:
#And the parent of this
infobox.find("th", text = "Industry").parent()

[<th class="infobox-label" scope="row" style="padding-right: 0.5em;">Industry</th>,
 <td class="infobox-data category" style="line-height: 1.35em;"><a href="/wiki/Conglomerate_(company)" title="Conglomerate (company)">Conglomerate</a></td>,
 <a href="/wiki/Conglomerate_(company)" title="Conglomerate (company)">Conglomerate</a>]

In [84]:
#This returns an array and we will need to grab the second element
infobox.find("th", text = "Industry").parent()[1]

<td class="infobox-data category" style="line-height: 1.35em;"><a href="/wiki/Conglomerate_(company)" title="Conglomerate (company)">Conglomerate</a></td>

In [85]:
#Within the industries, the hyperlinks will have the industry classifications
industries = [x.text for x in infobox.find("th", text = "Industry").parent()[1].find_all('a')]
print(industries)

['Conglomerate']


In [86]:
#Turn it into a function
def get_company_industries(urls):
    industries_data = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        infobox = soup.find("table", {"class": "infobox"})
        industries = [x.text for x in infobox.find("th", text = "Industry").parent()[1].find_all('a')]
        industries_data.append(industries)
    return industries_data
print(get_company_industries(links_unique[:5]))

[['Conglomerate'], ['Financial services'], ['Telecommunications', 'Technology'], ['Biopharmaceutical'], ['Health care', 'Medical devices', 'Pharmaceutical']]


In [87]:
#Instead of an array, let's modify to get a dataframe of dummy variables representing what industries each company is tagged with
def get_company_industries(urls):
    industries_data = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        infobox = soup.find("table", {"class": "infobox"})
        industries = [x.text for x in infobox.find("th", text = "Industry").parent()[1].find_all('a')]
        industries = pd.Series(1, index=industries)
        industries_data.append(industries)
    industries_data = pd.concat(industries_data,axis=1,sort=False).fillna(0)
    return industries_data
print(get_company_industries(links_unique[:5]))

                      0    1    2    3    4
Conglomerate        1.0  0.0  0.0  0.0  0.0
Financial services  0.0  1.0  0.0  0.0  0.0
Telecommunications  0.0  0.0  1.0  0.0  0.0
Technology          0.0  0.0  1.0  0.0  0.0
Biopharmaceutical   0.0  0.0  0.0  1.0  0.0
Health care         0.0  0.0  0.0  0.0  1.0
Medical devices     0.0  0.0  0.0  0.0  1.0
Pharmaceutical      0.0  0.0  0.0  0.0  1.0


In [88]:
#And clean up with transposing and putting in the index of tickers
def get_company_industries(urls):
    industries_data = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        infobox = soup.find("table", {"class": "infobox"})
        industries = [x.text for x in infobox.find("th", text = "Industry").parent()[1].find_all('a')]
        industries = pd.Series(1, index=industries)
        industries_data.append(industries)
    industries_data = pd.concat(industries_data,axis=1,sort=False).fillna(0)
    return industries_data
industries = get_company_industries(links_unique)
industries = industries.transpose()
industries.index = index
print(industries)

                          Conglomerate  Financial services  \
3M                                 1.0                 0.0   
AIG                                0.0                 1.0   
AT%26T                             0.0                 0.0   
AbbVie                             0.0                 0.0   
Abbott_Laboratories                0.0                 0.0   
...                                ...                 ...   
Verizon_Communications             0.0                 0.0   
Visa_Inc.                          0.0                 1.0   
Walgreens_Boots_Alliance           0.0                 0.0   
Walmart                            0.0                 0.0   
Wells_Fargo                        0.0                 1.0   

                          Telecommunications  Technology  Biopharmaceutical  \
3M                                       0.0         0.0                0.0   
AIG                                      0.0         0.0                0.0   
AT%26T            

In [89]:
#Which industries are the most common?
print(industries.sum().sort_values(ascending=False))

Financial services            16.0
Conglomerate                   6.0
Energy                         5.0
Retail                         5.0
Telecommunications             5.0
                              ... 
Medical equipment              1.0
Pharmaceutical industry        1.0
Social media                   1.0
Social network advertising     1.0
Shipbuilding                   1.0
Length: 102, dtype: float64


In [90]:
#We can see that an issue is Financial Services and financial services are listed differently
#We need to handle that by using lower on industries
def get_company_industries(urls):
    industries_data = []
    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        infobox = soup.find("table", {"class": "infobox"})
        industries = [x.text.lower() for x in infobox.find("th", text = "Industry").parent()[1].find_all('a')]
        industries = pd.Series(1, index=industries)
        industries_data.append(industries)
    industries_data = pd.concat(industries_data,axis=1,sort=False).fillna(0)
    return industries_data
industries = get_company_industries(links_unique)
industries = industries.transpose()
industries.index = index
print(industries)

                          conglomerate  financial services  \
3M                                 1.0                 0.0   
AIG                                0.0                 1.0   
AT%26T                             0.0                 0.0   
AbbVie                             0.0                 0.0   
Abbott_Laboratories                0.0                 0.0   
...                                ...                 ...   
Verizon_Communications             0.0                 0.0   
Visa_Inc.                          0.0                 1.0   
Walgreens_Boots_Alliance           0.0                 0.0   
Walmart                            0.0                 0.0   
Wells_Fargo                        0.0                 1.0   

                          telecommunications  technology  biopharmaceutical  \
3M                                       0.0         0.0                0.0   
AIG                                      0.0         0.0                0.0   
AT%26T            

In [91]:
#Now the industries are corrected
print(industries.sum().sort_values(ascending=False))

financial services         16.0
conglomerate                6.0
cloud computing             5.0
telecommunications          5.0
retail                      5.0
                           ... 
[1]                         1.0
[2]                         1.0
medical equipment           1.0
pharmaceutical industry     1.0
transportation              1.0
Length: 98, dtype: float64


In [92]:
#Filter to be only industries with at least 2 instances
industries = industries.loc[:,industries.sum() >= 2]

In [93]:
#Let's see which companies are in financial services
fin_services = industries[industries['financial services'] == 1].index
print(fin_services)

Index(['AIG', 'American_Express', 'BNY_Mellon', 'Bank_of_America',
       'Capital_One', 'Caterpillar_Inc.', 'Charles_Schwab_Corporation',
       'Citigroup', 'Goldman_Sachs', 'JPMorgan_Chase', 'Mastercard', 'MetLife',
       'Morgan_Stanley', 'U.S._Bank', 'Visa_Inc.', 'Wells_Fargo'],
      dtype='object')


In [94]:
#Now that we know which companies are in, we can make two indices
#First, the index for whenever both companies are in financial services
fin_services_index = distance['Company 1'].isin(fin_services) & distance['Company 2'].isin(fin_services)
#And then the second index for combinations where exactly one company is in financial services
fin_services_index2 = distance['Company 1'].isin(fin_services) & ~distance['Company 2'].isin(fin_services)
print(fin_services_index)

0       False
1       False
2       False
3       False
4       False
        ...  
4945    False
4946     True
4947    False
4948    False
4949    False
Length: 9900, dtype: bool


In [95]:
#Let's check how similar companies are within and outside of the financials industry
print(distance.loc[fin_services_index]["Distance"].mean())
print(distance.loc[fin_services_index2]["Distance"].mean())

0.0871581197172616
0.09670214666100681


In [96]:
#And check how different industries line up
#First create the base of the dataframe, each combination of industry
from  itertools import combinations
industry_distances = pd.DataFrame(list(combinations(industries.columns,2)))
industry_distances.columns = ["Industry 1", "Industry 2"]
print(industry_distances)

               Industry 1              Industry 2
0            conglomerate      financial services
1            conglomerate      telecommunications
2            conglomerate              technology
3            conglomerate          pharmaceutical
4            conglomerate  information technology
..                    ...                     ...
698  information security        renewable energy
699  information security     enterprise software
700              beverage        renewable energy
701              beverage     enterprise software
702      renewable energy     enterprise software

[703 rows x 2 columns]


In [97]:
#Create a function which finds average distance between chosen industries
def find_industry_distance(key1,key2,distance):
    firms1 = industries[industries[key1] == 1].index
    firms2 = industries[industries[key2] == 1].index
    i1 = distance['Company 1'].isin(firms1) & distance['Company 2'].isin(firms2)
    i2 = distance['Company 1'].isin(firms2) & distance['Company 2'].isin(firms1)
    i = i1 | i2
    return distance.loc[i]["Distance"].mean()

print(find_industry_distance('conglomerate','telecommunications',distance))

0.08787884964410483


In [None]:
#Get the distances 
industry_distances["Distance"] = industry_distances.apply(lambda x: find_industry_distance(x["Industry 1"],x["Industry 2"],distance), axis=1)

In [None]:
#Drop null values in place
industry_distances.dropna(inplace=True)

In [None]:
#And mirror like we did before
temp = industry_distances.copy()
temp.columns =  ["Industry 2", "Industry 1", "Distance"]
industry_distances = pd.concat([industry_distances,temp],sort=False)
print(industry_distances)

In [None]:
#Find the most similar
print(industry_distances.sort_values(by="Distance"))

In [None]:
print(industry_distances.sort_values(by='Distance').dropna())

In [None]:
#What about the most similar to financial technology?
print(industry_distances[industry_distances["Industry 1"] == 'financial services'].sort_values(by='Distance').dropna())

In [None]:
#Grab the top 5 similar industries
top_ind = industry_distances[industry_distances["Industry 1"] == 'financial services'].sort_values(by='Distance').dropna()["Industry 2"].values[:5]
print(top_ind)

In [None]:
#Add in financial services since that's our base
top_ind = ['financial services']+list(top_ind)

In [None]:
#Pivot
pivot_data = industry_distances.pivot("Industry 1", "Industry 2", "Distance").reindex(index=top_ind, columns=top_ind)
print(pivot_data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
#Plot the heatmap
sns.heatmap(pivot_data)
plt.show()

In [None]:
#Change the color
sns.heatmap(pivot_data, cmap='YlGnBu')
plt.show()

In [None]:
#Annotate
sns.heatmap(pivot_data, cmap='YlGnBu', annot=True)
plt.show()