# 1. Install, Imports, Settings

In [57]:
#!C:\Users\Admin\anaconda3\python.exe -m pip install --upgrade pip
#!pip install beautifulsoup4

In [58]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [91]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

# 2. Read entries using Table of Contents page

In [59]:
BASIC_URL = "https://plato.stanford.edu/entries/"
page = requests.get("https://plato.stanford.edu/archives/spr2022/contents.html")
print(page.status_code)  #200 is ok

200


In [60]:
soup = BeautifulSoup(page.content, 'html.parser')

In [61]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="ie6 ie"> <![endif]-->
<!--[if IE 7]>    <html class="ie7 ie"> <![endif]-->
<!--[if IE 8]>    <html class="ie8 ie"> <![endif]-->
<!--[if IE 9]>    <html class="ie9 ie"> <![endif]-->
<!--[if !IE]> -->
<html>
 <!-- <![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Table of Contents (Stanford Encyclopedia of Philosophy/Spring 2022 Edition)
  </title>
  <!-- NOTE: Import webfonts using this link: -->
  <link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,300,600,200&amp;subset=latin,latin-ext" rel="stylesheet" type="text/css"/>
  <link href="css/bootstrap.min.css" media="screen,handheld" rel="stylesheet" type="text/css"/>
  <link href="css/bootstrap-responsive.min.css" media="screen,handheld" rel="stylesheet" type="text/css"/>
  <link href="css/font-awesome.min.css" rel="stylesheet" type="text/css"/

In [62]:
entries = []
for li in soup.find_all('li'):
    if li.a is None:
        continue
    if li.a['href'].startswith('entries/'):
        entries.append(li)
len(entries)

2535

## 2.1 Testing entries' information

In [63]:
entry = entries[10]

print(entry.text)
print(entry)

alias = entry.text.split("— see")[0] if "— see" in entry.text else ""
id = entry.a['href'].replace('entries/', '').replace('/', '')
url = entry.a['href'].replace('entries/', BASIC_URL)
html_page = requests.get(url)

if html_page.status_code == 200:
    soup = BeautifulSoup(html_page.content, 'html.parser')
    title = soup.find('h1').text
    html_content = soup.prettify()
    
print(id, title, url, alias)
    
#links
links = soup.find("div", {"id": "related-entries"})
for link in links.p.find_all('a'):
    print(id, link['href'].replace('../', '').replace('/', ''))

 logic of — see logic: action 
<li> logic of — see <a href="entries/logic-action/">logic: action</a> </li>
logic-action The Logic of Action https://plato.stanford.edu/entries/logic-action/  logic of 
logic-action events
logic-action frame-problem
logic-action dynamic-epistemic
logic-action logic-nonmonotonic
logic-action logic-dynamic
logic-action logic-temporal
logic-action dynamic-semantics
logic-action situations-semantics
logic-action speech-acts


# 3. Extract entries's information and their relations

In [64]:
nodes = [] #id, title, url, alias
edges = []

for entry in entries:
    alias = entry.text.split("— see")[0] if "— see" in entry.text else ""
    id = entry.a['href'].replace('entries/', '').replace('/', '')
    url = entry.a['href'].replace('entries/', BASIC_URL)
    html_page = requests.get(url)
    if html_page.status_code == 200:
        soup = BeautifulSoup(html_page.content, 'html.parser')
        title = soup.find('h1').text
        nodes.append([id, title, url, alias])
        
        #links
        links = soup.find("div", {"id": "related-entries"})
        for link in links.p.find_all('a'):
            edges.append([id, link['href'].replace('../', '').replace('/', '')])
                         
print (len(nodes))
print (len(edges))

2535
25988


In [65]:
dfn = pd.DataFrame(nodes, columns=["ID", "title", "url", "known_as"])
dfe = pd.DataFrame(edges, columns=["IDSource", "IDTarget"])

In [66]:
dfn.head(10)

Unnamed: 0,ID,title,url,known_as
0,abduction,Abduction,https://plato.stanford.edu/entries/abduction/,
1,abelard,Peter Abelard,https://plato.stanford.edu/entries/abelard/,
2,abhidharma,Abhidharma,https://plato.stanford.edu/entries/abhidharma/,
3,abilities,Abilities,https://plato.stanford.edu/entries/abilities/,
4,abner-burgos,Abner of Burgos,https://plato.stanford.edu/entries/abner-burgos/,
5,abrabanel,Judah Abrabanel,https://plato.stanford.edu/entries/abrabanel/,
6,abstract-objects,Abstract Objects,https://plato.stanford.edu/entries/abstract-ob...,
7,essential-accidental,Essential vs. Accidental Properties,https://plato.stanford.edu/entries/essential-a...,accidental properties
8,action,Action,https://plato.stanford.edu/entries/action/,action (George Wilson and Samuel Shpall)\n ...
9,shared-agency,Shared Agency,https://plato.stanford.edu/entries/shared-agency/,joint


In [67]:
dfe.head()

Unnamed: 0,IDSource,IDTarget
0,abduction,epistemology-bayesian
1,abduction,induction-problem
2,abduction,peirce
3,abduction,scientific-explanation
4,abduction,scientific-realism


## 3.1 Delete duplicated edges

In [68]:
#delete duplicates edges
dfe = dfe.groupby(list(dfe.columns)).count().reset_index()
len(dfe)

18075

## 3.2 Merging duplicated nodes

In [69]:
dfn = dfn.groupby(['ID', 'title', 'url'])['known_as'].apply(list).to_frame()
dfn.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,known_as
ID,title,url,Unnamed: 3_level_1
18thGerman-preKant,18th Century German Philosophy Prior to Kant,https://plato.stanford.edu/entries/18thGerman-preKant/,"[, ]"
abduction,Abduction,https://plato.stanford.edu/entries/abduction/,"[, inference to the best explanation ]"
abelard,Peter Abelard,https://plato.stanford.edu/entries/abelard/,[]
abhidharma,Abhidharma,https://plato.stanford.edu/entries/abhidharma/,[]
abilities,Abilities,https://plato.stanford.edu/entries/abilities/,[]


In [70]:
len(dfn)

1750

In [71]:
dfn = dfn.reset_index()
dfn.head()

Unnamed: 0,ID,title,url,known_as
0,18thGerman-preKant,18th Century German Philosophy Prior to Kant,https://plato.stanford.edu/entries/18thGerman-...,"[, ]"
1,abduction,Abduction,https://plato.stanford.edu/entries/abduction/,"[, inference to the best explanation ]"
2,abelard,Peter Abelard,https://plato.stanford.edu/entries/abelard/,[]
3,abhidharma,Abhidharma,https://plato.stanford.edu/entries/abhidharma/,[]
4,abilities,Abilities,https://plato.stanford.edu/entries/abilities/,[]


In [72]:
aliasses = list(dfn['known_as'])
new_al = []
for a in aliasses:
    a = list(filter(None, a))
    a = [s.split("\n   \n ")[1].strip() if "\n   \n " in s else s.strip() for s in a]
    a = list(set(a))   
    new_al.append(a)
    
len(aliasses), len(new_al)

(1750, 1750)

In [73]:
dfn['known_as_'] = new_al

## 3.3 Save nodes and edges in csv files

In [74]:
dfe.to_csv("standford_links.csv",  sep='\t', index=False)
dfn.to_csv("standford_entries.csv", sep='\t', index = False)

# 4. Save page content by ID

In [76]:
#content 
def  get_content(url):
    html_page = requests.get(url)

    if html_page.status_code == 200:
        soup = BeautifulSoup(html_page.content, 'html.parser')
        return soup.prettify()
    return ""

dfc = dfn[['ID', 'url']]
dfc['content'] = dfc['url'].apply(get_content)
dfc.to_csv("standford_content.csv",  sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['content'] = dfc['url'].apply(get_content)


In [77]:
dfc.head()

Unnamed: 0,ID,url,content
0,18thGerman-preKant,https://plato.stanford.edu/entries/18thGerman-...,<!DOCTYPE html>\n<!--[if lt IE 7]> <html class...
1,abduction,https://plato.stanford.edu/entries/abduction/,<!DOCTYPE html>\n<!--[if lt IE 7]> <html class...
2,abelard,https://plato.stanford.edu/entries/abelard/,<!DOCTYPE html>\n<!--[if lt IE 7]> <html class...
3,abhidharma,https://plato.stanford.edu/entries/abhidharma/,<!DOCTYPE html>\n<!--[if lt IE 7]> <html class...
4,abilities,https://plato.stanford.edu/entries/abilities/,<!DOCTYPE html>\n<!--[if lt IE 7]> <html class...


In [78]:
dfc[['ID', 'content']].to_csv("standford_content.csv",  sep='\t', index=False)

In [85]:
dtest = pd.read_csv("standford_entries.csv", header= 1)
dtest.head()

Unnamed: 0,ID,title,url,related_to
0,18thGerman-preKant,18th Century German Philosophy Prior to Kant,https://plato.stanford.edu/entries/18thGerman-...,[]
1,abduction,Abduction,https://plato.stanford.edu/entries/abduction/,['inference to the best explanation']
2,abelard,Peter Abelard,https://plato.stanford.edu/entries/abelard/,[]
3,abhidharma,Abhidharma,https://plato.stanford.edu/entries/abhidharma/,[]
4,abilities,Abilities,https://plato.stanford.edu/entries/abilities/,[]


# 5. Testing aliasses

In [87]:
list(dtest.related_to)

['[]',
 "['inference to the best explanation']",
 '[]',
 '[]',
 '[]',
 '[]',
 "['Leone Ebreo']",
 "['Daud, Abraham Ibn']",
 '[]',
 '[]',
 "['joint']",
 "['action-based theories']",
 "['metaphysics of']",
 "['actualism and possibilism']",
 '[]',
 '[]',
 '[]',
 "['living wills', 'surrogate decision-making for incompetent individuals', 'advance directives and substitute decision-making']",
 "['aesthetic value']",
 "['aesthetic']",
 '[]',
 '[]',
 '[]',
 '[]',
 "['aesthetics']",
 '[]',
 '[]',
 "['ethics, African']",
 '[]',
 '[]',
 '[]',
 "['immortality']",
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 "['Alexander of Aphrosias']",
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 '[]',
 "['altruism']",
 '[]',
 '[]',
 "['Ammonius']",
 "['analogy']",
 '[]',
 "['synthetic']",
 '[]',
 '[]',
 '[]',
 '[]',
 "['psyche']",
 '[]',
 '[]',
 '[]',
 "['anomalous']",
 '[]',
 '[]',
 '[]',
 '[]',
 "['a posteriori knowledge', '

In [89]:
dtest[dtest['title'] ==  'Chan Buddhism']

Unnamed: 0,ID,title,url,related_to
188,buddhism-chan,Chan Buddhism,https://plato.stanford.edu/entries/buddhism-chan/,"['Chan', 'Chan Buddhism (Peter Hershock) \n Ch..."


In [93]:
dtest = dtest[['ID', 'title', 'url']]
dtest.to_csv("standford_entries.csv",   index=False)

In [95]:
dtest = pd.read_csv("standford_entries.csv", header= 0)
dtest.head()

Unnamed: 0,ID,title,url
0,18thGerman-preKant,18th Century German Philosophy Prior to Kant,https://plato.stanford.edu/entries/18thGerman-preKant/
1,abduction,Abduction,https://plato.stanford.edu/entries/abduction/
2,abelard,Peter Abelard,https://plato.stanford.edu/entries/abelard/
3,abhidharma,Abhidharma,https://plato.stanford.edu/entries/abhidharma/
4,abilities,Abilities,https://plato.stanford.edu/entries/abilities/
