# Make funtion to scrape data

### Scrape table of content

In [75]:
import requests
from bs4 import BeautifulSoup

# URL of the page you want to scrape
url = 'https://plato.stanford.edu/contents.html'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags. You might need to be more specific based on the page structure
article_links = soup.find_all('a')

articles = list()
for link in article_links:
    href = link.get('href')
    full_url = f'https://plato.stanford.edu/{href}'
    articles.append(full_url)  # Or do something else with the link

articles

['https://plato.stanford.edu/index.html',
 'https://plato.stanford.edu/index.html',
 'https://plato.stanford.edu/#',
 'https://plato.stanford.edu/contents.html',
 'https://plato.stanford.edu/new.html',
 'https://plato.stanford.edu/https://plato.stanford.edu/cgi-bin/encyclopedia/random',
 'https://plato.stanford.edu/published.html',
 'https://plato.stanford.edu/archives/',
 'https://plato.stanford.edu/#',
 'https://plato.stanford.edu/info.html',
 'https://plato.stanford.edu/about.html',
 'https://plato.stanford.edu/board.html',
 'https://plato.stanford.edu/cite.html',
 'https://plato.stanford.edu/special-characters.html',
 'https://plato.stanford.edu/tools/',
 'https://plato.stanford.edu/contact.html',
 'https://plato.stanford.edu/#',
 'https://plato.stanford.edu/support/',
 'https://plato.stanford.edu/support/friends.html',
 'https://plato.stanford.edu/support/donate.html',
 'https://plato.stanford.edu/support/sepia.html',
 'https://plato.stanford.edu/#a',
 'https://plato.stanford.edu/

### Scrape article body

In [241]:
import requests
from bs4 import BeautifulSoup

def extract_body(url):
    """
    Extracts the body of an article from the given URL.

    Parameters:
    url (str): The URL of the article to scrape.

    Returns:
    str: The cleaned text of the article.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the main content of the article
        article_body = soup.find('div', {'id': 'main-text'})
        
        # Extract text and replace newline characters with a space
        article_text = ' '.join(article_body.stripped_strings) if article_body else 'Content not found'
        article_text = article_text.replace('\n',' ')
        
        return article_text

    except Exception as e:
        return f"An error occurred: {e}"

# Example usage:
url = 'https://plato.stanford.edu/entries/abduction/'
text = extract_body(url)
text

'1. Abduction: The General Idea You happen to know that Tim and Harry have recently had a terrible row that ended their friendship. Now someone tells you that she just saw Tim and Harry jogging together. The best explanation for this that you can think of is that they made up. You conclude that they are friends again. One morning you enter the kitchen to find a plate and cup on the table, with breadcrumbs and a pat of butter on it, and surrounded by a jar of jam, a pack of sugar, and an empty carton of milk. You conclude that one of your house-mates got up at night to make him- or herself a midnight snack and was too tired to clear the table. This, you think, best explains the scene you are facing. To be sure, it might be that someone burgled the house and took the time to have a bite while on the job, or a house-mate might have arranged the things on the table without having a midnight snack but just to make you believe that someone had a midnight snack. But these hypotheses strike yo

### scrape biliography list

In [209]:
import requests
from bs4 import BeautifulSoup

def extract_bibliography(url):
    """
    Extracts the bibliography from the given URL.

    Parameters:
    url (str): The URL of the article to scrape.

    Returns:
    list: A list of bibliography entries.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the bibliography
        bibliography = soup.find('div', {'id': 'bibliography'})
        bibliography_entries = []
        if bibliography:
            # Extract text from each <li> tag within the bibliography section
            for li in bibliography.find_all('li'):
                bibliography_entries.append(' '.join(li.stripped_strings))

        return bibliography_entries

    except Exception as e:
        return f"An error occurred: {e}"

# Example usage
url = 'https://plato.stanford.edu/entries/abduction/'
bibliography_entries = bibliography_extract(url)
pd.DataFrame(bibliography_entries)

Unnamed: 0,0
0,"Achinstein, P., 2001. The Book of Evidence , O..."
1,"Adler, J., 1994. “Testimony, Trust, Knowing,” ..."
2,"Bach, K. and Harnish, R., 1979. Linguistic Com..."
3,"Bird, A., 1998. Philosophy of Science , London..."
4,"Bigelow, J., 2010. “Quine, Mereology, and Infe..."
...,...
106,"–––, 2005. “The Refutation of\nSkepticism,” in..."
107,"Weintraub, R., 2013. “Induction and Inference ..."
108,"Weisberg, J., 2009. “Locating IBE in the Bayes..."
109,"Williams, J. and Lombrozo, T., 2010. “The Role..."


In [187]:
cite_len = []
for i in range(len(bibliography_entries)):
    cite_len.append(len(bibliography_entries[i]))

In [205]:
bib = pd.DataFrame(bibliography_entries)
bib.rename(columns = {0:'citation'}, inplace=True)
bib['char_len']= cite_len
print(bib.iloc[0:10])

                                            citation  char_len
0  Achinstein, P., 2001. The Book of Evidence , O...        77
1  Adler, J., 1994. “Testimony, Trust, Knowing,” ...        82
2  Bach, K. and Harnish, R., 1979. Linguistic Com...        99
3  Bird, A., 1998. Philosophy of Science , London...        58
4  Bigelow, J., 2010. “Quine, Mereology, and Infe...       112
5  Bovens, L. and Hartmann, S., 2003. “Solving th...        90
6  Boyd, R., 1981. “Scientific Realism and Natura...       189
7  –––, 1984. “The Current Status of\nScientific ...       151
8  –––, 1985. “Lex Orandi est Lex\nCredendi,” in ...       151
9  Brem, S. and Rips, L. J., 2000. “Explanation a...       113


# Use Funtions

In [292]:
body_list = list()
bibliography_list = list()
topic_list = list()
url_list = list()

for url in set(articles[48:]):
    body = extract_body(url)
    body_list.append(body)
    
    biblio = extract_bibliography(url)
    bibliography_list.append(biblio)
    
    extract_topic = url.rsplit('/')[-2]
    topic_list.append(extract_topic)
    
    url_list.append(url)

In [294]:
# Make the data frame 
df = pd.DataFrame({
    'url': url_list,
    'topic': topic_list,
    'body': body_list,
    'bibliography': bibliography_list
                  })

In [323]:
from concurrent.futures import ThreadPoolExecutor

def scrape_url(url):
    body = extract_body(url)
    bibliography = extract_bibliography(url)
    topic = url.rsplit('/')[-2]
    return {'url': url, 'topic': topic, 'body': body, 'bibliography': bibliography}

# List of URLs to scrape
urls = set(articles[48:])

# Using ThreadPoolExecutor to fetch data concurrently
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(scrape_url, urls))

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(results)

In [342]:
# Convert list of dictionaries to DataFrame
df = pd.DataFrame(results)


Unnamed: 0,url,topic,body,bibliography
0,https://plato.stanford.edu/entries/kant-leibniz/,kant-leibniz,1. Introduction Leibniz and his follower Chris...,[References to Leibniz’s texts are to C.I. Ger...
1,https://plato.stanford.edu/entries/james/,james,1. Chronology of James’s Life 1842. Born in Ne...,"[The Works of William James , Cambridge, MA an..."
2,https://plato.stanford.edu/entries/heytesbury/,heytesbury,1. Life and Works William Heytesbury was most ...,"[[CO] Casus\nobligationis, [DML] De motu\nloca..."
3,https://plato.stanford.edu/entries/latin-ameri...,latin-american-analytic,1. Geographical and Theoretical Boundaries Thi...,"[Análisis Filosófico (vol. 13, no. 1, 1993).\n..."
4,https://plato.stanford.edu/entries/madhyamaka/,madhyamaka,1. Issues in the Madhyamaka school There a num...,"[Ames, William L., 1993. Bhāvaviveka’s Prajñāp..."
...,...,...,...,...
1814,https://plato.stanford.edu/entries/critical-th...,critical-thinking,1. History Use of the term ‘critical thinking’...,"[Abrami, Philip C., Robert M. Bernard, Eugene ..."
1815,https://plato.stanford.edu/entries/sylvan-rout...,sylvan-routley,"1. Life Sylvan was born in 1935 in Levin, New ...","[Brady, R. and Routley, R., 1973, “Don't Care ..."
1816,https://plato.stanford.edu/entries/hermeneutics/,hermeneutics,1. Interpretive Experience The topic of this a...,"[Alcoff, Linda Martín, 2006, Visible Identitie..."
1817,https://plato.stanford.edu/board.html,plato.stanford.edu,Content not found,[]


In [341]:
exclude_urls= df[df['bibliography'].apply(lambda x: x==[])]['url']

96        https://plato.stanford.edu/support/friends.html
210        https://plato.stanford.edu/support/donate.html
224                https://plato.stanford.edu/info.html#c
513                  https://plato.stanford.edu/info.html
527                     https://plato.stanford.edu/tools/
573     https://plato.stanford.edu/https://mally.stanf...
607             https://plato.stanford.edu/published.html
799     https://plato.stanford.edu/entries/memory-epis...
801               https://plato.stanford.edu/mirrors.html
869         https://plato.stanford.edu/support/sepia.html
906                  https://plato.stanford.edu/archives/
941              https://plato.stanford.edu/contents.html
1032                https://plato.stanford.edu/about.html
1111    https://plato.stanford.edu/https://plato.stanf...
1129     https://plato.stanford.edu/entries/rights-group/
1149    https://plato.stanford.edu/projected-contents....
1153    https://plato.stanford.edu/special-characters....
1286          