In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Base URL from which the web scraping script will crawl. It lists the animal cell lines sorted alphanumerically.
url = "http://www.lgcstandards-atcc.org/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/"

In [2]:
def parse_overview(url, timeout=30):
    """Fetch the overview page and return the ``href`` of every ``h3 a`` link.

    First level of crawling: on the overview page, the links ("a") to the
    individual cell lines are nested inside "h3" tags.

    Args:
        url: Address of the overview page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list: One entry per matching anchor, in document order. An anchor
        without an ``href`` attribute contributes ``None``; entries are not
        filtered here because callers slice this list by position.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were the overview.
    response.raise_for_status()
    page = BeautifulSoup(response.text, "lxml")
    return [anchor.get("href") for anchor in page.select("h3 a")]

In [3]:
# Preview the links found on the overview page.
parse_overview(url)

[None,
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6468.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6469.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-2037.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-2652.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-2817.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-1666.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-3422.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-2649.aspx',
 '/en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-2650.aspx']

In [4]:
# The first entry contains an invalid value (None), so it is skipped.
# An open-ended slice keeps everything after it without downloading the
# overview page a second time just to compute the list length.
links = parse_overview(url)[1:]

In [10]:
def parse_cell_line(req):
    """Extract the features of one cell line from its downloaded product page.

    Second level of crawling: once the links leading to each cell line are
    known, the content of each linked page is scraped. Most of the features
    are contained in the first element of the page with the class "fulllist".
    The ATCC code is not in that table; it appears as the page title ("h1")
    with a fixed id.

    Args:
        req: Response for the product page; only its ``text`` attribute is read.

    Returns:
        dict: ``"ATCC code"`` mapped to the stripped title text, followed by
        one entry per table row mapping the stripped text of the row's first
        ``th`` to the stripped text of its first ``td``. Rows lacking either
        cell are skipped.

    Raises:
        IndexError: If the page has no ``.fulllist`` element or no title heading.
    """
    page = BeautifulSoup(req.text, "lxml")
    # select() returns a list; index 0 picks the first (for the title, the only) match.
    table = page.select(".fulllist")[0]
    heading = page.select("h1#layoutcontent_2_middlecontent_0_productdetailcontent_0_maincontent_0_heading")[0]
    cell_line = {"ATCC code": heading.get_text().strip()}
    for row in table.select("tr"):
        labels = row.select("th")
        values = row.select("td")
        if not labels or not values:
            # A row without a label or without a value cannot form a
            # label/value pair; skip it instead of aborting the whole page.
            continue
        cell_line[labels[0].get_text().strip()] = values[0].get_text().strip()
    return cell_line

In [11]:
def scrape_page(links, base_url="http://www.lgcstandards-atcc.org/", timeout=30):
    """Download every linked cell line page and collect the parsed records.

    The real scraping in action: each link is resolved against ``base_url``,
    fetched, and handed to ``parse_cell_line``; all results are saved into
    one list.

    Args:
        links: Iterable of hrefs as returned by ``parse_overview``. Empty or
            ``None`` entries are skipped.
        base_url: Site root that the hrefs are resolved against.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list: One dictionary per fetched page, in the order of ``links``.

    Raises:
        requests.HTTPError: If a page request answers with an error status.
    """
    url = "http://www.lgcstandards-atcc.org/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/"
    print(f"Scraping overview page {url}")
    cell_lines = []

    for link in links:
        if not link:
            # Anchors without an href come through as None; nothing to fetch.
            continue
        print(f"🖼 Parsing {link}")
        # urljoin avoids the "//" that plain concatenation of the site root
        # and a root-relative href ("/en/...") would produce.
        req = requests.get(urljoin(base_url, link), timeout=timeout)
        req.raise_for_status()
        cell_line = parse_cell_line(req)
        print(cell_line)
        cell_lines.append(cell_line)

    return cell_lines


In [12]:
# NOTE(review): this call performs the whole scrape and only displays the result; the returned list is not stored in a variable here.
scrape_page(links)

Scraping overview page http://www.lgcstandards-atcc.org/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/
🖼 Parsing /en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6468.aspx
{'ATCC code': '+/+ MGT  (ATCC® CRL-6468™)', 'Organism': 'Mus musculus, mouse', 'Tissue': 'mammary gland', 'Product Format': 'frozen', 'Culture Properties': 'adherent', 'Biosafety Level': '1 \r\n\nBiosafety classification is based on U.S. Public Health Service Guidelines, it is the responsibility of the customer to ensure that their facilities comply with biosafety regulations for their own country.', 'Age': 'adult', 'Gender': 'female', 'Strain': 'HRS/J'}
🖼 Parsing /en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6469.aspx
{'ATCC code': '+/+ SCT  (ATCC® CRL-6469™)', 'Organism': 'Mus musculus, mouse', 'Tissue': 'connective and soft tissue', 'Product Format': 'frozen', 'Culture Properties': 'adherent', 'Biosafety Level

[{'ATCC code': '+/+ MGT  (ATCC® CRL-6468™)',
  'Organism': 'Mus musculus, mouse',
  'Tissue': 'mammary gland',
  'Product Format': 'frozen',
  'Culture Properties': 'adherent',
  'Biosafety Level': '1 \r\n\nBiosafety classification is based on U.S. Public Health Service Guidelines, it is the responsibility of the customer to ensure that their facilities comply with biosafety regulations for their own country.',
  'Age': 'adult',
  'Gender': 'female',
  'Strain': 'HRS/J'},
 {'ATCC code': '+/+ SCT  (ATCC® CRL-6469™)',
  'Organism': 'Mus musculus, mouse',
  'Tissue': 'connective and soft tissue',
  'Product Format': 'frozen',
  'Culture Properties': 'adherent',
  'Biosafety Level': '1 \r\n\nBiosafety classification is based on U.S. Public Health Service Guidelines, it is the responsibility of the customer to ensure that their facilities comply with biosafety regulations for their own country.',
  'Disease': 'cancer',
  'Age': 'newborn',
  'Strain': 'HRS/J',
  'Applications': 'This cell li

In [13]:
# Run the scrape and keep the returned list of dictionaries in cell_lines.
cell_lines = scrape_page(links)

Scraping overview page http://www.lgcstandards-atcc.org/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/
🖼 Parsing /en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6468.aspx
{'ATCC code': '+/+ MGT  (ATCC® CRL-6468™)', 'Organism': 'Mus musculus, mouse', 'Tissue': 'mammary gland', 'Product Format': 'frozen', 'Culture Properties': 'adherent', 'Biosafety Level': '1 \r\n\nBiosafety classification is based on U.S. Public Health Service Guidelines, it is the responsibility of the customer to ensure that their facilities comply with biosafety regulations for their own country.', 'Age': 'adult', 'Gender': 'female', 'Strain': 'HRS/J'}
🖼 Parsing /en/Products/Collections/Cell_Biology_Collections/Cell_Lines/Animal/Alphanumeric/CRL-6469.aspx
{'ATCC code': '+/+ SCT  (ATCC® CRL-6469™)', 'Organism': 'Mus musculus, mouse', 'Tissue': 'connective and soft tissue', 'Product Format': 'frozen', 'Culture Properties': 'adherent', 'Biosafety Level

In [14]:
# The results are saved into a DataFrame object and then written out as a CSV file.
# pandas (pd) is imported in the notebook's first cell with the other dependencies.
df = pd.DataFrame(cell_lines)
df.to_csv("ATCC_cell_lines.csv")