In [1]:
from bs4 import BeautifulSoup
import requests
import shutil
import re
from collections import defaultdict
import pandas as pd

### Defining functions

In [2]:
def retrive_data(driver):
    """Parse the HTML held by ``driver`` into a BeautifulSoup tree.

    Parameters
    ----------
    driver : object
        Anything exposing a ``page_source`` attribute with HTML text
        (presumably a Selenium WebDriver -- confirm against the caller).

    Returns
    -------
    BeautifulSoup
        The document parsed with the ``lxml`` parser.
    """
    html = driver.page_source
    parsed = BeautifulSoup(html, 'lxml')
    return parsed

In [3]:
def get_links(soup, url=""):
    """Collect the ``href`` of anchors found in ``soup``, prefixed with ``url``.

    Parameters
    ----------
    soup : list or tag-like object
        Either a list of rows (the first ``<a>`` of each row is used) or an
        object with ``find_all`` (every ``<a>`` it contains is used).
    url : str, optional
        Prefix concatenated in front of every href (plain string
        concatenation, no URL normalisation).

    Returns
    -------
    list of str
        One entry per anchor that actually carries an ``href``. Rows without
        an anchor and anchors without an ``href`` are skipped instead of
        raising ``TypeError``/``KeyError``.
    """
    if isinstance(soup, list):
        # Only the first anchor of each row is considered.
        anchors = [row.find('a') for row in soup]
    else:
        anchors = soup.find_all("a")

    href_list = []
    for anchor in anchors:
        # find() gives None when a row has no <a>; get() gives None when the
        # anchor has no href (e.g. <a class="header"></a>).
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            continue
        href_list.append(url + href)
    return href_list

In [4]:
def get_image(soup, url, dir_path):
    """Download every image referenced in ``soup`` into ``dir_path``.

    Parameters
    ----------
    soup : list or tag-like object
        Either a list of rows (the first ``<img>`` of each row is used) or an
        object with ``find_all`` (every ``<img>`` it contains is used).
    url : str
        Prefix concatenated in front of each image ``src`` to build the
        download link.
    dir_path : str
        Prefix concatenated in front of the file name; it must already end
        with a path separator because no separator is inserted.

    Returns
    -------
    int
        Always ``0``. Failed downloads are reported on stdout, not raised.
    """
    # Find all image tags in the HTML.
    if isinstance(soup, list):
        img_tags = [row.find('img') for row in soup]
    else:
        img_tags = soup.find_all("img")

    image_list = []
    for tag in img_tags:
        # Rows without an <img> and images without a usable src cannot be
        # downloaded; skip them instead of raising TypeError/KeyError.
        src = tag.get('src') if tag is not None else None
        if src:
            image_list.append(src)

    # Iterate over each image source and download it.
    for image in image_list:
        link = url + image
        filename = image.split('/')[-1]

        # The context manager releases the streamed connection even when the
        # copy fails or the status is not 200.
        with requests.get(link, stream=True) as r:
            if r.status_code == 200:
                # Let urllib3 undo gzip/deflate before writing the raw bytes.
                r.raw.decode_content = True
                with open(dir_path + filename, "wb") as f:
                    shutil.copyfileobj(r.raw, f)
                    print("[*] Downloaded Image: %s" % filename)
            else:
                print("[*] ERROR - Download Image: %s" % link)
    return 0

In [45]:
def get_tables(soup, text=False):
    """Convert every ``<table>`` that has headers into a column dictionary.

    Parameters
    ----------
    soup : tag-like object
        Parsed document exposing ``find_all``.
    text : bool, optional
        When True the cell text (``get_text()``) is stored; otherwise the
        ``<td>`` tag objects themselves are stored.

    Returns
    -------
    list of defaultdict(list)
        One mapping ``header text -> list of cell values`` per table that
        contains at least one ``<th>``. Tables without headers are skipped.
        Cells are matched to headers by position; cells beyond the last
        header are ignored rather than raising ``IndexError``.
    """
    table_dict_list = []

    for table in soup.find_all('table'):
        headers = [header.get_text() for header in table.find_all('th')]
        if not headers:
            # Nothing to key the columns on (e.g. layout-only tables).
            continue

        table_dict = defaultdict(list)
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if text:
                cells = [cell.get_text() for cell in cells]

            # zip stops at the shorter sequence, so a row with more cells
            # than headers no longer indexes past the end of `headers`.
            for column_name, data in zip(headers, cells):
                table_dict[column_name].append(data)

        table_dict_list.append(table_dict)

    return table_dict_list

In [6]:
def get_rows_names(soup):
    """Return the ``.string`` of each cell in ``soup``.

    Parameters
    ----------
    soup : list or tag-like object
        A list of elements is used as-is; anything else is searched for
        ``<td>`` elements via ``find_all``.

    Returns
    -------
    list
        The ``.string`` attribute of every element, in order.
    """
    cells = soup if isinstance(soup, list) else soup.find_all("td")
    return [cell.string for cell in cells]

In [7]:
def filter_column(table, column_name):
    """Look up ``column_name`` in ``table``.

    Returns ``table[column_name]`` when the name is present in ``table``,
    otherwise ``None``.
    """
    return table[column_name] if column_name in table else None

In [8]:
def create_dataframe(table):
    """Build a pandas DataFrame from a column-oriented mapping.

    ``table`` is handed unchanged to ``pd.DataFrame.from_dict``, so its keys
    become the column labels.
    """
    frame = pd.DataFrame.from_dict(table)
    return frame

In [9]:
def select_column(df, column_names):
    """Return the sub-frame ``df[column_names]``.

    Gives ``None`` as soon as one requested name is not found in ``df``.
    """
    for name in column_names:
        if name not in df:
            return None
    return df[column_names]

In [10]:
def rename_column(df, column_names, new_column_names):
    """Rename columns of ``df`` pairwise and return the renamed copy.

    ``column_names[i]`` is renamed to ``new_column_names[i]``. ``None`` is
    returned when the two sequences differ in length or when any of
    ``column_names`` is not found in ``df``.
    """
    if len(column_names) != len(new_column_names):
        return None

    if not all(name in df for name in column_names):
        return None

    # Pair old and new labels by position.
    mapping = dict(zip(column_names, new_column_names))
    return df.rename(columns=mapping)

### Main code

In [14]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. Consider a path relative to a configurable data directory.
filename = "/home/viviane/GravLens/Scraping/masterlens/masterlens_1.html"

# Parse the saved page. The context manager closes the file handle once
# BeautifulSoup has read it (a bare open() left it open until garbage collected).
with open(filename) as html_file:
    soup = BeautifulSoup(html_file, "lxml")

In [17]:
# Display the first <div> of the parsed document for a quick look at the page structure.
soup.div

<div id="lensWrapper"><!-- begin header -->
<div id="header"><a class="header"></a><table border="0" cellpadding="0" cellspacing="0"><tbody><tr height="50px"><td></td></tr><tr><td><a href="http://admin.masterlens.org/">The Master Lens Database</a></td></tr><tr><td><a href="http://admin.masterlens.org/">and The Orphan Lenses Project</a></td></tr></tbody></table></div>
<!-- end header --><!-- begin innerWrapper --><div id="wideWrapper"><!-- begin content -->
<div class="column" id="content" style="height: 1917px;">
<div>
<!-- begin main -->
<div id="main">
<form action="./export.php?" method="POST" name="lens" target="_blank"><input name="inputaction" type="hidden" value="Search"/><input name="mode" type="hidden" value=""/><input name="special" type="hidden" value=""/><input name="searchoption" type="hidden" value="basic"/><input name="do" type="hidden" value=""/><input name="hrefaction" type="hidden" value=""/><input name="status" type="hidden" value=""/><input name="substatus" type="hi

In [46]:
# Extract every headed table of the page, keeping only the text of each cell.
tables = get_tables(soup, text=True)

In [50]:
# Turn one extracted table into a DataFrame and display it.
# NOTE(review): index 10 is hard-coded and tied to this page's layout;
# presumably it is the publications/references table -- confirm.
table_df = create_dataframe(tables[10])
table_df

Unnamed: 0,63 Lenses,90 Lenses,49 Lenses
0,Cardone et al. 2011 [MNRAS416(2011)1822],Secondary infall model and dark matter scaling...,
1,Auger et al. 2009 [ApJ705(2009)1099],"The Sloan Lens ACS Survey. IX. Colors, Lensing...",
2,Newton et al. 2009 [ApJ696(2009)1125],Enhanced Lensing Rate by Clustering of Massive...,
3,Grillo et al. 2009 [A&A501(2009)461],Photometric mass and mass decomposition in ear...,
4,Treu et al. 2009 [ApJ690(2009)670],The SLACS Survey. VIII. The Relation between E...,
5,Bolton et al. 2008 [ApJ682(2008)964],The Sloan Lens ACS Survey. V. The Full ACS Str...,Discovery Paper


In [63]:
# Grab the first <table> of the document and print its indented markup.
table = soup.find('table')
print(table.prettify())


<table border="0" cellpadding="0" cellspacing="0">
 <tbody>
  <tr height="50px">
   <td>
   </td>
  </tr>
  <tr>
   <td>
    <a href="http://admin.masterlens.org/">
     The Master Lens Database
    </a>
   </td>
  </tr>
  <tr>
   <td>
    <a href="http://admin.masterlens.org/">
     and The Orphan Lenses Project
    </a>
   </td>
  </tr>
 </tbody>
</table>
