In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
df_list = pd.read_html("/home/viviane/GravLens/Scraping/masterlens/masterlens_41.html")  # list with dfs from tables

In [3]:
# important dfs: 5,6,7,11,12,13,14,16,17,18,19,20,21
for i in df_list:
    print(i.columns.values)

[0]
[0 1 2 3]
[0 1 2 3 4 5]
[0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1 2 3]
['External Links:' 'External Links:.1']
['Added:' 'Added:.1']
[0 1]
[0 1 2 3]
[0 1 2]
['Unnamed: 0' 'Unnamed: 1' 'Unnamed: 2' 'Unnamed: 3']
[0 1 2 3 4 5 6 7]
['SDSS' 'Unnamed: 1' 'Filter' 'Band' 'QSO Magnitude' 'Flux [nmaggie]'
 'Reff [″]' 'axis ratio (AB)' 'PA [° E of N]' 'Unnamed: 9'
 'Program Observations']
['HST' 'Unnamed: 1' 'Filter' 'Band' 'Lens Magnitude' 'Flux [mJy]'
 'Reff [″]' 'axis ratio (AB)' 'PA [° E of N]' 'Source Magnitude'
 'Program Observations']
[0 1]
[0 1 2 3 4 5 6 7 8]


### Defining functions

In [4]:
def pick_table(key):
    for df in df_list:
        list_labels = list(df.columns.values)
        if any(isinstance(i, tuple) for i in list_labels):  # if labels is nested list
            labels = [item for t in list_labels for item in t]  # flatten
        else:
            labels = list_labels

        list_values = df.values.tolist()
        if any(isinstance(i, list) for i in list_values):  # if values is nested list
            values = [item for t in list_values for item in t]  # flatten
        else:
            values = list_values
              
        if key in values or key in labels:
            #df_list.remove(df)
            return df

In [5]:
def drop_nans(df):
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    return None

In [6]:
def split_col(df, old_new_list):  # old_new_list = [[oldname, newname], ...]
    for pair in old_new_list:
        if pair[0] in df.columns:
            df[[pair[1], pair[1] + '_err']] = df[pair[0]].str.split("±", n = 1, expand = True)
            df.drop(pair[0], axis=1, inplace=True)
        else:
            pass
    return None

In [7]:
def get_table_links(table):
    href_list = []
    
    if isinstance(table, list):
        for row in table:
            href = row.find('a')['href']
            href_list.append(href)
    else:
        href_tags = table.find_all("a")
        for tag in href_tags:
            href = tag.get('href')
            href_list.append(href)
    return href_list

In [8]:
def save(df, filename, i=False):
    if len(df) > 0:
        df.to_csv("/home/viviane/GravLens/Scraping/results/" + filename, index = i)
    return None

### System table
### (5, 6, 7, 11, 12, 20)

In [9]:
def system():
    # joining 5,6,7,11
    frames =[pick_table('Discovery'), pick_table('Lens Kind'), pick_table('Lens Grade'), pick_table('Description')]
    result_df = pd.concat(frames)
    result_df.reset_index(drop=True, inplace=True)
    system_df = result_df.drop(columns=0).T
    system_df.columns = list(result_df[0])
    drop_nans(system_df)
    
    # Splitting value ± error columns
    split_col(system_df, [['Einstein_R ["]','Einstein_R'], ['z_Lens', 'z_lens'], ['z_Source(s)', 'z_source'],
                          ['Stellar velocity disp', 'Stellar_v_disp']])
    
    # adding other 2 tables
    date_df = pick_table('Discovery Date:')
    n_df = pick_table('Number of Source Plane Images')
    
    system_df['Discovery Date'] = date_df[3][0]
    system_df['Name'] = date_df[1][0]
    system_df['N Images'] = n_df[1][0]
    
    return system_df

In [10]:
system()

ValueError: Columns must be same length as key

### Coordinates table (13)

In [None]:
def coordinates():
    coords_df = pick_table('Coordinates:').drop(2, axis=1)  # Manual coordinates
    coords_df = coords_df.set_index(0).T
    coords_df['Coordinates:'] = 'Manual'
    return coords_df

In [None]:
coordinates()

### External links


In [None]:
def external():
    external_df = pick_table('External Links:')
    #external_df.columns = [''] * len(external_df.columns)
    external_df = external_df.set_index('External Links:').T
    drop_nans(external_df)
    external_df = external_df.T
    external_df.drop('External Links:.1', axis=1, inplace=True)
    

    tb = soup.find_all('table')[9]
    links = get_table_links(tb)
    
    external_df['links'] = links
    
    return external_df

In [11]:
external()

NameError: name 'external' is not defined

### Flux table (SDSS 16, HST 17)

In [12]:
def sdss():
    df = pick_table('Filter')
    if 'SDSS' in df.columns.values:
        sdss_df = df.set_index('Band')
        sdss_df = sdss_df.drop(columns=['Unnamed: 1'])
    else:
        sdss_df = pd.DataFrame()
    
    return sdss_df

In [13]:
def hst():
    hst_df = pick_table('HST')
    hst_df = hst_df.drop(columns=['Unnamed: 1'])
    hst_df = hst_df.set_index('Band')
    return hst_df

In [14]:
def flux():
    sdss_df = sdss()
    hst_df = hst()

    flux_df = pd.concat([sdss_df, hst_df])

    drop_nans(flux_df)
    split_col(flux_df, [['Lens Magnitude', 'lens_mag'], ['Flux [nmaggie]', 'Flux (nmaggie)'],
                    ['Reff [″]', "ref (arcsec)"], ['axis ratio (AB)', 'axis_ratio (AB)'],
                    ['PA [° E of N]','PA (deg)']])
    return flux_df

In [15]:
flux()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


AttributeError: Can only use .str accessor with string values!

### Redshift table (18, 19)

In [16]:
def redshift():

    z_lens_df = pick_table('Lens Plane Images:')
    if z_lens_df is not None:
        z_lens_df = z_lens_df['Lens Plane Images:']
        #z_lens_df = z_lens_df.set_index('No.')
    else:
        z_lens_df = pd.DataFrame()

    z_source_df = pick_table('Source Plane Images:')
    if z_source_df is not None:
        z_source_df = z_source_df['Source Plane Images:']
        #z_source_df = z_source_df.set_index('No.')
    else:
        z_source_df = pd.DataFrame()

    z_df = pd.concat([z_lens_df, z_source_df], keys=['Lens', 'Source'])
    drop_nans(z_df)

    return z_df

In [17]:
redshift()

Unnamed: 0,Unnamed: 1


### Time delay table (21)

In [18]:
def time():
    time_df = pick_table('Time Delays:')['Time Delays:']
    return time_df

In [19]:
time()

TypeError: 'NoneType' object is not subscriptable

### References table (14)

In [106]:
def refs():
    soup = BeautifulSoup(open("/home/viviane/GravLens/Scraping/masterlens/masterlens_41.html"), "html.parser")
    
    for table in soup.find_all('table'):
    # links = [np.where(tag.has_attr('href'),tag.get('href'),"no link") for tag in tb.find_all('a')]

        links = get_table_links(table)
        #print(links)
        
        if len(links) >= 2 and 'citation' in links[1]:
            papers = links[1::2]
    
    ref_df = pick_table('Discovery\xa0Paper')
    ref_df = ref_df.drop('Unnamed: 0', axis=1)
    
    ref_df['Links'] = papers
    ref_df.columns = ['Author', 'Title', 'Discovery', 'Links']
    return ref_df

In [107]:
refs()

Unnamed: 0,Author,Title,Discovery,Links
0,Courbin et al. 2012 [A&A540(2012)A36],Three quasi-stellar objects acting as strong g...,Discovery Paper,http://admin.masterlens.org/citation.php?refer...


### Main code

In [22]:
system_df = system()
coords_df = coordinates()
flux_df = flux()
z_df = redshift()
ref_df = refs()
external_df = external()

ValueError: Columns must be same length as key

In [23]:
save(system_df, 'system.csv')
save(coords_df, 'coordinates.csv')
save(flux_df, 'flux.csv', i=True)
save(z_df, 'redshift.csv', i=True)
save(ref_df, 'references.csv')
save(external_df, 'external.csv',i=True)

NameError: name 'system_df' is not defined

In [24]:
pd.read_csv('results/external.csv')

Unnamed: 0,External Links:,links
0,SDSS,http://skyserver.sdss3.org/dr9/en/tools/chart/...
1,ADS,http://adsabs.harvard.edu/cgi-bin/nph-abs_conn...
2,NED,http://ned.ipac.caltech.edu/cgi-bin/objsearch?...
