In [618]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [619]:
# Parse the tables of the saved page; later cells pick tables out of this list by position.
df_list = pd.read_html("masterlens.html")  # list of DataFrames built from the page's tables

# Exploration aid (disabled): uncomment to print the column labels of every parsed table.
#for df in df_list:
#    print(df.columns.values)

In [620]:
# Tables used below (positions in df_list): 5, 6, 7, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21

### Defining functions

In [621]:
def drop_nans(df):
    """Strip fully-empty rows, then fully-empty columns, from ``df`` in place.

    A row or column is removed only when *every* value in it is NaN.
    The frame passed in is modified directly and nothing is returned.
    """
    # axis 0 = rows, axis 1 = columns; rows are handled first.
    for axis in (0, 1):
        df.dropna(axis=axis, how='all', inplace=True)

In [622]:
def split_col(df, old_new_list):  # old_new_list = [[oldname, newname], ...]
    """Split "value ± error" text columns of ``df`` into value and error columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Each source column must hold strings (missing
        values are allowed).
    old_new_list : list of [old_name, new_name]
        For every pair, ``df[old_name]`` is split at the first "±" into
        ``df[new_name]`` (text before the sign) and ``df[new_name + '_err']``
        (text after it); ``old_name`` is then dropped.

    Returns
    -------
    None
        ``df`` is mutated directly.

    Notes
    -----
    * Surrounding whitespace is removed from both parts, so "1.16 ± 0.02"
      yields "1.16" and "0.02" rather than "1.16 " and " 0.02".
    * Cells without a "±" get a missing value in the ``_err`` column. This also
      covers a column in which *no* cell contains "±", which previously raised
      ``ValueError`` because the split produced a single column.
    * Values stay strings; no numeric conversion is performed.
    """
    def _strip(cell):
        # Leave missing entries (NaN / None) untouched.
        return cell.strip() if isinstance(cell, str) else cell

    for old_name, new_name in old_new_list:
        pieces = df[old_name].str.split("±", n=1, expand=True)
        # Guarantee both a value (0) and an error (1) column, even when the
        # separator never occurs and pandas returned fewer columns.
        pieces = pieces.reindex(columns=[0, 1])
        df[new_name] = pieces[0].map(_strip)
        df[new_name + '_err'] = pieces[1].map(_strip)
        df.drop(old_name, axis=1, inplace=True)
    return None

In [623]:
def get_table_links(table):
    """Collect the ``href`` values of the links found in ``table``.

    Parameters
    ----------
    table : list or tag-like object
        * a ``list`` of row elements: the first ``<a>`` of each row is used,
          giving exactly one entry per row;
        * anything else: every ``<a>`` returned by ``table.find_all("a")`` is
          used, giving one entry per anchor.

    Returns
    -------
    list
        The ``href`` of each selected anchor, in document order. An entry is
        ``None`` when the anchor has no ``href`` attribute or, in the list
        form, when the row contains no ``<a>`` at all. Keeping a placeholder
        instead of raising or skipping preserves the positional alignment
        that callers slicing the result rely on.
    """
    if isinstance(table, list):
        # One anchor (or None) per row.
        anchors = [row.find('a') for row in table]
    else:
        anchors = table.find_all("a")

    # .get() instead of ['href'] so a missing attribute yields None in both branches.
    return [None if anchor is None else anchor.get('href') for anchor in anchors]

### System table
### (5, 6, 7, 11, 12, 20)

In [624]:
# Stack tables 5, 6, 7 and 11 on top of each other with a fresh 0..n-1 row index.
frames = [df_list[idx] for idx in (5, 6, 7, 11)]
result_df = pd.concat(frames, ignore_index=True)

# Column 0 of the stacked frame supplies the field names; the remaining
# column(s) are transposed so that each field becomes a column of system_df.
system_df = result_df.drop(columns=0).T
system_df.columns = result_df[0].tolist()
system_df

Unnamed: 0,Discovery,Lens Kind,Lens Grade,"Einstein_R [""]",Einstein_R quality,z_Lens,z_Lens quality,z_Source(s),z_Source quality,Stellar velocity disp,Description
1,SLACS,GAL-GAL,A,1.16 ± 0.02,SIE model,0.4400 ± 0.0001,spectroscopic,1.192 ± 0.001,spectroscopic,232 ± 39,Luminous red galaxy lensing three background o...


In [625]:
# Break each "value ± error" column into <new name> and <new name>_err.
# Pairs are [column as scraped, new base name].
split_col(
    system_df,
    [
        ['Einstein_R ["]', 'Einstein_R'],
        ['z_Lens', 'z_lens'],
        ['z_Source(s)', 'z_source'],
        ['Stellar velocity disp', 'Stellar_v_disp'],
    ],
)

In [626]:
# Pull single cells out of tables 12 and 20 (row 0) and attach them to
# system_df as constant columns.
system_df['Discovery Date'] = df_list[12].loc[0, 3]
system_df['Name'] = df_list[12].loc[0, 1]
system_df['N Images'] = df_list[20].loc[0, 1]
system_df

Unnamed: 0,Discovery,Lens Kind,Lens Grade,Einstein_R quality,z_Lens quality,z_Source quality,Description,Einstein_R,Einstein_R_err,z_lens,z_lens_err,z_source,z_source_err,Stellar_v_disp,Stellar_v_disp_err,Discovery Date,Name,N Images
1,SLACS,GAL-GAL,A,SIE model,spectroscopic,spectroscopic,Luminous red galaxy lensing three background o...,1.16,0.02,0.44,0.0001,1.192,0.001,232,39,2008-08-01,SDSS J0008-0004,3


### Coordinates table (13)

In [627]:
# Table 13: discard column 2, use column 0 as the field names and transpose
# so that every field becomes a column.
coords_df = (
    df_list[13]
    .drop(columns=2)
    .set_index(0)
    .T
)

# Manual coordinates
coords_df['Coordinates:'] = 'Manual'
coords_df

Unnamed: 0,Coordinates:,RA Hrs,RA Mins,RA Secs,RA [°],Dec Degrees,Dec Arcmin,Dec Arcsec,Dec [°]
1,Manual,0,8,2.96,2.01231,0,4,8.26,-0.06896


### Flux table (SDSS 16, HST 17)

In [628]:
# SDSS photometry (table 16): index by band and discard the 'Unnamed: 1' column.
sdss_df = df_list[16].set_index('Band').drop(columns=['Unnamed: 1'])

In [629]:
# HST photometry (table 17): discard the 'Unnamed: 1' column, then index by band.
hst_df = df_list[17].drop(columns=['Unnamed: 1']).set_index('Band')

In [630]:
# Stack the SDSS and HST photometry rows. The two frames do not share the same
# columns, so pandas has to align them; sort=True requests the alphabetical
# column order explicitly. That is what the implicit default produced here
# (with a FutureWarning), and it keeps the layout stable on pandas versions
# where the default no longer sorts.
flux_df = pd.concat([sdss_df, hst_df], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [631]:
# Remove rows/columns that are entirely NaN before splitting.
drop_nans(flux_df)

# Break each "value ± error" column into <new name> and <new name>_err.
# Pairs are [column as scraped, new base name].
split_col(
    flux_df,
    [
        ['Lens Magnitude', 'lens_mag'],
        ['Flux [nmaggie]', 'Flux (nmaggie)'],
        ['Reff [″]', "ref (arcsec)"],
        ['axis ratio (AB)', 'axis_ratio (AB)'],
        ['PA [° E of N]', 'PA (deg)'],
    ],
)
flux_df

Unnamed: 0_level_0,Filter,HST,Program Observations,SDSS,lens_mag,lens_mag_err,Flux (nmaggie),Flux (nmaggie)_err,ref (arcsec),ref (arcsec)_err,axis_ratio (AB),axis_ratio (AB)_err,PA (deg),PA (deg)_err
Band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
u,Sloan u,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),22.37,0.53,1.11,0.56,5.29,4.33,0.05,0.0,40.44,0.01
g,Sloan g,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),20.9,0.06,4.35,0.25,1.52,0.3,0.81,0.23,171.15,0.01
r,Sloan r,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),19.58,0.03,14.78,0.44,1.0,0.1,0.88,0.12,17.35,0.01
i,Sloan i,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),18.75,0.02,31.72,0.65,0.99,0.06,0.91,0.08,51.95,0.01
z,Sloan z,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),18.41,0.06,43.3,2.51,1.15,0.18,0.65,0.18,70.26,0.01
I,F814W,ACS/WFC (WFC1),HST Program 10886 — PI: Bolton,,18.65,0.03,,,1.71,0.06,0.83,0.01,27.3,0.1
V,F606W,WFPC2 (WF3),HST Program 10886 — PI: Bolton,,,,,,,,,,,


### Redshift table (18, 19)

In [632]:
# Lens-plane images (table 18): take the 'Lens Plane Images:' part and index it by image number.
df18 = df_list[18]['Lens Plane Images:'].set_index('No.')

In [633]:
# Source-plane images (table 19): take the 'Source Plane Images:' part and index it by image number.
df19 = df_list[19]['Source Plane Images:'].set_index('No.')

In [634]:
# Stack the lens-plane and source-plane image tables; `keys` adds an outer
# index level ('Lens' / 'Source') recording which table each row came from.
# sort=True states the column ordering explicitly: it matches what the implicit
# default did here (while emitting a FutureWarning) and stays the same on
# pandas versions whose default no longer sorts.
z_df = pd.concat([df18, df19], keys=['Lens', 'Source'], sort=True)

# Discard rows/columns that are entirely NaN.
drop_nans(z_df)
z_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,z,z_err
Unnamed: 0_level_1,No.,Unnamed: 2_level_1,Unnamed: 3_level_1
Lens,1,0.44,0.0001
Source,1,1.192,0.001
Source,2,1.192,0.001
Source,3,1.192,0.001


### Time delay table (21)

In [635]:
# Select the 'Time Delays:' part of table 21 and display it.
# NOTE(review): unlike the other frames, time_df is not written out by the
# "Saving csv files" cell -- confirm that this is intended.
time_df = df_list[21]['Time Delays:']
time_df

Unnamed: 0,Image -- Image,Time Delay (days)
0,(1) -- (2),
1,(1) -- (3),
2,(2) -- (3),


### References table (14)

In [636]:
# Re-parse the page with BeautifulSoup to collect the link targets of table 14.
# The context manager closes the file handle once parsing is done (it was
# previously left open). numpy is already imported in the first cell, so the
# repeated import that used to sit here is not needed.
with open("masterlens.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
tb = soup.find_all('table')[14]

links = get_table_links(tb)
# Keep every second link, starting with the second one.
# NOTE(review): presumably the links alternate and the odd positions are the
# paper/citation URLs -- confirm against the page markup.
papers = links[1::2]

In [637]:
# References (table 14): discard the 'Unnamed: 0' column, append the paper
# links as a last column, then give all four columns readable names.
ref_df = df_list[14].drop(columns='Unnamed: 0').assign(Links=papers)
ref_df.columns = ['Author', 'Title', 'Discovery', 'Links']
ref_df

Unnamed: 0,Author,Title,Discovery,Links
0,Cardone et al. 2011 [MNRAS416(2011)1822],Secondary infall model and dark matter scaling...,,http://admin.masterlens.org/citation.php?refer...
1,Auger et al. 2009 [ApJ705(2009)1099],"The Sloan Lens ACS Survey. IX. Colors, Lensing...",,http://admin.masterlens.org/citation.php?refer...
2,Newton et al. 2009 [ApJ696(2009)1125],Enhanced Lensing Rate by Clustering of Massive...,,http://admin.masterlens.org/citation.php?refer...
3,Grillo et al. 2009 [A&A501(2009)461],Photometric mass and mass decomposition in ear...,,http://admin.masterlens.org/citation.php?refer...
4,Treu et al. 2009 [ApJ690(2009)670],The SLACS Survey. VIII. The Relation between E...,,http://admin.masterlens.org/citation.php?refer...
5,Bolton et al. 2008 [ApJ682(2008)964],The Sloan Lens ACS Survey. V. The Full ACS Str...,Discovery Paper,http://admin.masterlens.org/citation.php?refer...


### Saving csv files

In [638]:
# (frame, output file, write the index?)
# flux_df and z_df are written with their index; the other frames are written without it.
csv_exports = (
    (system_df, "system.csv", False),
    (coords_df, "coordinates.csv", False),
    (flux_df, "flux.csv", True),
    (z_df, "redshift.csv", True),
    (ref_df, "references.csv", False),
)
for table, csv_path, keep_index in csv_exports:
    table.to_csv(csv_path, index=keep_index)

In [639]:
# Round-trip check: reload flux.csv, using its first column as the index, and display it.
s = pd.read_csv('flux.csv', index_col=0)
s

Unnamed: 0_level_0,Filter,HST,Program Observations,SDSS,lens_mag,lens_mag_err,Flux (nmaggie),Flux (nmaggie)_err,ref (arcsec),ref (arcsec)_err,axis_ratio (AB),axis_ratio (AB)_err,PA (deg),PA (deg)_err
Band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
u,Sloan u,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),22.37,0.53,1.11,0.56,5.29,4.33,0.05,0.0,40.44,0.01
g,Sloan g,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),20.9,0.06,4.35,0.25,1.52,0.3,0.81,0.23,171.15,0.01
r,Sloan r,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),19.58,0.03,14.78,0.44,1.0,0.1,0.88,0.12,17.35,0.01
i,Sloan i,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),18.75,0.02,31.72,0.65,0.99,0.06,0.91,0.08,51.95,0.01
z,Sloan z,,Has Sloan ugriz photometrySDSS Spectrum: 669-5...,SDSS (DR8),18.41,0.06,43.3,2.51,1.15,0.18,0.65,0.18,70.26,0.01
I,F814W,ACS/WFC (WFC1),HST Program 10886 — PI: Bolton,,18.65,0.03,,,1.71,0.06,0.83,0.01,27.3,0.1
V,F606W,WFPC2 (WF3),HST Program 10886 — PI: Bolton,,,,,,,,,,,
