## Integração de dados

#### Masterlens, CASTLES

In [189]:
import pandas as pd
from astropy.coordinates import Angle
import sqlite3
import os

In [190]:
# dec: -27:52:25.7 ---> -27.873806 deg
def sexdeg_to_deg(coord):
    result = Angle(coord + " degrees").degree
    return result #*3600


In [191]:
# ra: 00h49m41.89s ---> 12.424542 deg
def hour_to_deg(coord):  # string coord
    result = Angle(coord).degree
    return result #*3600

In [192]:
# ra: 00:49:41.89 ---> 00h49m41.89s
def hour(coord):
    result = coord.replace(":", 'h', 1)
    result = result.replace(":", 'm', 1)
    result += 's'
    return result

In [193]:
def name_clean(element):
    name = element.replace('+', '-')
    name = name.replace('.', '-')
    name = name.split('-')[0]
    
    if 'PMNJ' in name:
        name = name.replace('PMNJ', 'PMN')
        
    elif name == 'QJ0158':
        name = 'CTQ414'
        
    elif name == 'LBQS1009':
        name = name.replace('LBQS', 'Q')
    
    return name

In [199]:
def castles_df():
        # importing castles data  -  100 lenses
    castles = pd.read_csv('Scraping/castles.csv')

    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour_to_deg)
    castles['Dec (J2000)'] = castles['Dec (J2000)'].apply(sexdeg_to_deg)
    castles['id'] = castles.index + 1 # from 1 to 100

    return castles

In [202]:
def castles_imgs():
    
    castles_imgs = os.listdir('/home/viviane/GravLens/Database/Scraping/castles_imgs')
    names = []
    paths =[]
    for i in castles_imgs:
        name = i.rstrip('.gif')
        name = name.rstrip('H')
        name = name.rstrip('Hcc')
        name = name.rstrip('I')
        name = name.rstrip('Icc')
        name = name.rstrip('V')
        name = name.rstrip('Vcc')
        name = name.rstrip('K')
        name = name.rstrip('Kcc')
        name = name.rstrip('J')
        name = name.rstrip('Jcc')
        name = name.rstrip('R')
        name = name.rstrip('Rcc')
        
        names.append(name)
        path = i
        paths.append(path)

    images = {'name' : names, 'img_path' : paths }
    imgs_df = pd.DataFrame(images)
    
    # name - list of images
    imgs_df = imgs_df.groupby('name')['img_path'].apply(list)
    
    # castles df saved before in csv
    castles = castles_df()
    castles['name'] = castles['Lens Name'].apply(name_clean)
    castles.set_index(castles['name'], inplace=True)
    castles.drop('name', axis=1, inplace=True)
    
    # joining imgs df with system df
    result = pd.concat([castles, imgs_df], axis=1, join='outer')
    result.reset_index(drop=True, inplace=True)
        
    return result

In [37]:
def masterlens_df():
        # importing masterlens data  -  50 lenses
    dfs = []
    for i in range(1,51):
        
        # getting data from system file
        ml = pd.read_csv('Scraping/masterlens_results/system_{}.csv'.format(i))

        # selecting important columns
        cols=[]
        for column in ['Name', 'Alternate Names', 'Discovery', 'Discovery Date', 'Lens Kind', 'Lens Grade',
                       'Description', 'N Images', 'Einstein_R', 'Einstein_R quality',
                       'Stellar_v_disp', 'Stellar_v_disp_err']:
            if column in list(ml.columns):
                cols.append(column)
        ml = ml[cols]

        # getting coords from coords file
        coords = pd.read_csv('Scraping/masterlens_results/coordinates_{}.csv'.format(i))
        ml['Ra(deg)_ml'] = coords['RA [°]']
        ml['Dec(deg)_ml'] = coords['Dec [°]']

        dfs.append(ml)
        
    all_ml = pd.concat(dfs, sort=False)
    all_ml['id'] = list(range(100, 150))  # from 100 to 149
    all_ml = all_ml.set_index(all_ml['id'])
        
    return all_ml

In [213]:
# creating dataframe with ml images and their system_id
# saving in csv file
def ml_imgs():
    
    imgs_folder = os.listdir('/home/viviane/GravLens/Database/Scraping/masterlens_imgs')

    imgs=[]
    system_ids=[]

    for folder in imgs_folder:
        system_folder = '/home/viviane/GravLens/Database/Scraping/masterlens_imgs/' + folder + '/'
        imgList = os.listdir(system_folder)
        
        id = folder.strip(('masterlens_'))
        id = id.strip('_arquivo')
        
        for img in imgList:
            
            img_path = system_folder + img
            imgs.append(img_path)
            system_ids.append(id)
            
    images = {'system_id_ml' : system_ids,
             'path' : imgs}

    images_df = pd.DataFrame(images)
    images_df['image_id'] = images_df.index
    images_df.to_csv('ml_images', index=False)

    return images_df

In [214]:
ml_imgs()['path'][1]

'/home/viviane/GravLens/Database/Scraping/masterlens_imgs/masterlens_27_arquivos/MG04140534_WFPC2_greyimage_F555W_ZOOM6.png'

## Lens as a whole
### Merge  - Name

In [15]:
# system table ---------->  143 lenses
def system():  # lens as a whole
    
    # importing castles data  -  100 lenses
    castles = pd.read_csv('Scraping/castles.csv')
    castles.drop(['zs', 'zl', 'ms (mag)', 'ml (mag)', 'dt (days)'], axis=1, inplace=True)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour_to_deg)
    castles['Dec (J2000)'] = castles['Dec (J2000)'].apply(sexdeg_to_deg)
    castles.columns=['Name', 'RA(deg)_c', 'Dec(deg)_c', 'E(B-V)_c', 'FGHz (mJy)_c', 'Nimages_c',
                     'size(arccsec)_c', 'sigma (km/s)_c']
    #castles = castles.set_index('Name')
    castles['id_c'] = castles.index
    
    
    # importing masterlens data  -  50 lenses
    dfs = []
    for i in range(1,51):
        
        # getting data from system file
        ml = pd.read_csv('Scraping/masterlens_results/system_{}.csv'.format(i))

        # selecting important columns
        cols=[]
        for column in ['Name', 'Alternate Names', 'Discovery', 'Discovery Date', 'Lens Kind', 'Lens Grade',
                       'Description', 'N Images', 'Einstein_R', 'Einstein_R quality',
                       'Stellar_v_disp', 'Stellar_v_disp_err']:
            if column in list(ml.columns):
                cols.append(column)
        ml = ml[cols]

        # getting coords from coords file
        coords = pd.read_csv('Scraping/masterlens_results/coordinates_{}.csv'.format(i))
        ml['Ra(deg)_ml'] = coords['RA [°]']
        ml['Dec(deg)_ml'] = coords['Dec [°]']

        dfs.append(ml)
     
    # joining all masterlens system + coords dfs
    all_ml = pd.concat(dfs, sort=False)
    #all_ml = all_ml.set_index('Name')
    #all_ml['id_ml'] = range(100, 150)
    
    
    concat = pd.concat([castles, all_ml], sort=False)
    concat.reset_index(inplace=True, drop=True)
    concat.to_csv('concat.csv')
    # joining castles + ml data
    result = pd.merge(castles, all_ml, how ='outer', on='Name')  


    return result
        


### Merge - NACluster

In [44]:
# Preparing to run NACluster
def system_cluster():  # lens as a whole
    
    # importing castles data  -  100 lenses
    castles = castles_df()
    castles = castles[['RA (J2000)', 'Dec (J2000)' ]]
    castles.columns = ['ra', 'dec']
    
    castles['idCatalog'] = 1
   
    
    # importing masterlens data  -  50 lenses
    ml = masterlens_df()
    ml = ml[['Ra(deg)_ml', 'Dec(deg)_ml']]
    ml.columns = ['ra', 'dec']
    
    ml['idCatalog'] = 2
    
    # concatenating both catalogs to use Nacluster
    final = pd.concat([castles, ml])
    final.reset_index(drop = True, inplace=True)
    final = final.sample(frac=1)  # shuffling dataframe
    
    # saving csv file
    #system_na .to_csv('catalogs.csv', header=False)

    return final

In [45]:
system_na = system_cluster()
system_na

Unnamed: 0,ra,dec,idCatalog
23,123.380417,25.750889,1
87,301.029458,-13.825194,1
131,125.053580,48.793630,2
115,33.546703,-4.084105,2
9,38.137917,-21.290556,1
...,...,...,...
107,12.424747,-27.874020,2
19,115.713333,36.578806,1
76,247.553750,82.499722,1
125,53.159208,-27.948111,2


In [20]:
# After running NACluster (Java)
# system table  ------------> 138 clustres
def NAC_result():
    old_file = open("clusters_catalogs.csv", 'r+')
    new_file = open("clusters.csv", 'a+')

    lastline=''
    cluster = 0
    header='id,ra,dec,cluster\n'
    new_file.write(header)
    for line in old_file:

        if 'cluster' in line:
            cluster+=1

        elif "Centroid" not in line:
            line = line.replace('(', '')
            line = line.replace(')', '')
            
            newline = ','.join(line.split(',')[0:3]) + ',{}'.format(cluster)
            new_file.write(newline)
            new_file.write('\n')
            
               
    old_file.close()
    new_file.close()
    
    return None
        

In [54]:
NAC_result()

In [206]:
def integrate():
    castle = castles_imgs()
    ml = masterlens_df()
    
    nac = pd.read_csv('clusters.csv')
    nac = nac.sort_values(by=['id'])
    nac.reset_index(inplace=True, drop=True)
    
    # first concat nac with castle
    result = pd.concat([nac, castle], join ='outer', sort=True, axis=1)  

    # then concat nac + castle with ml
    integrated = pd.concat([result, ml], join ='outer', sort=True, axis=1)
    
    # combining dup rows into one   
    integrated = integrated.groupby('cluster').first()
    i#ntegrated.columns = ['id','ra_c','dec_c','Lens Name _c','zs','zl','RA (J2000)','Dec (J2000)','E(B-V)','ms (mag)',
      #            'ml (mag)','FGHz (mJy)','Nim',"size ("")",'dt (days)','sigma (km/s)',
       #           'id_c','Name','Discovery','Discovery Date','Lens Kind','Lens Grade','Description',
        #          'N Images','Einstein_R','Einstein_R quality','Stellar_v_disp','Stellar_v_disp_err',
         #         'Ra(deg)_ml','Dec(deg)_ml','Alternate Names','id_ml']

    system = integrated.drop('id', axis=1)
    system['isReal'] = 1  #  column to distinguish between real and simulated lenses (real=1, sim=0)
    system['Target'] = 1  #  column to distinguish between images with ot without lens (lens=1, nolens=0)

    system.to_csv("integrated_system.csv", index=False)
    return system

In [207]:
integrate()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,ra,dec,Lens Name,zs,zl,RA (J2000),Dec (J2000),E(B-V),ms (mag),ml (mag),FGHz (mJy),Nim,"size ("")",dt (days),sigma (km/s),img_path,Name,Discovery,Discovery Date,Lens Kind,Lens Grade,Description,N Images,Einstein_R,Einstein_R quality,Stellar_v_disp,Stellar_v_disp_err,Ra(deg)_ml,Dec(deg)_ml,Alternate Names,isReal,Target
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1,11.012080000000001,1.2201799999999998,B2045+265,1.28,0.87,311.834792,26.733667,0.232,I=22.02/3,I=21.06,F8=19,4,2.74,,,"[B2045H.gif, B2045V.gif, B2045Vcc.gif, B2045Hc...",,,,,,,,,,,,,,,1,1
10,9.271704,9.163992,B1933+503,2.63,0.76,293.628958,50.423222,0.095,,I=20.24,F5=59,10,1.0,,,"[B1933H.gif, B1933.gif, B1933V.gif, B1933I.gif]",,,,,,,,,,,,,,,1,1
100,30.54375,-11.153244,FSC10214+4724,2.29,(0.75),156.144167,47.153056,0.012,I=16.44/4,I=20.40,F5~0.1,2E,1.59,,,"[FSC10214I.gif, FSC10214J.gif, FSC10214Jcc.gif...",,,,,,,,,,,,,,,1,1
101,78.54491666666665,-33.43958333333333,,,,,,,,,,,,,,,SDSS J0216-0813,SLACS,2006-02,GAL-GAL,A,A faint but deﬁnite counterimage is seen to th...,3.0,1.16,SIE model,332.0,24.0,34.21893,-8.22927,"SL2S J02176-0513, UDS-01, SL2S J021737.18-0513...",1,1
102,104.65791999999999,-55.95,Q0047-2808,3.60,0.48,12.424542,-27.873806,0.016,I=16.47/2,I=20.05,F5~1,4ER,2.7,,229±15,"[Q0047I.gif, Q0047Hcc.gif, Q0047V.gif, Q0047Ic...",,,,,,,,,,,,,,,1,1
103,338.21208333333334,-60.54527777777778,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1
104,308.4253333333333,-47.39527777777778,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1
105,306.5434583333333,-45.607527777777776,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1
106,279.61875,-34.461555555555556,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1
107,330.3866666666666,-32.028888888888886,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1


In [118]:
# Comparing merge by name with NAC

clusters = list(nac["cluster"])
dupes = list(set([x for x in clusters if clusters.count(x) > 1])) # cluster duplicates

clusters_dupes = nac.loc[nac['cluster'].isin(dupes)] 
#clusters_dupes

#merge_name = pd.read_csv('concat.csv')
#problem = [31, 141, 37, 149, 27, 135, 12, 125, 10, 122]
#merge_name.loc[merge_name.index.isin(problem)]

# Merge by name is wrong
# Use merge with NAC

## Lens objects

In [None]:
# Scraping Joao
# lens objects  -  227
castles2 = pd.read_csv('CastelLensData.csv')
castles2 = castles2.set_index('lens_names')

HE_c = castles2.loc[ 'HE0435-1223' , : ]
HE_c

In [None]:
# Object table
HE_ml = pd.read_csv('Scraping/results/redshift_28.csv')
HE_ml.drop('Unnamed: 1', axis=1, inplace=True)
HE_ml.rename(columns={"Unnamed: 0": "Description"})

## Main code

In [158]:
# Preparing to run NACluster
system_prep = system_cluster()

# run NACluster (Java)
# Saves results in "clusters_catalogs.csv"

# After running NACluster:
# reads "clusters_catalogs.csv"
# Saves result in "clusters.csv"
NAC_result()

# Final system df after integration
# reads "clusters.csv"
# Saves result to "integrated_system.csv"
system_df = integrate()

