## Integração de dados

#### Masterlens, CASTLES

In [1]:
import pandas as pd
from astropy.coordinates import Angle
import sqlite3
import os

In [2]:
# dec: -27:52:25.7 ---> -27.873806 deg
def sexdeg_to_deg(coord):
    """Convert a sexagesimal degree string to decimal degrees.

    Example: "-27:52:25.7" -> -27.873806.

    Parameters
    ----------
    coord : str
        Angle written as degrees:arcminutes:arcseconds.

    Returns
    -------
    float
        The ``degree`` value of the ``Angle`` parsed from ``coord``.
    """
    # The " degrees" suffix tells Angle which unit the string is expressed in.
    angle = Angle(coord + " degrees")
    return angle.degree


In [3]:
# ra: 00h49m41.89s ---> 12.424542 deg
def hour_to_deg(coord):  # string coord
    """Convert an hour-angle string to decimal degrees.

    Example: "00h49m41.89s" -> 12.424542.

    Parameters
    ----------
    coord : str
        Angle carrying its own unit markers (e.g. the output of ``hour``).

    Returns
    -------
    float
        The ``degree`` value of the ``Angle`` parsed from ``coord``.
    """
    angle = Angle(coord)
    return angle.degree

In [4]:
# ra: 00:49:41.89 ---> 00h49m41.89s
def hour(coord):
    """Rewrite a colon-separated time string with h/m/s unit markers.

    Example: "00:49:41.89" -> "00h49m41.89s".

    Only the first colon becomes 'h' and the next one 'm'; any further colons
    are left untouched. An 's' is always appended.
    """
    with_hours = coord.replace(":", 'h', 1)
    return with_hours.replace(":", 'm', 1) + 's'

In [5]:
def name_clean(element):
    """Reduce a lens name to its short prefix.

    '+' and '.' are treated like '-', and everything from the first separator
    onwards is dropped. Three special cases are then renamed:

    * names containing 'PMNJ' have it replaced by 'PMN';
    * 'QJ0158' becomes 'CTQ414';
    * 'LBQS1009' has 'LBQS' replaced by 'Q'.
    """
    base = element
    for separator in ('+', '.'):
        base = base.replace(separator, '-')
    base = base.partition('-')[0]

    if 'PMNJ' in base:
        return base.replace('PMNJ', 'PMN')
    if base == 'QJ0158':
        return 'CTQ414'
    if base == 'LBQS1009':
        return base.replace('LBQS', 'Q')
    return base

In [6]:
# returns df with castles data (with coordinates converted to degrees)
def castles_df():
    """Load the CASTLES table with coordinates converted to decimal degrees.

    Reads 'Scraping/castles.csv', converts the 'RA (J2000)' column from
    colon-separated hours to degrees (via ``hour`` then ``hour_to_deg``) and
    the 'Dec (J2000)' column via ``sexdeg_to_deg``, and adds an 'id' column
    equal to the row index plus one.

    Returns
    -------
    pandas.DataFrame
    """
    table = pd.read_csv('Scraping/castles.csv')

    ra_hms = table['RA (J2000)'].apply(hour)
    table['RA (J2000)'] = ra_hms.apply(hour_to_deg)
    table['Dec (J2000)'] = table['Dec (J2000)'].apply(sexdeg_to_deg)

    # ids start at 1 rather than at the 0-based row index
    table['id'] = table.index + 1

    return table

In [7]:
# returns df with castles data + imgs paths
def _strip_band_suffix(filename):
    """Return the lens name encoded in a CASTLES image file name.

    Removes a trailing '.gif' and then at most one band marker: one of the
    letters H, I, V, K, J, R, optionally followed by 'cc'.

    ``str.rstrip`` is deliberately not used here: it strips a *set of
    characters*, not a suffix, so chained ``rstrip`` calls can remove
    characters that belong to the lens name itself.
    """
    stem = filename[:-len('.gif')] if filename.endswith('.gif') else filename
    for band in 'HIVKJR':
        # Try the longer "<band>cc" marker before the bare band letter.
        for suffix in (band + 'cc', band):
            if stem.endswith(suffix) and len(stem) > len(suffix):
                return stem[:-len(suffix)]
    return stem


def castles_imgs(imgs_dir='/home/viviane/GravLens/Database/Scraping/castles_imgs'):
    """Return the CASTLES table joined with the image files of each lens.

    Parameters
    ----------
    imgs_dir : str, optional
        Directory holding the CASTLES image files. Defaults to the location
        that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The frame from ``castles_df()`` plus an 'img_path' column holding,
        per lens, the list of matching image file names (file names only,
        not joined with ``imgs_dir``). Rows are matched on the cleaned lens
        name with an outer join, so lenses without images and images without
        a catalogue entry are both kept; the index is reset afterwards.
    """
    # Sorted so that the per-lens image lists have a reproducible order.
    filenames = sorted(os.listdir(imgs_dir))

    imgs_df = pd.DataFrame({
        'name': [_strip_band_suffix(filename) for filename in filenames],
        'img_path': filenames,
    })

    # name -> list of image files
    imgs_by_lens = imgs_df.groupby('name')['img_path'].apply(list)

    # Index the catalogue by the cleaned lens name so it aligns with the images.
    castles = castles_df()
    clean_names = castles['Lens Name'].apply(name_clean).rename('name')
    castles = castles.set_index(clean_names)

    # joining imgs df with system df
    result = pd.concat([castles, imgs_by_lens], axis=1, join='outer')
    result = result.reset_index(drop=True)

    return result

In [74]:
# importing masterlens data  -  n(50) lenses
# returns df with masterlens data from tables: system, coordinates(deg) and image paths

def masterlens_df(n=50,
                  ml_files_path='/home/viviane/GravLens/Database/Scraping/masterlens_results',
                  ml_imgfolders_path='/home/viviane/GravLens/Database/Scraping/masterlens_imgs'):
    """Load the first ``n`` Masterlens systems into a single DataFrame.

    For each system ``i`` in ``1..n`` this reads ``system_{i}.csv`` and
    ``coordinates_{i}.csv`` from ``ml_files_path`` and lists the image files
    in ``masterlens_{i}_arquivos`` under ``ml_imgfolders_path``.

    Parameters
    ----------
    n : int, optional
        Number of systems to load (default 50).
    ml_files_path : str, optional
        Directory with the per-system CSV files.
    ml_imgfolders_path : str, optional
        Directory with the per-system image folders.

    Returns
    -------
    pandas.DataFrame
        The selected system columns that exist in each file, plus
        'Ra(deg)_ml', 'Dec(deg)_ml', 'img_paths' (the ``str()`` of the list
        of image paths) and 'id' (the system number). The per-system indexes
        are kept as read, so index labels repeat across systems. An empty
        frame is returned when ``n`` is less than 1.
    """
    wanted = ['Name', 'Alternate Names', 'Discovery', 'Discovery Date', 'Lens Kind', 'Lens Grade',
              'Description', 'N Images', 'Einstein_R', 'Einstein_R quality',
              'Stellar_v_disp', 'Stellar_v_disp_err']

    dfs = []
    for i in range(1, n + 1):

        # getting data from system file
        ml = pd.read_csv(os.path.join(ml_files_path, 'system_{}.csv'.format(i)))

        # Keep only the important columns that this file actually has.
        # .copy() so the column assignments below do not write into a slice.
        cols = [column for column in wanted if column in ml.columns]
        ml = ml[cols].copy()

        # getting coords from coords file
        coords = pd.read_csv(os.path.join(ml_files_path, 'coordinates_{}.csv'.format(i)))
        ml['Ra(deg)_ml'] = coords['RA [°]']
        ml['Dec(deg)_ml'] = coords['Dec [°]']

        # getting image paths; os.path.join supplies the separator between
        # the folder and the file name, and sorting makes the order stable.
        system_folder = os.path.join(ml_imgfolders_path, 'masterlens_{}_arquivos'.format(i))
        imgs = [os.path.join(system_folder, img) for img in sorted(os.listdir(system_folder))]
        ml['img_paths'] = str(imgs)

        ml['id'] = i
        dfs.append(ml)

    if not dfs:
        # pd.concat raises on an empty list; nothing requested -> empty frame.
        return pd.DataFrame()

    all_ml = pd.concat(dfs, sort=False)

    return all_ml

## Lens as a whole
### Merge  - Name

In [11]:
# system table ---------->  143 lenses
def system():  # lens as a whole
    """Merge the CASTLES and Masterlens system tables on the lens name.

    Reads 'Scraping/castles.csv' and the fifty
    'Scraping/masterlens_results/system_{i}.csv' / 'coordinates_{i}.csv'
    pairs. As a side effect the plain row-wise concatenation of both
    catalogues is written to 'concat.csv'.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the CASTLES table and the Masterlens table on 'Name'.
    """
    # --- CASTLES catalogue ---
    castles = pd.read_csv('Scraping/castles.csv')
    castles = castles.drop(['zs', 'zl', 'ms (mag)', 'ml (mag)', 'dt (days)'], axis=1)
    ra_hms = castles['RA (J2000)'].apply(hour)
    castles['RA (J2000)'] = ra_hms.apply(hour_to_deg)
    castles['Dec (J2000)'] = castles['Dec (J2000)'].apply(sexdeg_to_deg)
    # Positional rename: relies on exactly eight columns remaining, in this order.
    castles.columns = ['Name', 'RA(deg)_c', 'Dec(deg)_c', 'E(B-V)_c', 'FGHz (mJy)_c', 'Nimages_c',
                       'size(arccsec)_c', 'sigma (km/s)_c']
    castles['id_c'] = castles.index

    # --- Masterlens catalogue: one system file + one coordinates file each ---
    wanted = ['Name', 'Alternate Names', 'Discovery', 'Discovery Date', 'Lens Kind', 'Lens Grade',
              'Description', 'N Images', 'Einstein_R', 'Einstein_R quality',
              'Stellar_v_disp', 'Stellar_v_disp_err']
    dfs = []
    for i in range(1, 51):
        ml = pd.read_csv('Scraping/masterlens_results/system_{}.csv'.format(i))

        # keep only the wanted columns that this file provides
        present = [column for column in wanted if column in list(ml.columns)]
        ml = ml[present]

        coords = pd.read_csv('Scraping/masterlens_results/coordinates_{}.csv'.format(i))
        ml['Ra(deg)_ml'] = coords['RA [°]']
        ml['Dec(deg)_ml'] = coords['Dec [°]']

        dfs.append(ml)

    all_ml = pd.concat(dfs, sort=False)

    # Row-wise stack of both catalogues, saved for inspection.
    stacked = pd.concat([castles, all_ml], sort=False)
    stacked = stacked.reset_index(drop=True)
    stacked.to_csv('concat.csv')

    # Name-based join of the two catalogues.
    return pd.merge(castles, all_ml, how='outer', on='Name')
        


### Merge - NACluster

In [12]:
# Preparing to run NACluster
def system_cluster(random_state=None):  # lens as a whole
    """Build the shuffled coordinate table that is fed to NACluster.

    Takes the (ra, dec) columns in degrees from ``castles_df()`` and from
    ``masterlens_df(50)``, tags every row with the catalogue it came from
    and shuffles the combined rows.

    Parameters
    ----------
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to get a
        reproducible shuffle. The default ``None`` keeps the shuffle random.

    Returns
    -------
    pandas.DataFrame
        Columns 'ra', 'dec' and 'idCatalog' (1 = CASTLES, 2 = Masterlens).
        The index holds each row's position in the unshuffled concatenation.
    """
    # CASTLES coordinates; .copy() so the edits below do not write into a slice.
    castles = castles_df()[['RA (J2000)', 'Dec (J2000)']].copy()
    castles.columns = ['ra', 'dec']
    castles['idCatalog'] = 1

    # Masterlens coordinates; masterlens_df needs the number of systems to load.
    ml = masterlens_df(50)[['Ra(deg)_ml', 'Dec(deg)_ml']].copy()
    ml.columns = ['ra', 'dec']
    ml['idCatalog'] = 2

    # concatenating both catalogs to use Nacluster
    final = pd.concat([castles, ml]).reset_index(drop=True)
    final = final.sample(frac=1, random_state=random_state)  # shuffling dataframe

    # Writing the table to disk for NACluster is currently disabled here:
    #system_na .to_csv('catalogs.csv', header=False)

    return final

In [13]:
# Build the shuffled CASTLES + Masterlens coordinate table (input for NACluster)
# and display it.
system_na = system_cluster()
system_na

Unnamed: 0,ra,dec,idCatalog
112,32.372207,-6.719794,2
86,294.604958,66.814500,1
59,203.894958,1.301528,1
34,150.336583,55.897056,1
122,41.642076,-8.426707,2
...,...,...,...
43,157.305625,26.392167,1
105,9.271704,9.163992,2
44,158.392000,7.190417,1
69,216.158708,22.933500,1


In [14]:
# After running NACluster (Java)
# system table  ------------> 138 clusters
def NAC_result(old_path="clusters_catalogs.csv", new_path="clusters.csv"):
    """Flatten the raw NACluster output into a plain CSV file.

    Every input line containing 'cluster' starts a new cluster (the running
    cluster number is incremented). Lines containing 'Centroid' and blank
    lines are skipped. For every other line the parentheses are removed and
    the first three comma-separated fields are written out followed by the
    current cluster number.

    The output file is overwritten on each call, so running this more than
    once does not pile up duplicate headers and rows.

    Parameters
    ----------
    old_path : str, optional
        Raw NACluster output to read.
    new_path : str, optional
        CSV file to write, with header 'id,ra,dec,cluster'.

    Returns
    -------
    None
    """
    header = 'id,ra,dec,cluster\n'
    cluster = 0

    # 'w' (not append) makes re-runs idempotent; `with` closes both files
    # even if an error occurs while processing.
    with open(old_path, 'r') as old_file, open(new_path, 'w') as new_file:
        new_file.write(header)

        for line in old_file:
            if 'cluster' in line:
                cluster += 1
                continue
            if "Centroid" in line or not line.strip():
                continue

            # Drop the line ending first so it can never end up inside a field.
            cleaned = line.rstrip('\r\n').replace('(', '').replace(')', '')
            fields = cleaned.split(',')[0:3]
            new_file.write(','.join(fields) + ',{}\n'.format(cluster))

    return None
        

In [84]:
# Convert the raw NACluster output ("clusters_catalogs.csv") into "clusters.csv".
NAC_result()

In [101]:
def integrate():
    """Work-in-progress integration of CASTLES, Masterlens and NACluster data.

    Loads the CASTLES table with image lists (``castles_imgs()``), the first
    50 Masterlens systems (``masterlens_df(50)``) and 'clusters.csv', sorts
    the cluster table by 'id', and concatenates it column-wise with the
    CASTLES table.

    NOTE(review): the steps that join the Masterlens data, collapse rows per
    cluster and write 'integrated_system.csv' are commented out below, and
    the column-wise concatenation result is not used. The function currently
    returns only the shape of the Masterlens frame, not an integrated table.

    Returns
    -------
    tuple
        ``ml.shape`` of the Masterlens DataFrame.
    """
    castle = castles_imgs()
    ml = masterlens_df(50)
    
    nac = pd.read_csv('clusters.csv')
    # Sort by id and renumber the index from 0 before the column-wise concat.
    nac = nac.sort_values(by=['id'])
    nac.reset_index(inplace=True, drop=True)
    
    # first concat nac with castle
    # (axis=1 aligns rows on the index, not on the 'id' values)
    result = pd.concat([nac, castle], join ='outer', sort=True, axis=1)  

    # --- everything below is disabled work in progress ---
    # then concat nac + castle with ml
#    integrated = pd.concat([result, ml], join ='outer', sort=True, axis=1)
    
    # combining dup rows into one   
#    integrated = integrated.groupby('cluster').first()
    #ntegrated.columns = ['id','ra_c','dec_c','Lens Name _c','zs','zl','RA (J2000)','Dec (J2000)','E(B-V)','ms (mag)',
      #            'ml (mag)','FGHz (mJy)','Nim',"size ("")",'dt (days)','sigma (km/s)',
       #           'id_c','Name','Discovery','Discovery Date','Lens Kind','Lens Grade','Description',
        #          'N Images','Einstein_R','Einstein_R quality','Stellar_v_disp','Stellar_v_disp_err',
         #         'Ra(deg)_ml','Dec(deg)_ml','Alternate Names','id_ml']

#    system = integrated.drop('id', axis=1)
#    system['isReal'] = 1  #  column to distinguish between real and simulated lenses (real=1, sim=0)
#    system['Target'] = 1  #  column to distinguish between images with ot without lens (lens=1, nolens=0)

#    system.to_csv("integrated_system.csv", index=False)
    # Debug return: only the Masterlens frame's (rows, columns).
    return ml.shape

In [102]:
# Run the integration step; it currently returns the Masterlens frame's shape.
integrate()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(50, 16)

In [18]:
# Comparing merge by name with NAC

clusters = list(nac["cluster"])
dupes = list(set([x for x in clusters if clusters.count(x) > 1])) # cluster duplicates

clusters_dupes = nac.loc[nac['cluster'].isin(dupes)] 
#clusters_dupes

#merge_name = pd.read_csv('concat.csv')
#problem = [31, 141, 37, 149, 27, 135, 12, 125, 10, 122]
#merge_name.loc[merge_name.index.isin(problem)]

# Merge by name is wrong
# Use merge with NAC

NameError: name 'nac' is not defined

## Lens objects

In [19]:
# Scraping Joao
# lens objects  -  227
# Per-object table read from a separate scrape ('CastelLensData.csv').
castles2 = pd.read_csv('CastelLensData.csv')
# Index by lens name so that all rows belonging to one lens can be selected together.
castles2 = castles2.set_index('lens_names')

# Example: every row recorded for the lens HE0435-1223.
HE_c = castles2.loc[ 'HE0435-1223' , : ]
HE_c

Unnamed: 0_level_0,ra,ra_err,dec,dec_err,description
lens_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HE0435-1223,4.63747,0.0,-12.2873,0.0,deflected
HE0435-1223,3.16147,0.003,-11.7343,0.003,deflected
HE0435-1223,2.17047,0.003,-12.8903,0.005,deflected
HE0435-1223,3.69847,0.003,-13.9013,0.003,deflected
HE0435-1223,3.47247,0.003,-12.8603,0.003,galaxy


In [21]:
# Object table
#HE_ml = pd.read_csv('Scraping/results/redshift_28.csv')
#HE_ml.drop('Unnamed: 1', axis=1, inplace=True)
#HE_ml.rename(columns={"Unnamed: 0": "Description"})

## Main code

In [158]:
# Preparing to run NACluster
# (builds the shuffled ra/dec/idCatalog table from both catalogues)
system_prep = system_cluster()

# run NACluster (Java)
# Saves results in "clusters_catalogs.csv"
# NOTE(review): this is a manual step outside the notebook; system_cluster()
# does not write its table to disk (the to_csv call there is commented out).

# After running NACluster:
# reads "clusters_catalogs.csv"
# Saves result in "clusters.csv"
NAC_result()

# Final system df after integration
# reads "clusters.csv"
# NOTE(review): integrate() currently returns ml.shape (a tuple), and its
# write to "integrated_system.csv" is commented out, so system_df is not a
# DataFrame and no file is saved at this point.
system_df = integrate()

