## Integração de dados

#### Masterlens, CASTLES

In [1]:
import pandas as pd
from astropy.coordinates import Angle

In [2]:
# dec: -27:52:25.7 ---> -27.873806 deg
def sexdeg_to_deg(coord):
    result = Angle(coord + " degrees").degree
    return result #*3600


In [3]:
# ra: 00h49m41.89s ---> 12.424542 deg
def hour_to_deg(coord):  # string coord
    result = Angle(coord).degree
    return result #*3600

In [4]:
# ra: 00:49:41.89 ---> 00h49m41.89s
def hour(coord):
    result = coord.replace(":", 'h', 1)
    result = result.replace(":", 'm', 1)
    result += 's'
    return result

## Lens as a whole
### Merge  - Name

In [5]:
# system table ---------->  143 lenses
def system():  # lens as a whole
    
    # importing castles data  -  100 lenses
    castles = pd.read_csv('Scraping/castles.csv')
    castles.drop(['zs', 'zl', 'ms (mag)', 'ml (mag)', 'dt (days)'], axis=1, inplace=True)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour_to_deg)
    castles['Dec (J2000)'] = castles['Dec (J2000)'].apply(sexdeg_to_deg)
    castles.columns=['Name', 'RA(deg)_c', 'Dec(deg)_c', 'E(B-V)_c', 'FGHz (mJy)_c', 'Nimages_c',
                     'size(arccsec)_c', 'sigma (km/s)_c']
    castles = castles.set_index('Name')
    
    
    # importing masterlens data  -  50 lenses
    dfs = []
    for i in range(1,51):
        
        # getting data from system file
        ml = pd.read_csv('Scraping/results/system_{}.csv'.format(i))

        # selecting important columns
        cols=[]
        for column in ['Name', 'Alternate Names', 'Discovery', 'Discovery Date', 'Lens Kind', 'Lens Grade',
                       'Description', 'N Images', 'Einstein_R', 'Einstein_R quality',
                       'Stellar_v_disp', 'Stellar_v_disp_err']:
            if column in list(ml.columns):
                cols.append(column)
        ml = ml[cols]

        # getting coords from coords file
        coords = pd.read_csv('Scraping/results/coordinates_{}.csv'.format(i))
        ml['Ra(deg)_ml'] = coords['RA [°]']
        ml['Dec(deg)_ml'] = coords['Dec [°]']

        dfs.append(ml)
     
    # joining all masterlens system + coords dfs
    all_ml = pd.concat(dfs, sort=False)
    all_ml = all_ml.set_index('Name')

    # joining castles + ml data
    result = pd.merge(castles, all_ml, how ='outer', on='Name')  


    return result
        


In [7]:
# comparing 'alternate names' with 'name'      partial merge?

for index, row in cats.iterrows():
    if type(row['Alternate Names']) == str:
        row['Alternate Names'] = row['Alternate Names'].split(', ')  # list with alternate names
        for i in row['Alternate Names']:
            if index == i:
                print('problem')


### Merge - NACluster

In [105]:
# Preparing to run NACluster
def system_cluster():  # lens as a whole
    
    # importing castles data  -  100 lenses
    castles = pd.read_csv('Scraping/castles.csv')
    castles = castles[['RA (J2000)', 'Dec (J2000)' ]]
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour)
    castles['RA (J2000)'] = castles['RA (J2000)'].apply(hour_to_deg)
    castles['Dec (J2000)'] = castles['Dec (J2000)'].apply(sexdeg_to_deg)
    castles.columns=['ra', 'dec']  # degrees
    
    castles['idCatalog'] = 1
    #castles = castles.set_index('Name')
    
    
    # importing masterlens data  -  50 lenses
    dfs = []
    for i in range(1,51):

        # getting coords from coords file
        ml = pd.read_csv('Scraping/results/coordinates_{}.csv'.format(i))
        ml = ml[['RA [°]', 'Dec [°]']]
        ml.columns=['ra', 'dec']  # degrees
        ml['idCatalog'] = 2

        dfs.append(ml)
     
    # joining all masterlens system + coords dfs
    all_ml = pd.concat(dfs, sort=False)
    
    # concatenating both catalogs to use Nacluster
    final = pd.concat([castles,all_ml])
    final.reset_index(drop = True, inplace=True)
    final = final.sample(frac=1)  # shuffling dataframe
    
    # saving csv file
    system_na .to_csv('catalogs.csv', header=False)

    return final
        


In [106]:
system_na = system_cluster()

In [107]:
system_na #.loc[[37, 149], : ]

Unnamed: 0,ra,dec,idCatalog
55,186.533750,-0.100556,1
148,150.337000,55.897060,2
108,12.615500,-17.668890,2
35,150.369208,50.465806,1
80,252.681000,42.862500,1
106,11.012080,1.220180,2
68,214.402125,52.444444,1
27,135.895500,50.472000,1
149,151.143143,41.212109,2
23,123.380417,25.750889,1


In [81]:
# After running NACluster (Java)
# system table  ------------> 138 clustres
def NAC_result():
    old_file = open("clusters_catalogs.csv", 'r+')
    new_file = open("clusters.csv", 'a+')

    lastline=''
    cluster = 0
    header='id,ra,dec,cluster\n'
    new_file.write(header)
    for line in old_file:

        if 'cluster' in line:
            cluster+=1

        elif "Centroid" not in line:
            line = line.replace('(', '')
            line = line.replace(')', '')
            
            newline = ','.join(line.split(',')[0:3]) + ',{}'.format(cluster)
            new_file.write(newline)
            new_file.write('\n')
            
 #           if 'Centroid' not in lastline and 'cluster' not in lastline:  # Same cluster
  #              newline = ',' + ','.join(line.split(',')[0:3])
   #             new_file.write(newline)
    #            new_file.write('\n')
     #           print(newline)

            
      #      else:  # Different cluster
       #         newline = ','.join(line.split(',')[0:3]) + ',{}'.format(cluster)
        #        new_file.write(newline)
         #       new_file.write(',,')
                
        #lastline = line
                
    old_file.close()
    new_file.close()
    #df = pd.read_csv('clusters.csv')
    return None #df
        

In [97]:
NAC_result()


In [85]:
nac

Unnamed: 0,id,ra,dec,cluster
0,106,11.012080,1.220180,1
1,11,45.628750,0.100583,2
2,123,43.188380,0.666210,3
3,139,138.022120,0.483660,4
4,38,152.872875,1.723139,5
5,59,203.894958,1.301528,6
6,103,5.670492,14.519565,7
7,3,21.190000,3.851667,8
8,2,21.185000,3.866667,9
9,105,9.271704,9.163992,10


In [109]:
nac = pd.read_csv("clusters.csv")

clusters = list(nac["cluster"])
dupes = list(set([x for x in clusters if clusters.count(x) > 1]))

for i in dupes:
    a=nac.loc[nac['cluster'] == i]
    print(a)

      id                  ra                  dec cluster
144  143              148.75             -1.50139     134
145   33  148.75004166666665  -1.5013888888888889     134
295  143              148.75             -1.50139     134
296   33  148.75004166666665  -1.5013888888888889     134
      id         ra                 dec cluster
44   140  139.85927  27.347509999999996      42
195  140  139.85927  27.347509999999996      42
     id                  ra                 dec cluster
74   35  150.36920833333332  50.465805555555555      70
225  35  150.36920833333332  50.465805555555555      70
      id                 ra                  dec cluster
122   14  69.56208333333332  -12.287333333333333     113
123  127            69.5615            -12.28722     113
273   14  69.56208333333332  -12.287333333333333     113
274  127            69.5615            -12.28722     113
     id                  ra                 dec cluster
40   62  210.39812499999996  15.223777777777778      38
1

      id         ra                 dec cluster
21   145  149.84068  2.1106700000000003      20
172  145  149.84068  2.1106700000000003      20
    id         ra                 dec cluster
104  5  23.648625  -9.517472222222223      97
255  5  23.648625  -9.517472222222223      97
     id                  ra                dec cluster
58   19  115.71333333333332  36.57880555555556      56
209  19  115.71333333333332  36.57880555555556      56
     id                  ra                dec cluster
70   53  181.62354166666665  43.53822222222222      67
221  53  181.62354166666665  43.53822222222222      67
     id                  ra                 dec cluster
83   80  252.68099999999993  42.862500000000004      78
234  80  252.68099999999993  42.862500000000004      78
     id                  ra                dec cluster
36   54  182.73816666666664  9.907111111111112      34
187  54  182.73816666666664  9.907111111111112      34
     id                  ra                 dec cluster

In [112]:
nac.loc[nac['cluster'] == 64]

Unnamed: 0,id,ra,dec,cluster


## Lens objects

In [14]:
# Scraping Joao
# lens objects  -  227
castles2 = pd.read_csv('CastelLensData.csv')
castles2 = castles2.set_index('lens_names')

HE_c = castles2.loc[ 'HE0435-1223' , : ]
HE_c

Unnamed: 0_level_0,ra,ra_err,dec,dec_err,description
lens_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HE0435-1223,4.63747,0.0,-12.2873,0.0,deflected
HE0435-1223,3.16147,0.003,-11.7343,0.003,deflected
HE0435-1223,2.17047,0.003,-12.8903,0.005,deflected
HE0435-1223,3.69847,0.003,-13.9013,0.003,deflected
HE0435-1223,3.47247,0.003,-12.8603,0.003,galaxy


In [15]:
# Object table
HE_ml = pd.read_csv('Scraping/results/redshift_28.csv')
HE_ml.drop('Unnamed: 1', axis=1, inplace=True)
HE_ml.rename(columns={"Unnamed: 0": "Description"})

Unnamed: 0,Description,Mag,No.,z
0,Lens,,1,0.46
1,Source,18.3,1,1.689
2,Source,18.2,2,1.689
3,Source,18.2,3,1.689
4,Source,18.8,4,1.689
