## Goal
- ~Set up pandas dataframe of RV + MJD data to use in PCA~ - done!

## Steps
- ~Sort plates by n visits per target~ - done!
- ~Pick plate with n > 20 visits/target for initial pca~ - max 16

## Questions
- Should I remove targets which don't have the same number of visits? --> Yes
- What is up with the targets that have very large OBSVHELIO values (e.g. 999999)?

In [2]:
import numpy as np
import pandas as pd
import statistics as stat
from astropy.io import fits
from collections import Counter

In [3]:
# Open the allVisit FITS file (read-only); the handle is closed in a later
# cell once the header and data have been pulled out.

visit_hdus = fits.open('allVisit-r12-l33.fits', mode='readonly')

# print(visit_hdus.info())

In [4]:
# Grab the header and the table data from HDU 1, then release the file handle

visit_header, visit_data = visit_hdus[1].header, visit_hdus[1].data

visit_hdus.close()

# print(visit_header)

In [5]:
# Pull the FITS columns of interest out of the visit table as Python lists.
# OBSVHELIO: heliocentric relative RV from 'observed spectrum template matching'

all_targets, all_mjd, all_obsvhelio, all_plates = (
    list(visit_data[column])
    for column in ('TARGET_ID', 'MJD', 'OBSVHELIO', 'PLATE')
)

In [6]:
# Remove leading/trailing whitespace from every Plate ID (list updated in place)

all_plates[:] = [plate_id.strip() for plate_id in all_plates]

# print(all_plates[:7])

In [7]:
# Create pandas dataframe for all data
#
# The frame is built column-by-column from a dict. Building it from a list
# of lists and transposing forces every column to the generic `object`
# dtype; a dict lets pandas infer a dtype for each column separately.

df = pd.DataFrame({
    'Target ID': all_targets,
    'Plate ID': all_plates,
    'MJD': all_mjd,
    'OBSVHELIO (km/s)': all_obsvhelio,
})

In [61]:
# Display the full table ordered by radial velocity, lowest first

df.sort_values(by='OBSVHELIO (km/s)', ascending=True)

Unnamed: 0,Target ID,Plate ID,MJD,OBSVHELIO (km/s)
1106307,apo25m.5118.URMINOR.2M15062489+6715577,8628,58298,-1769
467533,lco25m.5214.CARINA.2M06472679-5057077,10209,58212,-1640.69
786088,apo25m.4503.160+60.2M11002078+4809563,7348,56729,-1628.46
1026410,apo25m.5748.BOOTES1.2M13544089+1520421,10894,58292,-1616.21
1108939,apo25m.5118.URMINOR.2M15075464+6710445,8628,57468,-1580.11
...,...,...,...,...
47659,lco25m.5499.SMC3.2M00375577-7343181,10083,58028,999999
380012,apo25m.4568.191-04.2M05522126+1734585,6762,56565,999999
380013,apo25m.4568.191-04.2M05522126+1734585,6762,56566,999999
1696691,apo25m.5110.M15.2M21293076+1206327,8706,58296,999999


In [9]:
# Get list of unique plates
#
# dict.fromkeys() de-duplicates while keeping first-appearance order, so no
# per-item `not in` scan of the growing result list is needed.

unique_plates = list(dict.fromkeys(df['Plate ID']))

# print("Number of unique plates:", len(unique_plates))

In [10]:
# Get mode number of visits per target on each plate
#
# nvisits_mode[k] is the most common per-target visit count on
# unique_plates[k].

# Split the Target ID column by plate once, instead of re-scanning every row
# of `df` with a boolean mask for each plate.
targets_by_plate = df.groupby('Plate ID', sort=False)['Target ID']

nvisits_mode = []
# filtered_plates = []

for p in unique_plates:
    # Number of rows (visits) recorded for each target on this plate
    count_dict = Counter(targets_by_plate.get_group(p))
    mode_visits = stat.mode(count_dict.values())
    nvisits_mode.append(mode_visits)
#    if mode_visits > 19:
#        filtered_plates.append(p)

In [11]:
# Create pandas df of mode number of visits per target on each plate
#
# Built from a dict of columns so the visit-count column gets its own
# inferred dtype instead of the `object` dtype that transposing a list of
# lists produces.

visitcount_df = pd.DataFrame({
    'Plate ID': unique_plates,
    'Mode Visit Count/Target': nvisits_mode,
})

In [12]:
# Display the plates whose modal visit count per target exceeds 10
# (append ['Mode Visit Count/Target'].sort_values(ascending=False) to rank them)

visitcount_df.loc[visitcount_df['Mode Visit Count/Target'].gt(10)]

Unnamed: 0,Plate ID,Mode Visit Count/Target
70,9518,11
154,9244,11
234,9290,16
634,8907,14
1269,5631,11
1372,8112,12
1981,9860,12


In [13]:
# Keep only the rows belonging to plate 9290, then display them

plate9290_df = df.loc[df['Plate ID'].eq('9290')]
plate9290_df

Unnamed: 0,Target ID,Plate ID,MJD,OBSVHELIO (km/s)
164071,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57706,-86.5657
164072,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57732,-86.3834
164073,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57734,-86.3869
164074,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57735,-86.5328
164075,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57760,-86.3666
...,...,...,...,...
192942,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58068,-26.4901
192943,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58085,-26.5953
192944,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58087,-26.604
192945,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58114,-26.5992


In [18]:
# Tally how many rows (visits) each target has on plate 9290

target_nvisits = Counter()
target_nvisits.update(plate9290_df['Target ID'])

In [33]:
# Identify all targets which have < 16 visits
#
# Each (target, count) pair becomes one row. Building the frame from rows
# with named columns lets pandas infer a dtype for 'Visit Count', rather
# than the `object` dtype that transposing a list of lists produces.

target_nvisits_df = pd.DataFrame(
    list(target_nvisits.items()),
    columns=['Target ID', 'Visit Count'],
)

targets2drop = target_nvisits_df[target_nvisits_df['Visit Count'] < 16]
targets2drop

Unnamed: 0,Target ID,Visit Count
3,apo25m.5226.150-08-RV.2M03261444+4653207,12
12,apo25m.5226.150-08-RV.2M03265000+4642270,15
23,apo25m.5226.150-08-RV.2M03275199+4708407,12
65,apo25m.5226.150-08-RV.2M03301731+4617230,12
78,apo25m.5226.150-08-RV.2M03304866+4620444,14
79,apo25m.5226.150-08-RV.2M03304988+4625062,15
82,apo25m.5226.150-08-RV.2M03310220+4526009,13
89,apo25m.5226.150-08-RV.2M03312204+4557144,15
100,apo25m.5226.150-08-RV.2M03315044+4701564,15
137,apo25m.5226.150-08-RV.2M03334022+4534480,15


In [50]:
# IDs of every target slated for removal
targets2drop_list = [*targets2drop['Target ID']]

# Total number of visit rows belonging to those targets
# len(targets2drop_list)
sum(targets2drop['Visit Count'])

554

In [59]:
# Drop all targets which were visited < 16 times
#
# A single membership test removes every listed target at once. Unlike a
# per-target loop, this still defines plate9290_df_clean when
# targets2drop_list is empty (nothing to drop), and it does not build one
# intermediate copy of the frame per dropped target.

plate9290_df_clean = plate9290_df[~plate9290_df['Target ID'].isin(targets2drop_list)]

# plate9290_df is rebound to the filtered frame as well, matching the state
# this cell has always left behind.
plate9290_df = plate9290_df_clean

plate9290_df_clean

Unnamed: 0,Target ID,Plate ID,MJD,OBSVHELIO (km/s)
164071,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57706,-86.5657
164072,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57732,-86.3834
164073,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57734,-86.3869
164074,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57735,-86.5328
164075,apo25m.5226.150-08-RV.2M03252400+4614203,9290,57760,-86.3666
...,...,...,...,...
192942,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58068,-26.4901
192943,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58085,-26.5953
192944,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58087,-26.604
192945,apo25m.5226.150-08-RV.2M03415658+4626067,9290,58114,-26.5992


In [58]:
# Count the distinct Target IDs (individual stars) left on this plate

len(set(plate9290_df_clean['Target ID']))

227

In [60]:
# Write the cleaned plate 9290 table to disk as CSV.
# The dataframe index is written as the 0th column, so it must be removed
# when the file is imported.

plate9290_df_clean.to_csv(path_or_buf='plate9290_RVs.csv', index=True)

In [98]:
# No longer relevant - setup for pandas df of visit count per target

# target_count = Counter(df['Target ID'])
# target_count = Counter(all_targets)

# filtered_targets = []
# filtered_targets_count = []

# for tc in target_count:
#     if target_count[tc] > 19:
#         filtered_targets.append(tc)
#         filtered_targets_count.append(target_count[tc])
        
# filtered_targets_data = [filtered_targets, filtered_targets_count]
# targets_df = pd.DataFrame(filtered_targets_data).transpose()
# targets_df.columns = ['Target ID', 'Visit Count']

# targets_df

# df[df['Target ID'] == 'apo25m.4230.M15.2M21342357+1215247']