### **IMPORTS**

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import plotly.graph_objects as go

# Extinction coefficients, used later in this notebook as A_J = R_J * E(B-V)
# and E(J-K) = (R_J - R_K) * E(B-V).
# NOTE(review): R_K has no source noted here — presumably the same reference as R_J; confirm.
R_J = 0.91  # From Yuan et al. 2013
R_K = 0.39

### **LOADING IN THE DATA**

In [8]:
# Convert the whitespace-delimited catalogue text file into a comma-separated
# file with a fixed header, so it can be loaded with pandas.
#
# Per input line:
#   1. split on whitespace and drop a leading 'ASASSN' token,
#   2. drop every token that contains '=',
#   3. merge the three tokens at positions 5-7 into a single Gaia_name field,
#   4. merge everything from position 37 onwards into a single VSX_name field,
#   5. skip the row if the field at position 35 (VSX_RA in `header`) contains
#      any ASCII letter.
header = [
    'ASASSN_id', 'distance', 'distance_low', 'distance_high', 'g_mag', 'Gaia_name', 'G_mag', 'G_mag_err',
    'abs_mag_G', 'abs_mag_G_low', 'abs_mag_G_high', 'ext_correct_G', 'ext_correct_G_low', 'ext_correct_G_high',
    'BP_RP', 'BP_RP_err', 'BP_RP_0', 'BP_RP_0_err', 'J_mag', 'J_mag_err', 'K_mag', 'K_mag_err',
    'abs_mag_J', 'abs_mag_J_low', 'abs_mag_J_high', 'ext_correct_J', 'ext_correct_J_low', 'ext_correct_J_high',
    'ext_correct_K', 'ext_correct_K_low', 'ext_correct_K_high', 'FinalType', 'ML_classifier_type',
    'visual_classification_type', 'VSX_type', 'VSX_RA', 'VSX_Dec', 'VSX_name'
]

# Loop-invariant values, defined once instead of on every iteration.
gaia_start = 5        # index where Gaia_name starts after removing 'ASASSN'
gaia_end = 8          # index one past the last Gaia_name token
vsx_name_start = 37   # index where VSX_name starts (after the Gaia merge)
all_letters = set(string.ascii_letters)

# `with` guarantees both files are closed even if a malformed row raises.
with open('distance_color_type_for_all_stars.txt', 'r') as file, \
        open('distance_color_type_for_all_stars_csv.csv', 'w') as file_csv:
    file_csv.write(','.join(header) + '\n')

    lines = file.readlines()
    for line in lines:
        line_split = line.split()

        # A blank line (e.g. a trailing newline at end of file) has no tokens;
        # skip it rather than failing on line_split[0].
        if not line_split:
            continue

        # Remove the first 'ASASSN' entry
        if line_split[0] == 'ASASSN':
            line_split = line_split[1:]

        # Skip items with '='
        cleaned_data = [item for item in line_split if '=' not in item]

        # Combine Gaia_name into a single string
        gaia_name = ' '.join(cleaned_data[gaia_start:gaia_end])
        cleaned_data = cleaned_data[:gaia_start] + [gaia_name] + cleaned_data[gaia_end:]

        # Combine VSX_name into a single string
        vsx_name = ' '.join(cleaned_data[vsx_name_start:])
        cleaned_data = cleaned_data[:vsx_name_start] + [vsx_name]

        # Drop rows whose field 35 contains letters.
        # NOTE(review): presumably this catches rows whose tokens are shifted
        # so that text lands in the numeric VSX_RA column — confirm.
        if any(letter in all_letters for letter in cleaned_data[35]):
            continue

        file_csv.write(','.join(cleaned_data) + '\n')

In [9]:
# Load the CSV produced by the conversion cell above into a DataFrame;
# the first row of that file is the column header.
color_csv = pd.read_csv('distance_color_type_for_all_stars_csv.csv')

### **FILTERING & ERROR CALCULATIONS**

In [10]:
# Drop rows whose distance or K magnitude did not have real values in the
# original data: keep distance < 99998 and 1 < K_mag < 99.998 (both bounds exclusive).
# NOTE(review): only K_mag is screened this way; J_mag is not — confirm that
# J_mag cannot carry the same placeholder values.
has_real_distance = color_csv["distance"].lt(99998)
has_real_k_mag = color_csv["K_mag"].gt(1) & color_csv["K_mag"].lt(99.998)
clean_final = color_csv.loc[has_real_distance & has_real_k_mag].copy()

# Add the J-K colour as a new column.
clean_final["J-K"] = clean_final["J_mag"].sub(clean_final["K_mag"])

# Keep only rows with J-K above -1.
clean_final = clean_final.loc[clean_final["J-K"].gt(-1)]

In [11]:
# Keep only stars whose distance is well constrained: the full width of the
# distance interval (distance_high - distance_low), divided by the distance,
# must be below 0.3 (i.e. 30%).
relative_distance_spread = (
    clean_final["distance_high"]
    .sub(clean_final["distance_low"])
    .div(clean_final["distance"])
)
errors = clean_final.loc[relative_distance_spread < 0.3].copy()

In [12]:
# COLOR ERROR PROPAGATION
# J-K is a difference of two measurements, so the variances add:
#   sigma_(J-K) = sqrt(sigma_J^2 + sigma_K^2)
jk_variance = errors['J_mag_err'].pow(2) + errors['K_mag_err'].pow(2)
errors['J-K_err'] = np.sqrt(jk_variance)


# DISTANCE UNCERTAINTY
# sigma_d is taken as half the width of the [distance_low, distance_high] interval.
errors['sigma_distance'] = errors['distance_high'].sub(errors['distance_low']).div(2)

# Uncertainty in the distance modulus mu = 5*log10(d) - 5 caused by sigma_d:
#   sigma_mu = |d mu / d d| * sigma_d = (5 / ln(10)) * (sigma_d / d)
mag_per_fractional_distance = 5 / np.log(10)
errors['sigma_dist_term'] = mag_per_fractional_distance * errors['sigma_distance'].div(errors['distance'])


# CALCULATE E(B-V) VALUE FROM EXTINCTION
# The J-band extinction is known and the underlying reddening is wanted
# (useful for dereddening and comparing extinction-corrected magnitudes):
#   A_J = R_J * E(B-V)  ->  E(B-V) = A_J / R_J
reddening = errors['ext_correct_J'].div(R_J)
errors['E(B-V)'] = reddening


# CALCULATE A_J AND E(J-K) FROM E(B-V)
#   A_J    = R_J * E(B-V)
#   E(J-K) = A_J - A_K = (R_J - R_K) * E(B-V)
errors['A_J'] = reddening.mul(R_J)
errors['E(J-K)'] = reddening.mul(R_J - R_K)


# ABSOLUTE MAGNITUDE VALUES CORRECTED FOR EXTINCTION
#   M_J = m_J - 5*log10(d) + 5 - A_J
# This overwrites the 'abs_mag_J' column that was loaded from the file.
errors['abs_mag_J'] = (
    errors['J_mag']
    .sub(5 * np.log10(errors['distance']))
    .add(5)
    .sub(errors['A_J'])
)

#   sigma_MJ^2 = sigma_mJ^2 + sigma_mu^2 + sigma_AJ^2
# sigma_AJ ASSUMES A 10% ERROR ON THE EXTINCTION ---- CHECK!!! COULD BE INACCURATE.
# If real uncertainties on the extinction or E(B-V) are available, use them
# in place of the 0.1 * E(B-V) term.
sigma_a_j = (R_J * 0.1) * errors['E(B-V)']
abs_mag_j_variance = (
    errors['J_mag_err'].pow(2)
    + errors['sigma_dist_term'].pow(2)
    + sigma_a_j.pow(2)
)
errors['abs_mag_J_err'] = np.sqrt(abs_mag_j_variance)


# QUALITY CUTS
# Keep only "high-quality" stars, i.e. rows that satisfy all of:
#   - low colour error            (J-K_err       < MAX_JK_ERR)
#   - low absolute-magnitude error (abs_mag_J_err < MAX_ABS_MAG_J_ERR)
#   - low reddening               (E(J-K)        < MAX_E_JK)
#   - low extinction              (A_J           < MAX_A_J)
# These thresholds CAN BE ADJUSTED and SHOULD BE TINKERED AROUND WITH TO FIND THE BEST VALUES.
MAX_JK_ERR = 0.05
MAX_ABS_MAG_J_ERR = 0.15
MAX_E_JK = 0.1
MAX_A_J = 1.5

passes_quality_cuts = (
    (errors['J-K_err'] < MAX_JK_ERR)
    & (errors['abs_mag_J_err'] < MAX_ABS_MAG_J_ERR)
    & (errors['E(J-K)'] < MAX_E_JK)
    & (errors['A_J'] < MAX_A_J)
)

# .copy() matches the earlier filtering cells and makes `filtered` an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
filtered = errors[passes_quality_cuts].copy()

In [15]:
# Show the distinct FinalType labels that remain after the quality cuts;
# the exclusion list in the next cell is drawn from these labels.
filtered["FinalType"].unique()

array(['RRAB', 'RRC', 'EW', 'EB', 'BY', 'EA', 'SRA', 'CONSTANT', 'RR(B)',
       'RS', 'L', 'SXPHE', 'SR', 'DSCT', 'GDOR', 'NO', 'DCEP', 'CWA:',
       'BY:', 'LPV_MIRA_AGB_C', 'BY+UV', 'GDOR:', 'CEP', 'L:', 'CEPH'],
      dtype=object)

In [18]:
# Type classifications that should be left out of certain graphs and value calculations.
unwanted_types = [
    "BY", "SRA", "CONSTANT", "RS", "L", "SXPHE", "SR", "GDOR", "NO",
    "DCEP", "CWA:", "BY:", "LPV_MIRA_AGB_C", "BY+UV", "GDOR:", "CEP",
    "L:", "CEPH",
]

# Keep every row whose FinalType is not in the exclusion list, then show
# which labels remain.
is_unwanted = filtered["FinalType"].isin(unwanted_types)
wanted_types = filtered.loc[~is_unwanted]
wanted_types["FinalType"].unique()

array(['RRAB', 'RRC', 'EW', 'EB', 'EA', 'RR(B)', 'DSCT'], dtype=object)

one more round of filtering should be done to:
- define the "RR region"
- create a list of objects that are classified as RR in FinalType but OUTSIDE the RR region
- create a list of objects that are NOT classified as RR in FinalType but INSIDE the RR region

Additionally, as noted in the comments above, the thresholds used in the earlier filtering steps should be varied and tested to find the optimal values.

### **GRAPHS & FIGURES**