### Setup

In [1]:
### Import useful packages
import numpy as np
import pandas as pd
import os
import socket
from datetime import datetime, timedelta
import multiprocessing
from scipy.sparse import csr_array
from scipy.sparse import lil_array
from scipy.sparse import save_npz
from scipy.sparse import load_npz

In [2]:
# Choose the data directory based on the machine this notebook is running on
hostname = socket.gethostname()

if hostname in ("blpc1", "blpc2"):
    # Both blpc hosts read from the same scratch directory
    data_path = "/datax/scratch/nstieg/"
elif hostname == "cosmic-gpu-1":
    data_path = "/mnt/cosmic-gpu-1/data0/nstiegle/"
else:
    # Unknown machine: stop here instead of continuing with no data directory
    raise Exception("Data path not known")

### Develop filter for hits within N Hz of another hit

In [None]:

# Pickle files inside the data directory.
# data_path ends with "/", so os.path.join yields exactly data_path + filename.
full_dataset_path = os.path.join(data_path, "25GHz_higher.pkl")
coherent_dataset_path = os.path.join(data_path, "25GHz_higher_coherent.pkl")
incoherent_dataset_path = os.path.join(data_path, "25GHz_higher_incoherent.pkl")
coherent_after_1_path = os.path.join(data_path, "25GHz_higher_coherent_post_filter1.pkl")
coherent_after_2_path = os.path.join(data_path, "25GHz_higher_coherent_post_filter2.pkl")
coherent_after_3_path = os.path.join(data_path, "25GHz_higher_coherent_post_filter3.pkl")

# Load the coherent hits table (the only one of the files above read in this cell)
coherent = pd.read_pickle(coherent_dataset_path)

In [3]:
# Load the precomputed adjacency data for the coherent hits.
# Two .npz sparse files are read: one holding distances, one holding a mask.
adjacency_path = os.path.expanduser("~") + "/BL-COSMIC-2024-proj/frequency_adjacency/adjacent_in_coherent/"
distances_path = adjacency_path + "coherent_within_1000hz.distances.npz"
mask_path = adjacency_path + "coherent_within_1000hz.mask.npz"
distances, mask = load_npz(distances_path), load_npz(mask_path)

# Sanity check: each matrix must have exactly one row per coherent hit
assert distances.shape[0] == len(coherent)
assert mask.shape[0] == len(coherent)

In [4]:
def find_collisions_at_threshold(threshold, distance_matrix=None, pair_mask=None):
    """Return a sparse boolean mask of the hit pairs whose distance is <= threshold.

    Conceptually this is ``(distances <= threshold) & mask``: every pair
    under the threshold, including pairs whose distance is exactly zero
    (those are not stored in the sparse distance matrix and are only let in
    by the mask). Evaluating ``distances <= threshold`` directly would turn
    every implicit zero into True and destroy the sparsity, which is far too
    slow. Instead the pairs *outside* the threshold are found first and then
    removed from the mask.

    The number of collisions is ``result.sum()``; the pairs themselves are
    ``result.nonzero()`` and their distances can be read by indexing the
    distance matrix with the result.

    Parameters
    ----------
    threshold : float
        Largest distance that still counts as a collision, in the same units
        as the distance matrix.
    distance_matrix : scipy sparse array, optional
        Pairwise distances. Defaults to the notebook-level ``distances``.
    pair_mask : scipy sparse boolean array, optional
        True for every pair that should be considered, same shape as the
        distance matrix. Defaults to the notebook-level ``mask``.

    Returns
    -------
    scipy sparse boolean array
        True exactly for the masked pairs that are not farther apart than
        ``threshold``.
    """
    # Fall back to the notebook globals only when no explicit matrix is given
    pair_distances = distances if distance_matrix is None else distance_matrix
    known_pairs = mask if pair_mask is None else pair_mask

    # Pairs strictly outside the threshold (stays sparse because threshold >= 0
    # means implicit zeros never satisfy the comparison)
    outside_threshold = pair_distances > threshold

    # We want known_pairs & ~outside_threshold, but ~outside_threshold is mostly
    # True and therefore dense. Subtracting boolean sparse arrays acts as xor:
    # it keeps pairs that are in the mask but not outside the threshold, plus
    # pairs outside the threshold that are not in the mask. Multiplying by the
    # mask (logical and) discards that second group.
    return known_pairs.multiply(known_pairs - outside_threshold)

In [6]:
# Mask of hit pairs that lie within 2hz of each other
# (threshold 2e-6 presumably because the distances are stored in MHz - TODO confirm)
within_2_hz = find_collisions_at_threshold(threshold=2e-6)
print(within_2_hz.sum(), "collisions")

In [11]:
# Every hit that appears as a row or a column of some close pair, sorted and de-duplicated
indices_of_hits_within_2hz = np.union1d(*within_2_hz.nonzero())
print(len(indices_of_hits_within_2hz))

In [17]:
# Build a dataframe of every coherent hit that has no neighbor within 2hz

# Positional flags: True where the hit has another hit within 2hz
hit_with_close_neighbor = np.zeros(len(coherent), dtype=bool)
hit_with_close_neighbor[indices_of_hits_within_2hz] = True

# Complement: True where the hit has no other hit within 2hz
hits_with_no_close_neighbor = np.logical_not(hit_with_close_neighbor)

# Select those rows by position
lonely_hits = coherent.iloc[hits_with_no_close_neighbor]

print(len(lonely_hits))

2262302


In [None]:
# Get datasets of just the data that had a collision or didn't
# NOTE(review): `hit_indices` and `first_source` are not defined in any visible cell of this
# notebook and this cell has no execution count, so it will raise NameError on a fresh
# Restart & Run All. Presumably left over from an earlier single-source experiment - confirm
# whether it should be removed or rewired to `within_2_hz.nonzero()` and `coherent`.

# Sorted, de-duplicated positions of every hit that takes part in a collision
indices_of_hits_with_collisions = np.unique(np.concatenate(hit_indices))
# Positional boolean selector over the rows of first_source
bool_indices_of_hits_with_collisions = np.zeros(len(first_source), dtype=bool)
bool_indices_of_hits_with_collisions[indices_of_hits_with_collisions] = True
# Split the rows into those flagged above and the rest
zero_distance_dataset = first_source.iloc[bool_indices_of_hits_with_collisions]
non_zero_distance_dataset = first_source.iloc[~bool_indices_of_hits_with_collisions]
print(f"Collisions dataset: {zero_distance_dataset.shape}")
print(f"No Collisions dataset: {non_zero_distance_dataset.shape}")

### Look for hits which drifted from a previous hit

In [3]:
from astropy.time import Time

# The pickle read below appears to have been produced by the following one-off snippet
# (kept commented out for provenance; it is the only user of `Time`):
# # Get the coherent dataset but with all the columns
# full_dataset_path = data_path + "25GHz_higher.pkl"
# df = pd.read_pickle(full_dataset_path)
# full_coherent = df[(df.source_name != "Incoherent") & (df.source_name != "PHASE_CENTER")].copy()
# full_coherent["tstart_h"] = Time(full_coherent["tstart"], format="mjd").datetime
# full_coherent.to_pickle(data_path + "25GHz_higher_coherent_all_columns.pkl")

# Read in the coherent dataset with all columns
# (data_path ends with "/", so os.path.join yields exactly data_path + filename)
full_coherent = pd.read_pickle(os.path.join(data_path, "25GHz_higher_coherent_all_columns.pkl"))

In [4]:
def sigma_drift_rate(hit):
    """Return the uncertainty on the drift rate of a single hit.

    The drift rate is dr = df / dt (change in frequency over change in time),
    so standard error propagation for a quotient gives

        sigma_dr = |dr| * sqrt((sigma_df / df)**2 + (sigma_dt / dt)**2)

    Substituting df = dr * dt and simplifying yields the equivalent form used
    here,

        sigma_dr = sqrt(sigma_df**2 + (dr * sigma_dt)**2) / dt

    which never divides by the drift rate and therefore stays finite for a
    hit with zero drift (where it reduces to sigma_df / dt).

    Parameters
    ----------
    hit : row of the hits dataframe (or any object with the same attributes)
        Must provide ``tsamp``, ``signal_num_timesteps`` and
        ``signal_drift_rate``.

    Returns
    -------
    float
        Non-negative error on the drift rate, in the units of
        ``signal_drift_rate``.
    """
    signal_dt = hit.tsamp * hit.signal_num_timesteps # Total number of seconds observed for
    signal_dr = hit.signal_drift_rate # Drift rate observed
    sigma_df = 2 # Error in measured frequency - 2Hz bins
    sigma_dt = hit.tsamp # Error in measured time - tsamp integration time per timestep

    # abs() keeps the result non-negative exactly as the original expression did
    return abs(np.sqrt(sigma_df**2 + (signal_dr * sigma_dt)**2) / signal_dt)
    

In [5]:
# So if a hit has a drift rate, we might expect to see another signal that far away from it
# if there's a close re-observation. Let's see if we can look for that.
# This cell walks through the idea for one hit: the first hit of the first observation
# of the first source, checked against the second observation of that source.

# So we should group by source
sources = full_coherent.groupby('source_name')
source_names = list(sources.groups.keys())

# We'll just look at the first source
source = sources.get_group(source_names[0])

# And then let's group by time
times = source.groupby('tstart_h')
all_times = list(times.groups.keys())

# Then let's look at the first hit in that time
first_time = times.get_group(all_times[0])
hit = first_time.iloc[0]

# Some info about that hit
f = hit.signal_frequency
dr = hit.signal_drift_rate
dr_mhz = dr * 1e-6
source_name = hit.source_name
time = hit.tstart_h
print(f"The first hit is of source {source_name} at {round(f, 6)}MHz with drift rate {round(dr, 3)}Hz/s at {time}")

# Now let's look in the next time to see if there's a signal within that drift rate
dt_h = all_times[1] - all_times[0]
dt = dt_h.total_seconds()
print(f"The next time is {dt} seconds laer")
sigma_dr_hz = sigma_drift_rate(hit)
drift_hz = dt * dr
drift_mhz = dt * dr_mhz
sigma_drift_hz = dt * sigma_dr_hz
print(f"So {dt} seconds drifting at {round(dr, 3)} +/- {round(sigma_dr_hz, 3)}Hz/s gives {round(drift_hz, 3)} +/- {round(sigma_drift_hz, 3)}Hz expected drift")

# Frequencies are in MHz, so convert the drift (Hz) before adding it
target_f = f + (drift_hz * 1e-6)
second_time = times.get_group(all_times[1])
# Search half-width in MHz: the propagated drift error, but never narrower than 2 Hz
ebar_mhz = max(sigma_drift_hz * 1e-6, 2 * 1e-6) # Mhz
candidates = second_time[(second_time.signal_frequency > target_f - ebar_mhz) & (second_time.signal_frequency < target_f + ebar_mhz)]
print(f"So there is/are {len(candidates)} candidate(s) within +/- {round(ebar_mhz * 1e6, 3)}Hz of the target frequency of original_frequency + {round(drift_hz, 3)}Hz")

if candidates.empty:
    # Nothing landed in the search window, so there is no closest candidate to describe
    print("No candidate was found in the next observation")
else:
    # Order candidates by how far they are from the target frequency (nearest first),
    # not by raw frequency, so that row 0 really is the closest one
    offsets_from_target = (candidates.signal_frequency - target_f).abs().to_numpy()
    closest_candidates = candidates.iloc[np.argsort(offsets_from_target, kind="stable")].reset_index(drop=True)
    closest_candidate = closest_candidates.iloc[0]
    print(f"The closest candidate was {round((closest_candidate.signal_frequency - target_f) * 1e6, 3)} Hz off, {round((closest_candidate.signal_frequency - f) * 1e6, 3)}Hz from the original hit")

The first hit is of source 2535280716217508992 at 33262.648879MHz with drift rate -0.243Hz/s at 2023-12-29 03:49:37.462265
The next time is 58.720261 seconds laer
So 58.720261 seconds drifting at -0.243 +/- 0.239Hz/s gives -14.242 +/- 14.028Hz expected drift
So there is/are 1 candidate(s) within +/- 14.028Hz of the target frequency of original_frequency + -14.242Hz
The closest candidate was 10.427 Hz off, -3.815Hz from the original hit


In [6]:
# TODO / FUTURE WORK
# - Figure out what the distribution of dts are for sources
# - The results of this search approach are naturally described by a directed graph (or collection of trees)
#   where nodes are hits and hits point to hits they may have drifted to. By following 'chains' in this directed
#   graph you might find hits which continue to drift for some time or change drift rate (maybe sinusoidally?)

# Parameters of search
max_drift_time_to_search = 10 * 60 # in seconds
min_frequency_error_mhz = 2 * 1e-6 # Search window is never narrower than +/- 2 Hz (in MHz)

# Setup dataframe to flag hits which are validated by search
full_coherent["valid"] = False

# Collected during the search and applied in one pass afterwards, so the full
# table is not rescanned for every single hit
validated_hit_ids = set()          # ids of hits that had at least one drift candidate
validated_candidate_labels = []    # index labels of the candidate hits they matched

# Do search within each source
for source_name, source_group in full_coherent.groupby('source_name'):
    # Group by time and figure out what all the times observed are
    time_groups = source_group.groupby('tstart_h')
    time_names = list(time_groups.groups.keys())

    # Look at all times for this source which have a following time (all but the last)
    for t_idx in range(0, len(time_names) - 1):
        this_time = time_names[t_idx]
        next_time = time_names[t_idx + 1]
        dt = (next_time - this_time).total_seconds()

        # Only look for drifted signals if the source was observed again
        # within the maximum drift time
        if dt > max_drift_time_to_search:
            continue

        time_group = time_groups.get_group(this_time)
        next_time_group = time_groups.get_group(next_time)
        for i, hit in time_group.iterrows():
            # Ignore zero drift rate signals
            if hit.signal_drift_rate == 0:
                continue

            # Compute some useful quantities
            drift = (dt * hit.signal_drift_rate) * 1e-6 # Total drift in MHz
            sigma_drift = max((dt * sigma_drift_rate(hit)) * 1e-6, min_frequency_error_mhz) # Error in drift in Mhz
            expected_new_frequency = hit.signal_frequency + drift # Where we expect it to drift to in Mhz

            # Get candidate hits from the next time
            candidates = next_time_group[(next_time_group.signal_frequency > expected_new_frequency - sigma_drift) &
                                        (next_time_group.signal_frequency < expected_new_frequency + sigma_drift)]

            # Only when there was a match between this hit and a hit in the target range
            # are both validated; a hit with no candidates stays unvalidated
            if not candidates.empty:
                validated_hit_ids.add(hit.id)
                validated_candidate_labels.extend(candidates.index)

# Flag the originating hits (matched by id) and their candidates (matched by index label)
is_validated = full_coherent["id"].isin(validated_hit_ids) | full_coherent.index.isin(validated_candidate_labels)
full_coherent.loc[is_validated, 'valid'] = True

results = full_coherent.loc[full_coherent["valid"], "id"]
# NOTE(review): absolute, user-specific output path; other cells build paths from
# os.path.expanduser("~") - confirm this is correct on every supported host
np.save("/home/nstieg/BL-COSMIC-2024-proj/filters/filter11/run_filter_11_coherent_results.npy", results.values)