# Finding hits at the same frequency in the same sources

Noah Stiegler

7/9/24

If a source records two hits at exactly the same frequency but at two different times, we can be fairly confident this is RFI, because we expect real ET signals to drift: the light is Doppler shifted by the differing accelerations among the cosmic motions of the Earth, the Sun, and the transmitting source. It would be extremely unlikely for these accelerations to sync up exactly, so unless the ET source were directing a signal at Earth and deliberately correcting for this Doppler drift in their transmission (meaning they're transmitting specifically to the Earth itself), there will be some drift. However, with COSMIC, we can't make cuts based on a 0 drift rate signal because our drift-rate resolution isn't fine enough to tell the difference between something drifting at a rate of 0.0000001 Hz/s (RFI likely) and something more like 0.1 Hz/s (note the smallest magnitude nonzero drift rate in the coherent hits is 0.242531920 Hz/s). However, we can artificially increase our sensitivity to low (zero) drift rates by looking for the same signal across multiple times of observation. If we see multiple hits across time at exactly the same frequencies (or within a small narrow band), then it's likely RFI.

### Setup

In [46]:
# Import useful packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_array
from scipy.sparse import lil_array
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import os
from datetime import datetime, timedelta

In [47]:
# Load the hit table(s) from disk

# The data may live in different places on different servers, so identify this machine first
import socket
hostname = socket.gethostname()

# Bail out early on any machine whose data layout we don't know
if hostname not in ("blpc1", "blpc2"):
    raise Exception("Data path not known")

# Dataset locations on the blpc machines
full_dataset_path = "/datax/scratch/nstieg/25GHz_higher.pkl"
coherent_dataset_path = "/datax/scratch/nstieg/25GHz_higher_coherent.pkl"
incoherent_dataset_path = "/datax/scratch/nstieg/25GHz_higher_incoherent.pkl"

# Only the coherent hits are loaded; the other two reads are left disabled
coherent = pd.read_pickle(coherent_dataset_path)
# incoherent = pd.read_pickle(incoherent_dataset_path)
# df = pd.read_pickle(full_dataset_path)

### Look for hits within each source which are at the same frequencies

In [48]:
# Maximum separation (in Hz) for two hits to be called 'adjacent'; their
# relative distance is recorded whenever they fall within it
threshold_hz = 1000

# The same threshold expressed in MHz, converted with a named factor
HZ_TO_MHZ = 1e-6
threshold = threshold_hz * HZ_TO_MHZ

##### Setup Algorithm

In [49]:
def find_adjacent_distances(data, window_width):
    """Find every pair of points that lie within ``window_width`` of each other.

    The values are sorted, then each point is compared with the points that
    follow it in sorted order until one falls outside the window. Because the
    values are sorted, no later point can be inside the window either.

    Parameters:
        data: one-dimensional collection of values (e.g. a pandas Series of
            frequencies). Rows and columns of the returned matrices are the
            *positions* of the points in ``data``; for a Series with a reset
            index (0...n-1) these coincide with the index labels.
        window_width: largest separation, in the same units as ``data``, at
            which two points still count as adjacent (the bound is inclusive).

    Returns:
        (distances, mask) tuple of n x n scipy sparse lil_arrays, both upper
        triangular (row < column):
        distances holds the float32 separation of each adjacent pair.
        mask holds True for each adjacent pair. It is needed alongside
        distances because a separation of exactly 0 is indistinguishable from
        an empty entry when the distances matrix is read back.
    """
    values = np.asarray(data)
    num_hits = len(values)

    # Sort by value, remembering where each sorted point sat in the input.
    # order[k] is the input position of the k-th smallest value.
    order = np.argsort(values, kind="stable")
    sorted_values = values[order]

    # Setup empty arrays, indexed by position in the original data
    mask = lil_array((num_hits, num_hits), dtype=bool)
    distances = lil_array((num_hits, num_hits), dtype=np.float32)

    for i, datum in enumerate(sorted_values):
        for j in range(i + 1, num_hits):
            gap = abs(sorted_values[j] - datum)

            # First point outside the window ends the scan for this datum.
            # Written as "not <=" so a NaN gap also ends the scan.
            if not gap <= window_width:
                break

            # Map back to input positions, keeping the entry upper triangular
            u, v = order[i], order[j]
            if u > v:
                u, v = v, u

            # Set elements in matrix
            mask[u, v] = True
            distances[u, v] = gap

    return distances, mask

##### Find Adjacent Hits

In [50]:
# Find adjacent hits within the data for each source individually
path = "/home/nstieg/BL-COSMIC-2024-proj/frequency_adjacency/adjacent_in_each_source/" # Place to save arrays
grouped = coherent.groupby('source_name')
for source, source_data in grouped:
    # Results are saved as one distances file and one mask file per source and threshold
    distances_file_path = path + f'{source}_within_{round(threshold_hz)}hz.distances.npz'
    mask_file_path = path + f'{source}_within_{round(threshold_hz)}hz.mask.npz'

    # Skip sources already done; if either file is missing, both are recomputed
    if os.path.exists(distances_file_path) and os.path.exists(mask_file_path):
        continue

    # Log progress
    print(f"Starting source [{source}]")

    # Compute results. The index is reset so that matrix row/column k refers
    # to the k-th hit of this source.
    frequencies = source_data["signal_frequency"].copy().reset_index(drop=True)
    distances, mask = find_adjacent_distances(frequencies, threshold)

    # Save results to file (converted to CSR before writing)
    save_npz(distances_file_path, csr_array(distances))
    save_npz(mask_file_path, csr_array(mask))