<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Compare-SurveyIDs-across-2-files" data-toc-modified-id="Compare-SurveyIDs-across-2-files-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Compare SurveyIDs across 2 files</a></span><ul class="toc-item"><li><span><a href="#Select-files-to-compare" data-toc-modified-id="Select-files-to-compare-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Select files to compare</a></span></li><li><span><a href="#Read-the-files-and-find-matching-SurveyIDs:" data-toc-modified-id="Read-the-files-and-find-matching-SurveyIDs:-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Read the files and find matching SurveyIDs:</a></span></li></ul></li><li><span><a href="#Find-potential-pairs" data-toc-modified-id="Find-potential-pairs-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Find potential pairs</a></span></li></ul></div>

In [None]:
# Last changed 2024.12.11

# Compare SurveyIDs across 2 files

This notebook is part of the 2024 Spyfish data cleaning process. Given two tabular files, it checks which SurveyIDs are present in one or both files.

In [None]:
import pandas as pd

from collections import defaultdict
from pprint import pprint
from ipyfilechooser import FileChooser
from IPython.display import display

## Select files to compare

In [None]:
# File picker for the first input; only CSV and Excel files are listed.
file_1_chooser = FileChooser(
    title='<b>Select the first file to compare:</b>',
    filter_pattern=['*.csv', '*.xls*'],
)
display(file_1_chooser)


In [None]:
file_path_1 = file_1_chooser.selected
# Guard against running this cell before a file has been picked in the chooser.
assert file_path_1 is not None, "Select file 1 in the cell above."
file_path_1

In [None]:
# File picker for the second input; only CSV and Excel files are listed.
file_2_chooser = FileChooser(
    title='<b>Select the second file to compare:</b>',
    filter_pattern=['*.csv', '*.xls*'],
)
display(file_2_chooser)

In [None]:
file_path_2 = file_2_chooser.selected
# Guard against running this cell before a file has been picked in the chooser.
assert file_path_2 is not None, "Select file 2 in the cell above."
file_path_2

## Read the files and find matching SurveyIDs:

In [None]:
def get_df_surveyIDs(file_path):
    """Read a tabular file and return its non-missing SurveyIDs.

    The column holding the SurveyIDs must be called "SurveyID". Files whose
    name ends in ".csv" (in any letter case) are read with ``pd.read_csv``;
    every other file is handed to ``pd.read_excel``.

    Args:
        file_path: Path of the CSV or Excel file to read.

    Returns:
        pandas.Series: The "SurveyID" column with missing values dropped.

    Raises:
        KeyError: If the file has no column named "SurveyID".
    """
    # Compare the extension case-insensitively so that e.g. "DATA.CSV"
    # is not sent to the Excel reader by mistake.
    if str(file_path).lower().endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    # Fail with an actionable message instead of a bare KeyError('SurveyID').
    if "SurveyID" not in df.columns:
        raise KeyError(
            f"No 'SurveyID' column in {file_path}; found columns: {list(df.columns)}"
        )
    return df["SurveyID"].dropna()

In [None]:
def find_matching_surveyIDs(file_path_1, file_path_2):
    """Compare the SurveyIDs of two files and report how they overlap.

    Prints a summary (each group sorted) and returns a dictionary of sets:
        common: SurveyIDs present in both files
        only_in_f1: SurveyIDs present only in file 1
        only_in_f2: SurveyIDs present only in file 2
    """
    ids_1 = set(get_df_surveyIDs(file_path_1))
    ids_2 = set(get_df_surveyIDs(file_path_2))

    common = ids_1 & ids_2
    only_in_f1 = ids_1 - ids_2
    only_in_f2 = ids_2 - ids_1

    # Sort every group before printing anything, so a sorting failure
    # cannot leave a half-printed report.
    report = [
        (f"\nThe two files have the following {len(common)} SurveyIds in common:",
         sorted(common)),
        (f"\nThe {len(only_in_f1)} SurveyIDs present only in {file_path_1} are:",
         sorted(only_in_f1)),
        (f"\nThe {len(only_in_f2)} SurveyIDs present only in {file_path_2} are:",
         sorted(only_in_f2)),
    ]

    print(f"Reviewing files {file_path_1} and {file_path_2}")
    for heading, survey_ids in report:
        print(heading)
        print(survey_ids)

    return {
        "common": common,
        "only_in_f1": only_in_f1,
        "only_in_f2": only_in_f2,
    }

In [None]:
# Example usage: run the comparison and unpack the three groups of SurveyIDs.
survey_comparison = find_matching_surveyIDs(file_path_1, file_path_2)
common, only_in_f1, only_in_f2 = (
    survey_comparison[group] for group in ("common", "only_in_f1", "only_in_f2")
)

# Find potential pairs
This next step groups SurveyIDs from two files by their shared Marine reserve code. It allows us to check visually whether there are any potential pairs (for example, SurveyIDs that differ only by a small date discrepancy).

In [None]:
# Default factory for the defaultdict in potential_pairs.
def nl():
    """Return a fresh pair of empty lists: [IDs from list 1, IDs from list 2]."""
    return [[], []]


def potential_pairs(survey_list_1, survey_list_2, code_length=3):
    """Group SurveyIDs from two collections by their leading characters.

    The first ``code_length`` characters of each SurveyID are used as the
    grouping key (taken to be the marine reserve code). Only keys that have
    at least one SurveyID in *both* collections are kept.

    Args:
        survey_list_1: Iterable of SurveyIDs (sliceable values such as strings).
        survey_list_2: Iterable of SurveyIDs to pair against the first one.
        code_length: Number of leading characters that form the grouping key.
            Defaults to 3, the prefix length this notebook has always used.

    Returns:
        dict: Maps each shared key to ``[ids_from_list_1, ids_from_list_2]``,
        with the IDs in the order in which they were iterated.
    """
    possible_pairs = defaultdict(nl)
    # Slot 0 collects IDs from the first collection, slot 1 from the second.
    for slot, survey_ids in enumerate((survey_list_1, survey_list_2)):
        for s_id in survey_ids:
            possible_pairs[s_id[:code_length]][slot].append(s_id)

    # Keep only keys that are represented on both sides.
    return {
        code: [ids_1, ids_2]
        for code, (ids_1, ids_2) in possible_pairs.items()
        if ids_1 and ids_2
    }

# Report potential pairs for each pairing of the three SurveyID groups.
print("Potential pairs between SurveyIDs only found in the two files:")
pprint(potential_pairs(only_in_f1, only_in_f2))
print("\nPotential pairs between SurveyIDs found between the first file and the SurveyIDs in common")
pprint(potential_pairs(common, only_in_f1))
# This call compares against only_in_f2, so the heading names the second file
# (it previously repeated the "first file" heading).
print("\nPotential pairs between SurveyIDs found between the second file and the SurveyIDs in common")
pprint(potential_pairs(common, only_in_f2))

