# Match SurveyIDs across 2 files
This notebook checks which SurveyIDs are present in one or both files

In [1]:
import pandas as pd

from collections import defaultdict
from pprint import pprint
from ipyfilechooser import FileChooser
from IPython.display import display

#### Find matching SurveyIds in two files
Further below is the work in progress

In [18]:
def get_df_surveyIDs(file_path):
    if file_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    return df["SurveyID"].dropna()

In [19]:
def find_matching_surveyIDs(file_path_1, file_path_2):
    f1_surveyIDs = get_df_surveyIDs(file_path_1)
    f2_surveyIDs = get_df_surveyIDs(file_path_2)
    
    common = set(f1_surveyIDs) & set(f2_surveyIDs)
    only_in_f1 = set(f1_surveyIDs) - set(f2_surveyIDs)
    only_in_f2 = set(f2_surveyIDs) - set(f1_surveyIDs)
    
    common_list = sorted(list(common))
    only_in_f1_list = sorted(list(only_in_f1))
    only_in_f2_list = sorted(list(only_in_f2))
    
    print(f"Reviewing files {file_path_1} and {file_path_2}" )
    print(f"The two files have the following {len(common)} SurveyIds in common:")
    print(common_list)
    print(f"The {len(only_in_f1)} SurveyIDs present only in {file_path_1} are:")
    print(only_in_f1_list)
    print(f"The {len(only_in_f2)} SurveyIDs present only in {file_path_2} are:")
    print(only_in_f2_list)
    return common, only_in_f1, only_in_f2

In [20]:
file_1_chooser = FileChooser(title='<b>Select the first file to compare:</b>')
file_1_chooser.filter_pattern = ['*.csv', '*.xls*']
display(file_1_chooser)


FileChooser(path='/Users/kalindi/Desktop/dt/spyfish/notebooks', filename='', title='<b>Select the first file t…

In [25]:
file_path_1 = file_1_chooser.selected
assert file_path_1 != None, "Select file 1 in the cell above."
file_path_1

'/Users/kalindi/Desktop/dt/spyfish/og/IDcomparison/List of files to merge.xlsx'

In [26]:
file_2_chooser = FileChooser(title='<b>Select the second file to compare:</b>')
file_2_chooser.filter_pattern = ['*.csv', '*.xls*']
display(file_2_chooser)

FileChooser(path='/Users/kalindi/Desktop/dt/spyfish/notebooks', filename='', title='<b>Select the second file …

In [27]:
file_path_2 = file_2_chooser.selected
assert file_path_2 != None, "Select file 2 in the cell above."
file_path_2

'/Users/kalindi/Desktop/dt/spyfish/og/IDcomparison/surveys_buv_doc.xlsx'

In [28]:
# example usage
common, only_in_f1, only_in_f2 = find_matching_surveyIDs(file_path_1, file_path_2)

Reviewing files /Users/kalindi/Desktop/dt/spyfish/og/IDcomparison/List of files to merge.xlsx and /Users/kalindi/Desktop/dt/spyfish/og/IDcomparison/surveys_buv_doc.xlsx
The two files have the following 32 SurveyIds in common:
['AHE_20160819_BUV', 'AHE_20180301_BUV', 'AHE_20200301_BUV', 'AHE_20220422_BUV', 'AHE_20230515_BUV', 'AHE_20240501_BUV', 'AKA_20170223_BUV', 'AKA_20181216_BUV', 'AKA_20210127_BUV', 'AKA_20230201_BUV', 'ANG_20210313_BUV', 'CRP_20220407_BUV', 'HOR_20211122_BUV', 'HOR_20240408_BUV', 'KOK_20240219_BUV', 'OKU_20201001_BUV', 'OKU_20230327_BUV', 'PAR_20120224_BUV', 'PKI_20240701_BUV', 'POU_20170223_BUV', 'POU_20181216_BUV', 'POU_20210127_BUV', 'POU_20230201_BUV', 'RON_20210125_BUV', 'SLI_20110413_BUV', 'TAW_20220407_BUV', 'TON_20211026_BUV', 'TON_20221205_BUV', 'TUH_20200830_BUV', 'TUH_20210309_BUV', 'TUH_20240307_BUV', 'WGI_20220518_BUV']
The 13 SurveyIDs present only in /Users/kalindi/Desktop/dt/spyfish/og/IDcomparison/List of files to merge.xlsx are:
['AHE_20210301_BU

### Find potential pairs
Combine SurveyIDs with the same Marine reserve code. 

Check visually to see if there are any small date discrepancies indicating that they are the same SurveyID

In [29]:
# create nested list to be used with the defaultdict in potential_pairn
def nl():
    return [[],[]]
    
def potential_pairs(survey_list_1, survey_list_2):
    possible_pairs = defaultdict(nl)
    for s_id in survey_list_1:
        possible_pairs[s_id[:3]][0].append(s_id)
        
    for s_id in survey_list_2:
        possible_pairs[s_id[:3]][1].append(s_id)

    possible_matches = {}
    for k, (s1, s2) in possible_pairs.items():
        if s1 != [] and s2 != []:
            possible_matches[k] = [s1, s2]
    return possible_matches

print("Potential pairs between SurveyIDs only found in the two files:")
pprint(potential_pairs(only_in_f1, only_in_f2)) 
print("Potential pairs between SurveyIDs found between the first file and the SurveyIDs in common")
pprint(potential_pairs(common, only_in_f1)) 
print("Potential pairs between SurveyIDs found between the first file and the SurveyIDs in common")
pprint(potential_pairs(common, only_in_f2)) 



Potential pairs between SurveyIDs only found in the two files:
{'KAP': [['KAP_20210201_BUV', 'KAP_20231018_BUV'], ['KAP_20200726_BUV']]}
Potential pairs between SurveyIDs found between the first file and the SurveyIDs in common
{'AHE': [['AHE_20200301_BUV',
          'AHE_20220422_BUV',
          'AHE_20180301_BUV',
          'AHE_20230515_BUV',
          'AHE_20240501_BUV',
          'AHE_20160819_BUV'],
         ['AHE_20210301_BUV']],
 'HOR': [['HOR_20240408_BUV', 'HOR_20211122_BUV'], ['HOR_20231201_BUV']],
 'KOK': [['KOK_20240219_BUV'], ['KOK_20230219_BUV']],
 'PKI': [['PKI_20240701_BUV'], ['PKI_20150501_BUV']],
 'RON': [['RON_20210125_BUV'], ['RON_20230130_BUV']]}
Potential pairs between SurveyIDs found between the first file and the SurveyIDs in common
{'SLI': [['SLI_20110413_BUV'], ['SLI_20220228_BUV', 'SLI_20240124_BUV']]}
