**Description:** Spot renaming errors in the larval schooling project by comparing file names and file sizes between two directories that should contain the same videos.

In [48]:
import sys, os
import os.path as osp
from glob import glob
import pandas as pd

In [122]:
# Glob patterns for the two directories that should hold the same videos.
# NOTE: these are absolute, machine-specific paths; edit them to run elsewhere.
VIDEO_PATTERNS_1 = ['/home/yaouen/Desktop/larval_schooling/raw_videos/google-drive/*.avi']
VIDEO_PATTERNS_2 = ['/media/yaouen/trilab_data/larval_schooling/raw_videos/*.avi',
                    '/media/yaouen/trilab_data/larval_schooling/raw_videos/short/*.avi']

def file_info(paths):
    """Return one (path, stem, size) tuple per path, in input order.

    stem is the file name without its directory or extension;
    size is the file size in bytes as reported by the file system.
    """
    return [(p, osp.splitext(osp.basename(p))[0], osp.getsize(p)) for p in paths]

# Sorted full paths of every video found in each location.
files1 = sorted(p for pattern in VIDEO_PATTERNS_1 for p in glob(pattern))
files2 = sorted(p for pattern in VIDEO_PATTERNS_2 for p in glob(pattern))

info1 = file_info(files1)
info2 = file_info(files2)

# def name_match(f1,f2,n=4):
#     s1 = osp.splitext(osp.basename(f1))[0].split('_')
#     s2 = osp.splitext(osp.basename(f2))[0].split('_')
#     return s1[:n]==s2[:n]

# def size_match(f1,f2):
#     return os.stat(f1).st_size==os.stat(f2).st_size

In [123]:
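''' 
A file from the first directory counts as a match only when exactly one file in
the second directory has the same size, exactly one has the same name up to the
5th underscore (i.e., same population, day, age, group, and number of fish with
its trial letter), and those two are the same file. Name comparison ignores
case and treats '-' as '_'.

Old files whose name had a pattern error before the 5th underscore are therefore
labeled as mismatches, hence the longish mismatch list.
'''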

def name_match(n, n_fields=5):
    """Return the part of a file name that is compared when looking for a match.

    The name is lower-cased and every '-' is treated as '_', so case errors
    and -/_ errors do not prevent two names from comparing equal.

    n        : file name (no directory, no extension).
    n_fields : number of leading '_'-separated fields to keep (default 5).

    Returns a list of at most n_fields strings; shorter names yield fewer.
    """
    return n.lower().replace('-', '_').split('_')[:n_fields]

# Comparison keys for the second directory, computed once up front instead of
# once per (file1, file2) pair.
keys2 = [name_match(n2) for _f2, n2, _s2 in info2]

matches = []      # [name1, name2] for every unambiguous pairing
mismatches = []   # [name1, candidates by name, candidates by size] otherwise
for f1, n1, s1 in info1:
    key1 = name_match(n1)
    name_matches = [entry[1] for entry, key2 in zip(info2, keys2) if key2 == key1]
    size_matches = [n2 for _f2, n2, s2 in info2 if s1 == s2]
    # Accept only when the name and the size each point to exactly one
    # candidate and both point to the same one; everything else is reported.
    if len(name_matches) == 1 and len(size_matches) == 1 and name_matches[0] == size_matches[0]:
        matches.append([n1, name_matches[0]])
    else:
        mismatches.append([n1, name_matches, size_matches])

# pd.DataFrame(matches)

In [140]:
# Sort every unmatched file into a bucket describing why it failed to match.
# The buckets are created in the order they will later be reported.
mismatch_dict = {
    label: []
    for label in ('No matching size',
                  'Name match and size match are different',
                  'Other')
}

for stem, name_hits, size_hits in mismatches:
    if not size_hits:
        # No file in the other directory has this size.
        reason = 'No matching size'
    elif len(name_hits) == 1 and len(size_hits) == 1:
        # One candidate by name and one by size, but they are not the same file.
        reason = 'Name match and size match are different'
    else:
        reason = 'Other'
    mismatch_dict[reason].append((stem, name_hits, size_hits))


# Print one section per category: the file, then its candidates by name and
# by size, with a dashed rule closing each section.
for mtype, entries in mismatch_dict.items():
    print(f'{mtype}:\n')
    for stem, name_hits, size_hits in entries:
        print(' ', stem)
        print(' ', 'name:', *name_hits)
        print(' ', 'size:', *size_hits)
        print()
    print('-'*20)


No matching size:

  SF_fri_14dpf_groupA_n1-0000
  name: SF_Fri_14dpf_GroupA_n1_20200703_1500
  size:

--------------------
Name match and size match are different:

  SF_Fri_7dpf_GroupA_n2_2020-06-26-123805-0000
  name: SF_fri_7dpf_groupA_n2_20200626_1300
  size: SF_Fri_7dpf_GroupA_n1b_20200626_1330

  SF_Fri_7dpf_GroupA_n5_2020-06-26-120309-0000
  name: SF_Fri_7dpf_groupA_n5_20200626_1300
  size: SF_Fri_7dpf_GroupA_n2b_20200626_1300

  SF_Fri_7dpf_groupA_n2b_2020-06-26-100310-0000
  name: SF_Fri_7dpf_GroupA_n2b_20200626_1300
  size: SF_Fri_7dpf_groupA_n5_20200626_1300

  SF_Sat_14dpf_GroupC_n1b_2020-06-27-164823-0000
  name: SF_Sat_14dpf_GroupC_n1b_20200613_1525
  size: SF_Sat_28dpf_GroupC_n1b_20200627_1645

  SF_Sat_7dpf_GroupD_n2b_CORRUPT_
  name: SF_Sat_7dpf_GroupD_n2b_20200606_1435_CORRUPT
  size: SF_Sat_7dpf_GroupD_n2_20200606_1435_CORRUPT

  SF_fri_7dpf_groupA_n1_2020-06-26-133754-0000
  name: SF_Fri_7dpf_groupA_n1_20200626_1330
  size: SF_fri_7dpf_groupA_n1c_20200626_1300

---