In [93]:
import glob
import json
import pandas as pd
import os


In [3]:
DATA_PATH = "./DATA"

In [161]:
class Annotest:

    SEP = "-"
    # FKEYS = ["app_code 0", "sub_id 1", "file_class 2", "file_ID 3", "comment 4", "file_format 5", "file_name 6", "file_path 7"]
    FKEYS = ["app_code", "sub_id", "file_class", "file_ID", "comment", "file_format", "file_name", "file_path"]

    ANOT_FILE_TYPES = ["txt", "csv"]
    ANOT_LABELS = ["aa", "ee", "uu", "oo", "bb", "ii", "xx"]
    ALL_FILES_DF: pd.DataFrame

    SUBJECTS: tuple[int, list]

    def __init__(self, dataset_path: str) -> None:
        all_files = glob.glob(f"{dataset_path}/*/pnoistor_*")

        self.ALL_FILES_DF = self.make_files_df(all_files)

        self.SUBJECTS = self.get_subject_list()

    def make_files_df(self, all_files: list) -> pd.DataFrame:
        files = [self.file_dict(fp) for fp in all_files]
        return pd.DataFrame(files)
    
    def make_metadata_df(self):
        meta_file_df = self.get_metadata_file_df()

        meta_data = [
            mf.to_dict() | self.read_json(mf[self.FKEYS[7]])
            
            for _, mf in meta_file_df.iterrows()
        ]

        return pd.DataFrame(meta_data)
    
    def read_json(self, fpath):
        with open(fpath, "r") as m:
            return json.load(m)["subjectBiodata"]

    def file_dict(self, path) -> dict:
        _, fname = os.path.split(path)
        fitems = fname.replace(".", self.SEP).split(self.SEP) + [fname, path]
        fdict = dict(zip(self.FKEYS, fitems))

        return fdict

    def get_files_df(self) -> pd.DataFrame:
        return self.ALL_FILES_DF

    def get_subject_list(self) -> list:
        s = pd.unique(self.ALL_FILES_DF[self.FKEYS[1]])
        return (len(s), s)
    
    def get_subject_file_df(self, sid: str) -> list:
        filt = (self.ALL_FILES_DF[self.FKEYS[1]] == sid)
        return self.ALL_FILES_DF.loc[filt]

    def get_metadata_file_df(self) -> pd.DataFrame:
        filt = (self.ALL_FILES_DF[self.FKEYS[2]].str.contains("META"))
        filt &= (
            self.ALL_FILES_DF[self.FKEYS[5]].str.contains("json")
        )
        return self.ALL_FILES_DF.loc[filt]

    def get_aud_file_df(self, when="") -> pd.DataFrame:
        filt = (self.ALL_FILES_DF[self.FKEYS[2]].str.contains("BA_")) 
        filt &= (
            self.ALL_FILES_DF[self.FKEYS[5]].str.contains("wav")
        )
        if len(when) > 0:
            filt &= self.ALL_FILES_DF[self.FKEYS[2]].str.contains(when)
        return self.ALL_FILES_DF.loc[filt]

    def get_pft_file_df(self, when="") -> pd.DataFrame:
        filt = (self.ALL_FILES_DF[self.FKEYS[2]].str.contains("PFT_")) 
        filt &= (
            self.ALL_FILES_DF[self.FKEYS[5]].str.contains("csv")
        )
        if len(when) > 0:
            filt &= self.ALL_FILES_DF[self.FKEYS[2]].str.contains(when)
        return self.ALL_FILES_DF.loc[filt]

    def get_aud_anote_file_df(self, when="") -> pd.DataFrame:
        filt = (self.ALL_FILES_DF[self.FKEYS[2]].str.contains("BA_")) 
        filt &= (
            self.ALL_FILES_DF[self.FKEYS[5]].str.contains("txt")
        )
        if len(when) > 0:
            filt &= self.ALL_FILES_DF[self.FKEYS[2]].str.contains(when)
        return self.ALL_FILES_DF.loc[filt]

    def get_pft_anote_file_df(self, when="") -> pd.DataFrame:
        filt = (self.ALL_FILES_DF[self.FKEYS[2]].str.contains("PFT_")) 
        filt &= (
            self.ALL_FILES_DF[self.FKEYS[5]].str.contains("csv")
        )
        if len(when) > 0:
            filt &= self.ALL_FILES_DF[self.FKEYS[2]].str.contains(when)
        return self.ALL_FILES_DF.loc[filt]
    

    # Tests

    def get_anotless(self):
        MATCH_KEY = "match_key"
        f_aud = self.get_aud_file_df()
        f_anot = self.get_aud_anote_file_df()

        f_aud.loc[:, MATCH_KEY] = f_aud.loc[:, self.FKEYS[1]] + f_aud.loc[:, self.FKEYS[2]] + f_aud.loc[:, self.FKEYS[3]]
        f_anot.loc[:, MATCH_KEY] = f_anot.loc[:, self.FKEYS[1]] + f_anot.loc[:, self.FKEYS[2]] + f_anot.loc[:, self.FKEYS[3]]

        f_merge = f_aud.merge(f_anot, how="left", on=MATCH_KEY)

        return f_merge[f_merge["file_name_y"].isna()]




annotest = Annotest(DATA_PATH)

annotest.ALL_FILES_DF

annotest.get_subject_file_df(annotest.SUBJECTS[1][2])



Unnamed: 0,app_code,sub_id,file_class,file_ID,comment,file_format,file_name,file_path
26,pnoistor_feb2023,viddrrn_cad134b1,LBA_before_RU,c49a,comnt,txt,pnoistor_feb2023-viddrrn_cad134b1-LBA_before_R...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
27,pnoistor_feb2023,viddrrn_cad134b1,LBA_after_RU,b647,comnt,txt,pnoistor_feb2023-viddrrn_cad134b1-LBA_after_RU...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
28,pnoistor_feb2023,viddrrn_cad134b1,LBA_after_RU,b647,comnt,wav,pnoistor_feb2023-viddrrn_cad134b1-LBA_after_RU...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
29,pnoistor_feb2023,viddrrn_cad134b1,META,f4e3,comnt,json,pnoistor_feb2023-viddrrn_cad134b1-META-f4e3-co...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
30,pnoistor_feb2023,viddrrn_cad134b1,LBA_before_LL,477d,comnt,txt,pnoistor_feb2023-viddrrn_cad134b1-LBA_before_L...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
31,pnoistor_feb2023,viddrrn_cad134b1,PFT_before,fb03,comnt,pdf,pnoistor_feb2023-viddrrn_cad134b1-PFT_before-f...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
32,pnoistor_feb2023,viddrrn_cad134b1,PFT_after,b3f9,comnt,csv,pnoistor_feb2023-viddrrn_cad134b1-PFT_after-b3...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
33,pnoistor_feb2023,viddrrn_cad134b1,LBA_after_LL,f4fc,comnt,wav,pnoistor_feb2023-viddrrn_cad134b1-LBA_after_LL...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
34,pnoistor_feb2023,viddrrn_cad134b1,VBA_after,e7dc,comnt,txt,pnoistor_feb2023-viddrrn_cad134b1-VBA_after-e7...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...
35,pnoistor_feb2023,viddrrn_cad134b1,LBA_after_LL,f4fc,comnt,txt,pnoistor_feb2023-viddrrn_cad134b1-LBA_after_LL...,./DATA/viddrrn_cad134b1/pnoistor_feb2023-viddr...


In [151]:
# annotest.get_anotless()[annotest.get_anotless().value_counts() > 1]

annotest.get_anotless()["match_key"].value_counts().to_frame()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,match_key
amzpiym_84041e0a7d9eLBA_before_RU,1
zqpidbg_8ba6669d12feLBA_before_RU,1
wwxrzdz_be74bef15eaeVBA_before,1
wwxrzdz_be74bef1b0c6LBA_after_RU,1
zowddvk_5df23ac29a84LBA_before_LL,1
