### Generate List of Data Files and Write to CSV

#### There are 4 basic data types

* `mp4`
  - RGB frames
* `avi`
  - depth images
* `ogg`
  - audio file
* `log`
  - information about the data files. The format seems to vary, so check each log file before parsing it

### Import Some Standard Packages

In [1]:
import logging
import glob
from typing import List, Tuple, Union
from ast import literal_eval
import numpy as np
import cv2
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from IPython.display import display, HTML
from itertools import *
import os
import pandas as pd
import scipy.special as sc
import dotenv
import progressbar
# Locate a .env file and load the variables it defines into the process environment
dotenv.load_dotenv(dotenv.find_dotenv())
# Last expression of the cell: the current working directory is echoed as the cell output
os.getcwd()

'/Users/fisher/projects/NIH/git/nihmarmoset/notebooks/exploratory'

### import some local packages

This may not work unless 

1. setup.py is in the repo, *AND* 
2. `pip install -e .` was run

In [2]:
#from src.data.dataset import load_method_by_otu_type
#from src.data.dataset import Dataset

### Print some of the environment variables

In [3]:
# Echo the project-related environment variables so the notebook records
# which locations this run used. Labels are pre-padded so the colons line up.
# NOTE(review): the last entry reads "PYTHONUBUFFERED" while CPython's own
# switch is spelled PYTHONUNBUFFERED -- confirm which name the .env file defines.
_env_report = (
    ('  PROJECT_ROOT', 'PROJECT_ROOT'),
    ('  PROJECT_DATA', 'PROJECT_DATA'),
    ('PROJECT_MODELS', 'PROJECT_MODELS'),
    ('    PYTHONPATH', 'PYTHONPATH'),
    ('PYTHONBUFFERED', 'PYTHONUBUFFERED'),
)
for _label, _varname in _env_report:
    print(f'{_label}: {os.getenv(_varname)}')

  PROJECT_ROOT: /Users/fisher/projects/NIH/git/nihmarmoset
  PROJECT_DATA: /Volumes/JWFExtDat/data/marmoset/data
PROJECT_MODELS: /Volumes/JWFExtDat/data/marmoset/models
    PYTHONPATH: /usr/local/lib:./:/Users/fisher/projects/NIH/git/nihmarmoset/src
PYTHONBUFFERED: 1


### Define some local functions
* `DataFileListbyType` - generate a data frame listing the data files, filtered by extension (`tgtextlst`), that are available in the subdirectories one level below `{PROJECT_DATA}/{subdir}`.
* `datfildf` - create data frame that contains details of each data file, where `typstr` is one of `otu`, `effl`, or `wthr`.
* `cond_literal` - conditional use of `literal_eval` for fields with things like `tuples` or `arrays`
* `getfilfromdf` - extracts full file name from datfildf.
* `gethdrrowsfromdf` - extracts `headerrows` argument from datfildf.

In [4]:
def DataFileListbyType(tgtextlst,subdir):
    """Build a data frame listing the data files found one level below ``subdir``.

    The search root is ``{PROJECT_DATA}/{subdir}``, where ``PROJECT_DATA`` is
    read from the environment. Only the immediate subdirectories of the root
    are scanned (no recursion). A file is kept when its name starts with an
    ASCII letter or digit (which also skips dotfiles) and its extension is in
    ``tgtextlst``.

    Parameters
    ----------
    tgtextlst : collection of str
        Extensions to keep, without the leading dot (e.g. ``['mp4', 'log']``).
    subdir : str
        Path of the search root, relative to ``PROJECT_DATA``.

    Returns
    -------
    pandas.DataFrame
        One row per matching file with columns ``subdir`` (``subdir`` joined
        with the name of the scanned subdirectory), ``filename`` and
        ``extension``. Rows follow the order in which the file system lists
        the entries; the frame is empty when nothing matches.

    Raises
    ------
    FileNotFoundError
        If the search root does not exist or is not a directory.
    """
    datdir = f'{os.getenv("PROJECT_DATA")}/{subdir}'
    if not os.path.isdir(datdir):
        # Fail with a readable message instead of a bare StopIteration from next()
        raise FileNotFoundError(f'data directory not found: {datdir}')
    #
    # Get list of subdirectories in subdir (first os.walk entry = top level only)
    #
    subdirlst = next(os.walk(datdir), (datdir, [], []))[1]
    #
    # Scan every subdirectory exactly once so that the three columns are
    # always derived from the same set of paths.
    #
    rows = []
    for tmpdir in subdirlst:
        # glob is used to allow wildcards and to avoid extraneous dotfiles;
        # the directory part is escaped so that characters such as '[' in a
        # directory name are matched literally.
        pattern = f"{glob.escape(f'{datdir}/{tmpdir}')}/[0-9a-zA-Z]*.*"
        for path in glob.glob(pattern):
            ext = os.path.splitext(path)[1].replace('.','')
            if ext in tgtextlst:
                rows.append((f'{subdir}/{tmpdir}', os.path.basename(path), ext))
    return pd.DataFrame(rows, columns=['subdir','filename','extension'])
def datfildf(fillst,typstr,rawsubdir):
    """Create a data frame holding the load parameters for each file in ``fillst``.

    Parameters
    ----------
    fillst : list of str
        File names (one row is created per entry).
    typstr : str
        Data type tag stored in ``raw_type`` and appended to ``rawsubdir``;
        the value ``'otu'`` additionally fills ``skiprowslist`` and ``header``.
    rawsubdir : str
        Parent directory; ``raw_subdir`` is set to ``{rawsubdir}/{typstr}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``raw_subdir``, ``raw_name``, ``raw_type``, ``skiprowslist``,
        ``header`` and ``ldmethod`` (in that order).

    Notes
    -----
    The values are assigned as whole columns. Writing into the row objects
    yielded by ``DataFrame.iterrows()`` is not guaranteed to modify the frame
    (each row may be a copy), which would leave the placeholder empty strings
    in the result.

    ``load_method_by_otu_type`` must be available in the notebook namespace;
    its import is commented out in this notebook.

    NOTE(review): a later cell rebinds the name ``datfildf`` to a DataFrame,
    after which this function can no longer be called by this name.
    """
    df = pd.DataFrame(fillst,columns=['raw_name'])
    df.insert(0,'raw_subdir',f'{rawsubdir}/{typstr}')
    names = df['raw_name'].tolist()
    df['raw_type'] = typstr
    if typstr == 'otu':
        # A fresh list per row so that rows never share one mutable object
        df['skiprowslist'] = pd.Series([[1,2,3,4] for _ in names],
                                       index=df.index, dtype=object)
        df['header'] = pd.Series([[0,1,2,3,4] for _ in names],
                                 index=df.index, dtype=object)
    else:
        df['skiprowslist'] = ''
        df['header'] = ''
    # The extension is everything after the FIRST dot in the file name
    df['ldmethod'] = pd.Series(
        [load_method_by_otu_type(name[name.find('.')+1:]) for name in names],
        index=df.index, dtype=object)
    return df
def cond_literal(str):
    """Converter for CSV fields that may hold Python literals.

    An empty field is passed through unchanged; any other text is parsed with
    ``ast.literal_eval`` so that lists and tuples written to the CSV come back
    as real objects. (The parameter name shadows the builtin ``str``; it is
    kept because it is part of the existing signature.)
    """
    if str == '':
        return str
    return literal_eval(str)
def getfilfromdf(indx,fildf):
    """Return the full path of the file stored at positional row ``indx``.

    The path is assembled as ``{PROJECT_DATA}/{raw_subdir}/{raw_name}``, with
    ``PROJECT_DATA`` taken from the environment and the other two parts read
    from the corresponding columns of ``fildf``.
    """
    data_root = os.getenv('PROJECT_DATA')
    subdir_part = fildf['raw_subdir'].iloc[indx]
    file_part = fildf['raw_name'].iloc[indx]
    return f'{data_root}/{subdir_part}/{file_part}'
def gethdrrowsfromdf(indx,fildf):
    """Return the ``header`` entry stored at positional row ``indx`` of ``fildf``."""
    header_column = fildf['header']
    return header_column.iloc[indx]

### Get list of various data sets 

* `extsubdir` is the *relative* path to the data
   * Assumes a base directory of `{PROJECT_DATA}/`  
* `extlst` list of extensions used to filter list
   * Using `['mp4','avi','ogg','log']` below because those are the primary data files of interest.
   * The function defined here specifically looks for subdirectories **one** level down from `extsubdir`. Could probably modify this procedure when there is interest in searching the directory tree to an arbitrary depth.

In [5]:
# Search root, relative to {PROJECT_DATA}/
extsubdir = 'external/rgbda_record_data'
# Extensions (without the dot) used to filter the file list
extlst = ['mp4','avi','ogg','log']
# NOTE(review): this rebinds the name `datfildf` -- until here the helper
# function defined in the local-functions cell -- to the resulting DataFrame,
# so that helper can no longer be called by this name in later cells.
datfildf = DataFileListbyType(extlst,extsubdir)

### Display data frame containing list of files

In [6]:
# Render the complete file-list data frame as an HTML table in the cell output
display(HTML(datfildf.to_html()))

Unnamed: 0,subdir,filename,extension
0,external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.mp4,mp4
1,external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.log,log
2,external/rgbda_record_data/20170524,recording-2017-05-24_15_27_58.log,log
3,external/rgbda_record_data/20170524,recording-2017-05-24_15_27_58.mp4,mp4
4,external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.avi,avi
5,external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.ogg,ogg
6,external/rgbda_record_data/20170524,recording-2017-05-24_15_27_58.ogg,ogg
7,external/rgbda_record_data/20170524,recording-2017-05-24_15_27_58.avi,avi
8,external/rgbda_record_data/20170224,recording-2017-02-24_14_23_25.ogg,ogg
9,external/rgbda_record_data/20170224,recording-2017-02-24_14_26_16.ogg,ogg


### Group files by subdirectory

In [193]:
# Same listing ordered by the `subdir` column and rendered without the row index
display(HTML(datfildf.sort_values(by=['subdir']).to_html(index=False,notebook=True)))

subdir,filename,extension
external/rgbda_record_data/20170224,recording-2017-02-24_14_23_25.ogg,ogg
external/rgbda_record_data/20170224,recording-2017-02-24_14_26_16.ogg,ogg
external/rgbda_record_data/20170224,recording-2017-02-24_14_26_16.avi,avi
external/rgbda_record_data/20170224,recording-2017-02-24_14_23_25.avi,avi
external/rgbda_record_data/20170224,recording-2017-02-24_14_26_16.mp4,mp4
external/rgbda_record_data/20170224,recording-2017-02-24_14_23_25.log,log
external/rgbda_record_data/20170224,recording-2017-02-24_14_23_25.mp4,mp4
external/rgbda_record_data/20170224,recording-2017-02-24_14_26_16.log,log
external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.mp4,mp4
external/rgbda_record_data/20170524,recording-2017-05-24_15_14_33.log,log


In [196]:
# Build a MultiIndex with one level per column of the file list and show it
display((pd.MultiIndex.from_frame(datfildf)))

MultiIndex([('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170524', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...),
            ('external/rgbda_record_data/20170224', ...)],
           names=['subdir', 'filename', 'extension'])

### Store file list in dataframe

The idea of creating this data frame was motivated by contending with multiple file formats (even for the same data type). This data frame would contain the list of files along with the function parameters needed to read them and reformat when moving from `external` to `raw`. The intent was for the file to be in `.csv` format and to be human-editable (assuming an automated approach wouldn't be available). There is the danger of overwriting manual edits, so the designed process was not robust.

However, the formats have been standardized somewhat, so this may be superfluous.

* The external data sources are password-protected `.xlsx` spreadsheets; these have been *manually* saved as `.csv` under the `raw` subdirectory.

In [None]:
# Build one parameter frame per data type (otu / effl / wthr) and show each.
# NOTE(review): `otudatfillst`, `otustr`, `effldatfillst`, `efflstr`,
# `wthrdatfillst`, `wthrstr` and `rawsubdir` are not defined anywhere in the
# visible notebook, and `datfildf` is bound to a DataFrame (not the helper
# function) once the file-list cell has run -- confirm before executing.
otufildf = datfildf(otudatfillst,otustr,rawsubdir)
efflfildf = datfildf(effldatfillst,efflstr,rawsubdir)
wthrfildf = datfildf(wthrdatfillst,wthrstr,rawsubdir)
display(HTML(otufildf.to_html()))
display(HTML(wthrfildf.to_html()))
display(HTML(efflfildf.to_html()))

### Merge [otu, effl, wthr]fildf into a single data frame

In [None]:
# Stack the three per-type frames row-wise; this rebinds `datfildf` once more
datfildf = pd.concat([otufildf,efflfildf,wthrfildf])
display(datfildf)

### Write File List to CSV File
* Idea is to include parameters that are relevant to loading a particular file, although, this was only an issue when the "external" data was in varying formats.
* For now, a convenient listing of the available data files.
* If the idea is to run this *once* (in a notebook or a script), still need to work out process that ensures it does not overwrite any manual edits.
* Writing this to the `raw` directory might be questionable.

In [None]:
# Output path: {PROJECT_DATA}/{rawsubdir}/datfildf.csv
# NOTE(review): `rawsubdir` is not defined in the visible notebook -- confirm.
datcsvfil = f'{os.getenv("PROJECT_DATA")}/{rawsubdir}/datfildf.csv'
print(f'CSV output file: {datcsvfil}')
# Missing values are written as empty fields and the row index is omitted.
# NOTE(review): nothing here checks whether the file already exists, so
# re-running this cell would replace a hand-edited copy.
datfildf.to_csv(datcsvfil,na_rep='',index=False)

#### I am leaving the next two cells in place because they provide examples of 
* using `cond_literal`/`literal_eval` when reading `.csv`
  * this option is needed when reading the file back in so that fields with arrays and tuples are read as such. The converter is defined above.
  * notebooks that load these files should start with this csv file.

#### Re-read file back in as sanity check
* Originally, this was a *different* file
    * the intent being that it would be edited by hand (if necessary) to provide parameters for reading
    * the script could only be run once, though, since there isn't error checking (currently). Otherwise, hand edits would be over-written (oh, my!)

In [None]:
# Read the CSV back; `cond_literal` is applied to the `skiprowslist` and
# `header` fields so non-empty entries are parsed with literal_eval instead of
# staying plain strings.
datfil2df = pd.read_csv(datcsvfil,converters={'skiprowslist': cond_literal,'header': cond_literal})
display(HTML(datfil2df.to_html()))

### EXAMPLE - Load index 5
* This happens to be the smallest otu file.
* **Everything** after the next cell is there to illustrate operations after loading the data.
* The cells above and below are where most analysis notebooks (or scripts) would start **presuming** the csv data file has been created. 

In [None]:
# Positional row in `datfildf` of the file to load
otuindx = 5
# Full path and header-row entry for that row
otufil = getfilfromdf(otuindx,datfildf)
otuhdrrows = gethdrrowsfromdf(otuindx,datfildf)
# NOTE(review): `ld_otu_header_df` and `ld_otu_df` are neither defined nor
# imported in the visible notebook -- confirm where they come from.
otuhdrdf = ld_otu_header_df(otufil,headerrows=otuhdrrows)
otudf= ld_otu_df(otufil,headerrows=otuhdrrows)

#### Loading OTU Files results in a df with a multi-index over columns
 * `ld_otu_header_df` grabs the header rows and creates a df over the multi-indices.

   This is useful for creating various plotting indices.

In [None]:
# Render the header data frame as an HTML table
display(HTML(otuhdrdf.to_html()))

### Display the unique values found in *some* of the indices

In [None]:
# Distinct values found in the 'Collection Site' and 'Collection Season' columns
display(otuhdrdf['Collection Site'].unique())
display(otuhdrdf['Collection Season'].unique())

#### Display the data frame itself (with multi-indices)

In [None]:
# Show the loaded data frame itself
display(otudf)

### Filter multi-index by key

Some examples of filtering the multi-indices.

In [None]:
# Cross-section over the columns (axis=1): keep those whose level-3 label is 'TH01'
otudf.xs('TH01',level=3,axis=1)

### Extract Abundance Counts

In [None]:
# Pull the column cross-sections for three level-3 labels out of `otudf`
# and convert each one to a plain NumPy array
otudatTH01 = otudf.xs('TH01',level=3,axis=1).to_numpy()
otudatTH06 = otudf.xs('TH06',level=3,axis=1).to_numpy()
otudatTH07 = otudf.xs('TH07',level=3,axis=1).to_numpy()
# Shape of the TH01 array, then the shape left after summing over axis 0
display(otudatTH01.shape)
display(np.sum(otudatTH01,axis=0).shape)
# Columns matching both 'TH06' at level 3 and 'Winter' at level 4
display(otudf.xs(('TH06','Winter'),axis=1,level=[3,4]))

In [None]:
# Alternative selection kept for reference: restrict to 'TH06' and 'Winter'
#dftmp = otudf.xs(('TH06','Winter'),axis=1,level=[3,4])
# Working frame: every column whose level-3 label is 'TH06'
dftmp = otudf.xs('TH06',axis=1,level=3)
# Show the first 250 rows
display(dftmp.iloc[0:250])

In [None]:
# Clustered heat map of the first 250 rows; rows keep their order
# (row_cluster=False) and the correlation metric is used for clustering
sns.clustermap(dftmp.iloc[0:250],row_cluster=False,metric="correlation")

In [None]:
# Stacked bar chart of the data selected by the '12/3/18' column key
dftmp['12/3/18'].plot.bar(stacked=True)

In [None]:
# Divide each count array by its totals along axis 0 (for 2-D arrays: the
# per-column sums), so each column of the result sums to 1.
# NOTE(review): a total of 0 would yield NaN/inf here -- confirm none occur.
nrmotudatTH01=otudatTH01/np.sum(otudatTH01,axis=0)
nrmotudatTH06=otudatTH06/np.sum(otudatTH06,axis=0)
nrmotudatTH07=otudatTH07/np.sum(otudatTH07,axis=0)
# NOTE(review): `default_figsize` is not defined in the visible notebook -- confirm its source.
plt.figure(figsize=default_figsize)
# Plot the totals along axis 1 of the raw (un-normalized) TH01 counts
plt.plot(np.sum(otudatTH01,axis=1))

In [None]:
# Pass the transposed, normalized TH01 array to the plotting helper.
# NOTE(review): `plot_cumulative_local_weights` is neither defined nor imported
# in the visible notebook -- confirm its source and what `ground_truth` does.
plot_cumulative_local_weights(np.transpose(nrmotudatTH01),figsize=(16,8),ground_truth=True)

In [None]:
# Same plot for the transposed, normalized TH06 array.
# NOTE(review): `plot_cumulative_local_weights` is not defined in the visible notebook -- confirm its source.
plot_cumulative_local_weights(np.transpose(nrmotudatTH06),figsize=(16,8))

In [None]:
# Same plot for the transposed, normalized TH07 array.
# NOTE(review): `plot_cumulative_local_weights` is not defined in the visible notebook -- confirm its source.
plot_cumulative_local_weights(np.transpose(nrmotudatTH07),figsize=(16,8))

### HDP Inference