In [96]:
import pandas as pd
import numpy as np

In [132]:
# Assemble the location of output.txt, the text file that holds the redirected Linux ls listing.
# It was produced by running `ls -lh > output.txt` inside the Recordings folder.

audio_dir = '/home/adkris1002/Documents/UBNA_Research/AudiomothStuff/Recordings&Code'
unit_dir = '/Audiomoth2Files'
loc_dir = '/Foliage2022'
sess_dir = '/20220710_unit2'
filename = '/Recordings/output.txt'

# Every piece already begins with '/', so joining on the empty string yields the full path.
file_path = ''.join((audio_dir, unit_dir, loc_dir, sess_dir, filename))

In [136]:
# We read in the .txt file using the file path and a regex separator that matches runs of whitespace.
# The separator is written as a raw string (r'\s+') so the backslash reaches the regex engine as-is;
# a plain '\s+' contains an invalid escape sequence and raises a warning on recent Python versions.
# We set header=None so read_csv does not look for a header
# We also skip the first row of the raw output because the first row is 'total <size of all files>'
# NOTE(review): a file name containing spaces would be split into extra columns by this separator.

data = pd.read_csv(file_path, sep=r'\s+', skiprows=1, header=None)
print(data)

               0  1           2           3     4    5   6      7  \
0     -rwxr-xr-x  1  adkris1002  adkris1002  864K  Jul   9  12:46   
1     -rwxr-xr-x  1  adkris1002  adkris1002     0  Dec  31   1979   
2     -rwxr-xr-x  1  adkris1002  adkris1002  107M  Jul   9  12:59   
3     -rwxr-xr-x  1  adkris1002  adkris1002  856M  Jul   9  13:29   
4     -rwxr-xr-x  1  adkris1002  adkris1002  856M  Jul   9  13:59   
...          ... ..         ...         ...   ...  ...  ..    ...   
1766  -rwxr-xr-x  1  adkris1002  adkris1002  856M  Jul  12  10:59   
1767  -rwxr-xr-x  1  adkris1002  adkris1002  856M  Jul  12  11:29   
1768  -rwxr-xr-x  1  adkris1002  adkris1002  856M  Jul  12  11:59   
1769  -rwxr-xr-x  1  adkris1002  adkris1002  386M  Jul  12  12:13   
1770  -rw-rw-r--  1  adkris1002  adkris1002     0  Jul  13  14:38   

                        8  
0     20220709_194629.WAV  
1     20220709_194915.WAV  
2     20220709_195611.WAV  
3     20220709_200000.WAV  
4     20220709_203000.WAV  
...

In [137]:
# Trim the parsed listing down to the rows and columns we need.
# Columns 0-3 (permissions, link count, owner, group) are dropped, and so is the final row,
# which per the original note is the listing entry for output.txt itself.
# NOTE(review): the last row is dropped by position, so this assumes output.txt sorts last in the listing.

raw_values = data.to_numpy()
list_np = raw_values[:-1, 4:]
print(list_np)

[['864K' 'Jul' 9 '12:46' '20220709_194629.WAV']
 ['0' 'Dec' 31 '1979' '20220709_194915.WAV']
 ['107M' 'Jul' 9 '12:59' '20220709_195611.WAV']
 ...
 ['856M' 'Jul' 12 '11:29' '20220712_180000.WAV']
 ['856M' 'Jul' 12 '11:59' '20220712_183000.WAV']
 ['386M' 'Jul' 12 '12:13' '20220712_190000.WAV']]


In [138]:
# Wrap the trimmed numpy array in a data frame with descriptive column labels.
# The 'Time' column holds a year instead of a clock time in some rows: ls prints the year when a
# file's modification time is not recent (here the empty file stamped Dec 31 1979).
# File sizes carry an M suffix for megabytes (MB) and a K suffix for kilobytes (kB);
# sizes given in plain bytes have no suffix.

column_labels = ['File Size', 'Month', 'Date', 'Time', 'File Name']
df = pd.DataFrame(data=list_np, columns=column_labels)

In [139]:
# Show the labelled table; for a long frame pandas prints only the first and last rows.
print(df)

     File Size Month Date   Time            File Name
0         864K   Jul    9  12:46  20220709_194629.WAV
1            0   Dec   31   1979  20220709_194915.WAV
2         107M   Jul    9  12:59  20220709_195611.WAV
3         856M   Jul    9  13:29  20220709_200000.WAV
4         856M   Jul    9  13:59  20220709_203000.WAV
...        ...   ...  ...    ...                  ...
1765      856M   Jul   12  10:29  20220712_170000.WAV
1766      856M   Jul   12  10:59  20220712_173000.WAV
1767      856M   Jul   12  11:29  20220712_180000.WAV
1768      856M   Jul   12  11:59  20220712_183000.WAV
1769      386M   Jul   12  12:13  20220712_190000.WAV

[1770 rows x 5 columns]


In [140]:
# We save our dataframe as a .csv in the current working directory.
# No `sep` argument is passed, so pandas writes comma-separated values (not tab-separated),
# and the row index is written as the first, unnamed column.

df.to_csv('file_sizes.csv')