### Code editing for Python club 

Things I changed in this code:
- I removed the transition() function and calculated mothIN events differently using Pandas.
- I used glob instead of os to load in data files.
- I applied the smooth() function to the whole column as a NumPy array (a vectorized operation).
- The original code runs perfectly fine on my PC, so I'm still not sure what the memory issue was.

In [1]:
import pandas as pd
import os, matplotlib
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal

# glob lets you batch-select files (it has many complex options if needed)
import glob 

# more_itertools provides consecutive_groups, used to split motion frames into separate mothIN events
import more_itertools as mit

In [2]:
# Using the glob library for batch file processing is helpful
# because it's easier to quickly edit code after renaming files;
# you can also easily select csvs that are in specific folders.
# glob does not guarantee any particular order, so sort the matches to
# make the processing (and logging) order reproducible across machines.

data_files = sorted(glob.glob('./*.csv'))
print('Number of files is', len(data_files))

Number of files is 2


In [3]:
# It can be nice to declare all global variables in one place 
# because you don't have to search your code when changing parameters.

# Frames whose smoothed pixel count exceeds this value are flagged as motion.
pixelNumThreshold = 500
# Step between histogram bin edges; event lengths are counted in frames.
binwidth = 50
# Path prefix prepended to each saved histogram's filename.
outPath_fig = './figures/'

In [4]:
def smooth(Pixels, output_ints=10, std=3):
    """Smooth a 1-D signal by convolving it with a normalized Gaussian window.

    Parameters
    ----------
    Pixels : array_like
        1-D sequence of values to smooth (here, per-frame pixel counts).
    output_ints : int, optional
        Number of points in the Gaussian window.
    std : float, optional
        Standard deviation of the Gaussian window, in samples.

    Returns
    -------
    numpy.ndarray
        Smoothed signal with the same length as ``Pixels``. The convolution
        pads with zeros, so values within about half a window of either end
        are pulled towards zero.
    """
    # The window functions live in scipy.signal.windows; the old top-level
    # alias signal.gaussian was deprecated and removed in SciPy 1.13.
    win_gauss = signal.windows.gaussian(output_ints, std)
    convolved = signal.convolve(Pixels, win_gauss, mode='same',
                                method='direct')
    # Divide by the window sum so the filter has unit gain.
    return convolved / win_gauss.sum()

def _histogram_bin_edges(lengths, width):
    """Return bin edges spaced ``width`` apart that cover every value in ``lengths``."""
    lo, hi = min(lengths), max(lengths)
    edges = list(range(lo, hi + width, width))
    # When every event has the same length the range above yields a single
    # edge, which does not define a bin; add one more so the data is drawn.
    if len(edges) < 2:
        edges.append(lo + width)
    return edges


def process_data(data_files, verbose=True):
    """Plot a histogram of mothIN event lengths for each CSV file.

    For every file, the ``NumPixel`` column is smoothed, frames whose smoothed
    value exceeds ``pixelNumThreshold`` are flagged as motion, and runs of
    consecutive motion frames are counted as individual mothIN events. The
    distribution of event lengths (in frames) is saved as
    ``outPath_fig + <file stem> + '_histogram.png'``.

    Parameters
    ----------
    data_files : iterable of str
        Paths to CSV files that contain a ``NumPixel`` column.
    verbose : bool, optional
        If true, print a progress line after each file.

    Returns
    -------
    None
        Results are written to disk as PNG figures.
    """
    for filename in data_files:
        df = pd.read_csv(filename)

        # vectorized implementation on NumPy arrays (.values loads data as np)
        smoothed = smooth(df['NumPixel'].values)
        df['smoothed_pixelcount'] = smoothed
        df['motion'] = smoothed > pixelNumThreshold

        # keep only the frames where there is motion
        moving = df[df['motion']]

        # consecutive_groups splits the surviving frame numbers into runs,
        # e.g. [1, 2, 3], [20, 21, 22, 23, 24], [1000, 1001]; each run is one
        # mothIN event and its length is the number of frames it lasted.
        # Each group is a lazy iterator, so consume it before moving on.
        mothIN_lengths = [len(list(group))
                          for group in mit.consecutive_groups(moving.index)]

        if not mothIN_lengths:
            print(filename + " has no moth movement")
        else:
            fig, ax = plt.subplots(1, 1, figsize=(10, 6))
            try:
                ax.hist(mothIN_lengths,
                        bins=_histogram_bin_edges(mothIN_lengths, binwidth))
                ax.set_xlabel("Total frames moth in view", fontsize=20)
                ax.set_ylabel("Occurance of each bin", fontsize=20)

                # get just the name out of the filename; os.path copes with
                # the platform's separators and any extension length
                name = os.path.splitext(os.path.basename(filename))[0]
                out_file = outPath_fig + name + '_histogram.png'

                # savefig does not create missing directories
                os.makedirs(os.path.dirname(out_file) or '.', exist_ok=True)
                fig.savefig(out_file)
            finally:
                # pyplot keeps every figure alive until it is closed, so
                # release it explicitly even if plotting or saving failed
                plt.close(fig)

        if verbose:
            # if you're analyzing 10k files you don't want to print each time
            print('Done with ' + filename)

In [5]:
%%timeit
# Time a full pass over all files; verbose=False suppresses the per-file progress lines.
process_data(data_files, verbose=False)

5.63 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# verbose defaults to True, so calling without it prints a "Done with ..." line per file
process_data(data_files)

Done with ./c-1_m4.csv
Done with ./c-2_m4.csv
