In [None]:
from matplotlib import pyplot as plt

from sykepic.analysis.dataframe import frequency_df, filter_df

In [None]:
# frequency_df() returns a Pandas dataframe with class frequencies for each sample.
# The arguments it takes are set up below.

# --- Required ---

# Root directory of all prediction CSV-files.
prediction_directory = 'predictions'

# --- Optional ---

# Confidence threshold: the minimum probability required for a valid prediction.
# It can be a single float value (e.g. 0.7), or a path to an external file with
# class-specific thresholds. Such a file must have a default value in case some
# classes are left empty or not included in the file.
# See the example file for formatting.
confidence_threshold = 'confidence_threshold.txt'

# Start date and time to filter samples (CSV-files) by.
# It must follow the formatting of `date_format` (see below).
# The CSV-files are filtered before anything is read into a dataframe because
# prediction_directory can hold a lot of data, which might not be reasonable
# to read into memory all at once.
start = '2018-07-01 00:00'

# Same as `start`, but for the end of the period.
end = '2018-08-31 23:59'

# Allow only those samples that are within this hour window (the date doesn't matter).
# The formatting has to match this: "[hour]:[minute]-[hour]:[minute]"
# e.g. for samples taken around mid-day, you could set this to "11:30-12:30"
hour_window = None

# The date and time formatting used for `start` and `end`.
# The default formatting below is this: "[year]-[month]-[day] [hour]:[minute]"
date_format = '%Y-%m-%d %H:%M'

In [None]:
# Build the frequency dataframe from the settings defined earlier.
# Arguments are passed positionally, in the same order as before.
df = frequency_df(
    prediction_directory,
    confidence_threshold,
    start,
    end,
    hour_window,
    date_format,
)

# Take a quick look at the result
df.head()

In [None]:
# Showing every class in a single plot is probably not a good idea.
# The helper function `filter_df()` makes it easy to narrow df down.

# Give it the dataframe to filter, plus one of the following:

# 1. prediction
# A single class name (string) or several class names (list of strings):
prediction = [
    'Aphanizomenon_flosaquae',
    'Oscillatoriales',
]
df2 = filter_df(df, prediction=prediction)

# 2. top
# Selects the `top` most frequent classes
df3 = filter_df(df, top=2)

In [None]:
# Let's plot df2
plt.style.use('dark_background')
# .plot() hands back the Axes it drew on; label the y-axis through it
ax = df2.plot(figsize=(10, 6))
ax.set_ylabel('Frequency')

In [None]:
# And now df3
plt.style.use('dark_background')
# .plot() hands back the Axes it drew on; label the y-axis through it
ax = df3.plot(figsize=(10, 6))
ax.set_ylabel('Frequency')