# Vorbereitung

In [None]:
%env KERAS_BACKEND=theano
%env THEANO_FLAGS=floatX=float32,device=cpu

In [None]:
%load_ext autoreload
%autoreload 1
%aimport bb_behavior
%aimport bb_behavior.plot
%aimport bb_behavior.tracking
%aimport bb_behavior.tracking.pipeline

import bb_behavior
import bb_behavior.plot
import bb_behavior.tracking
import bb_behavior.tracking.pipeline

import pandas as pd
import time
import datetime

import os
import glob

In [None]:
from tqdm import tqdm_notebook # progress bar

import math
import numpy as np
from bb_tracking.data.constants import DETKEY
#from bb_tracking.tracking import score_id_sim_v
from bb_tracking.tracking import distance_orientations_v, distance_positions_v

from bb_behavior.tracking.pipeline import detect_markers_in_video
from bb_behavior.tracking.pipeline import track_detections_dataframe
from bb_behavior.tracking.pipeline import display_tracking_results

# Run

In [None]:
from bb_behavior.tracking.pipeline import get_default_pipeline
# Build the decoder pipeline once; detect_tracks passes it to detect_markers_in_video
# for every video.
# NOTE(review): localizer_threshold is given as the string "0.50" rather than a float -
# confirm that get_default_pipeline expects a string here.
default_pipeline = None
default_pipeline = get_default_pipeline(localizer_threshold="0.50")

In [None]:
# Hilfsfunktionen
def filename_to_datestring(filname):
    """
    Extract the date string from a video file name.

    `filname` may be a bare file name or a full path. Only the last path
    component is looked at; everything from the first '.' on is cut off and
    the second '_'-separated field of the remainder is returned, e.g.
    '../videos/e00_2018-08-19-01-08-13.h264' -> '2018-08-19-01-08-13'.
    """
    base_name = os.path.basename(filname)
    stem = base_name.split('.')[0]
    fields = stem.split('_')
    return fields[1]

def datestring_to_filename(datestring, prefix="e00_"):
    """
    Build the path of the .h264 video that belongs to a date string.

    The result is config["videos_dir"] + prefix + datestring + ".h264"; plain
    string concatenation is used, so config["videos_dir"] has to end with a
    path separator.
    """
    video_dir = config["videos_dir"]
    video_name = prefix + datestring + ".h264"
    return video_dir + video_name

def string_to_timestamp(datestring):
    """
    Convert a date string to a unix timestamp.

    params
        datestring: format 2018-08-19-01-08-13 (interpreted as local time)
    output
        unix timestamp (float)
    """
    parsed = time.strptime(datestring, "%Y-%m-%d-%H-%M-%S")
    return time.mktime(parsed)

def timestamp_to_string(timestamp):
    """
    Format a unix timestamp as a local-time date string like 2018-08-19-01-08-13
    (the inverse of string_to_timestamp for whole seconds).
    """
    local_time = time.localtime(timestamp)
    return time.strftime("%Y-%m-%d-%H-%M-%S", local_time)

In [None]:
# get all videos between timestamp_in and timestamp_out
def get_videos_between(timestamp_in, timestamp_out, all_paths):
    """
    Return, as a list, every entry of all_paths['video'] that lies between
    timestamp_in and timestamp_out (both bounds inclusive).

    The entries are returned in the row order of all_paths.
    """
    videos = all_paths['video']
    in_range = videos.between(timestamp_in, timestamp_out)
    return list(videos[in_range])

def loadGTD(path):
    """
    Load the manually labelled test data from a CSV file.

    The CSV must contain the columns 'video', 'timestamp_in' and
    'timestamp_out', each holding video file names. The two timestamp columns
    are converted to date strings (see filename_to_datestring) and stored as
    'video_start_time' and 'video_end_time'; the three file-name columns are
    removed afterwards. All other columns are passed through untouched.

    params
        path: anything pandas.read_csv accepts
    output
        DataFrame sorted by 'video_start_time'
    """
    test_data = pd.read_csv(path)

    # convert the full file names to date strings
    test_data['video_start_time'] = test_data['timestamp_in'].apply(filename_to_datestring)
    test_data['video_end_time'] = test_data['timestamp_out'].apply(filename_to_datestring)
    test_data = test_data.sort_values(['video_start_time'])

    # The raw file-name columns are no longer needed. `columns=` is used because
    # passing the axis positionally (drop(label, 1)) was removed in pandas 2.0.
    return test_data.drop(columns=['timestamp_in', 'timestamp_out', 'video'])

In [None]:
tmp = loadGTD("bees_test.csv")
tmp

In [None]:
# Central settings of the notebook.
#   tag_pixel_diameter, n_frames, fps         -> passed to detect_markers_in_video
#   confidence_filter_*, coordinate_scale     -> passed to track_detections_dataframe
#   start_time, cam_id                        -> read in detect_tracks
#   left/right_leaving_area, px_*_resolution  -> used by the baseline classifier and the augmentation
#   videos_dir                                -> directory that is searched for *.h264 videos;
#                                                must end with "/" because paths are built by concatenation
config = dict(tag_pixel_diameter=50,
              n_frames=None,
              confidence_filter_detections=0.08,
              confidence_filter_tracks=0.20,
              coordinate_scale=1.0,
              start_time=None,
              fps=10.0,
              cam_id=0,
              left_leaving_area = 0.3, # fraction of the frame width measured from the edge, e.g. 1000px and 0.15 -> 0-150px
              right_leaving_area = 0.3,
              px_x_resolution_vid = 1944,
              px_y_resolution_vid = 388,
              videos_dir = "../videos/videos_tags/")


In [None]:
def detect_tracks(paths, save_to_csv = False):
    """
    Detect the markers in every video, track them and return all tracks as one DataFrame.

    params
        paths: list of video paths. Videos that yield no usable result (a ValueError
            during detection/tracking, or no detection above
            config["confidence_filter_detections"]) are removed from this list in place.
        save_to_csv: if True, the resulting tracks are also written to "tracks.csv"
    output
        DataFrame with the tracks of all remaining videos plus a 'video' column
        (date string of the video); the columns localizerSaliency, beeID, camID
        and frameIdx are dropped.
    raises
        ValueError if no video produced any tracks.
    """
    # path -> (frame_info, detections, tracks)
    video_data = dict()
    skipped_paths = []

    for path in tqdm_notebook(paths):
        try:
            frame_info, detections = detect_markers_in_video(path,
                                                             decoder_pipeline=default_pipeline,
                                                             tag_pixel_diameter=config["tag_pixel_diameter"],
                                                             n_frames=config["n_frames"],
                                                             fps=config["fps"],
                                                             progress="tqdm_notebook")

            # Without a single confident detection the tracker would fail, so the
            # video is skipped. It has to be remembered as skipped, otherwise the
            # lookups in video_data further down raise a KeyError for it.
            confident = detections['confidence'] >= config["confidence_filter_detections"]
            if not confident.any():
                skipped_paths.append(path)
                continue

            tracks = track_detections_dataframe(detections,
                                                tracker="tracker.det_score_fun.frag_score_fun.dill",
                                                confidence_filter_detections=config["confidence_filter_detections"],
                                                confidence_filter_tracks=config["confidence_filter_tracks"],
                                                coordinate_scale=config["coordinate_scale"])
            tracks['video'] = filename_to_datestring(path)
            video_data[path] = (frame_info, detections, tracks)
        except ValueError:
            # occurs when the video is empty: skip this video
            skipped_paths.append(path)
        except Exception as err:
            print(err)
            raise

    # paths is used again below, so it may only contain videos that have an
    # entry in video_data.
    for skipped_path in skipped_paths:
        paths.remove(skipped_path)

    for path in paths:
        frame_info, detections, tracks = video_data[path]
        display_tracking_results(path, frame_info, detections, tracks)

    tracks_per_video = [video_data[path][2] for path in paths]
    if not tracks_per_video:
        raise ValueError("no tracks were found in any of the given videos")

    tracks = pd.concat(tracks_per_video, ignore_index=True)
    tracks = tracks.drop(columns=["localizerSaliency", "beeID", "camID", "frameIdx"])

    if save_to_csv:
        with open("tracks.csv", "w") as f:
            tracks.to_csv(f)

    return tracks

## a: Create video path list from all videos in videos_dir

In [None]:
# Collect the paths of all .h264 videos in the video directory in one list.
# That list can then be walked through with a tqdm progress bar by detect_tracks.
base_directory = config["videos_dir"]
paths = [i for i in os.listdir(base_directory) if i.endswith(".h264")]
for i in range(len(paths)):
    paths[i] = base_directory + paths[i]

## b: Create video path list from test set

In [None]:
# read in the test data csv
test_data = loadGTD("bees_test.csv")

all_videos = []

# date strings of all videos in the video directory, in chronological order
all_paths = pd.DataFrame(glob.glob(os.path.join(config["videos_dir"], '*.h264')), columns=['video'])
all_paths['video'] = all_paths['video'].apply(filename_to_datestring)
# sort_values returns a new frame; the result has to be assigned to take effect
all_paths = all_paths.sort_values(['video'])

# go through test_data and get all videos between timestamp_in and timestamp_out
for index, row in test_data.iterrows():
    all_videos += get_videos_between(row['video_start_time'], row['video_end_time'], all_paths)

# every video only once, in chronological order
all_videos = sorted(set(all_videos))
paths = [datestring_to_filename(x) for x in all_videos]
del all_paths
del test_data

# Continue here

## Zwischenschritt: Merge all close Tracks of one bee

In [None]:
# Run this cell if the tracks were already calculated and saved to a .csv file:
# it reads the CSV and sets `tracks` from it.
# NOTE(review): detect_tracks(save_to_csv=True) writes "tracks.csv" while this cell
# reads "all_tracks.csv" - confirm which file name is intended.
tracks = pd.read_csv("all_tracks.csv", index_col=0)

In [None]:
tracks

In [None]:
def gather_tracks(tracks):
    """
    Collapse the per-detection tracks frame into one row per (bee_id, track_id, video).

    params
        tracks: DataFrame with the columns bee_id, track_id, video (date string),
            xpos, ypos, timestamp (seconds since the start of the video) and zrotation
    output
        DataFrame with the columns
        bee_id, xpos:[..], ypos:[..], timestamps:[..], zrotation:[..],
        video_start_time: String, track_start_time: Float, track_end_time: Float,
        video_end_time: String
        where video_end_time is the date string of the last video in
        config["videos_dir"] that starts between the start video and the end of the track.
    raises
        IndexError if no video in config["videos_dir"] matches a track.
    """
    # A list (not a tuple) has to be used to select several columns of a groupby;
    # the tuple form was removed in pandas 2.0.
    grouped = tracks.groupby(['bee_id', 'track_id', 'video'], as_index=False)[['xpos', 'ypos', 'timestamp', 'zrotation']]
    tracks_ml = grouped.aggregate(lambda values: list(values))

    # we don't need track_id anymore
    tracks_ml = tracks_ml.drop(columns=['track_id'])

    # add a column: convert video name to timestamp
    tracks_ml['video_start_time'] = tracks_ml['video'].apply(string_to_timestamp)

    # we don't need video anymore
    tracks_ml = tracks_ml.drop(columns=['video'])

    # because aggregated: now multiple timestamps per row: --> rename
    tracks_ml = tracks_ml.rename(columns={'timestamp': 'timestamps'})

    # calculate start/end time of the track by adding the timestamp of the detection
    # (seconds since start of video) to the timestamp of the video (date)
    tracks_ml['track_start_time'] = tracks_ml['video_start_time'] + tracks_ml['timestamps'].apply(lambda ts: ts[0])
    tracks_ml['track_end_time'] = tracks_ml['video_start_time'] + tracks_ml['timestamps'].apply(lambda ts: ts[-1])

    # convert back to string
    tracks_ml['video_start_time'] = tracks_ml['video_start_time'].apply(timestamp_to_string)

    all_paths = pd.DataFrame(glob.glob(os.path.join(config["videos_dir"], '*.h264')), columns=['video'])
    all_paths['video'] = all_paths['video'].apply(filename_to_datestring)
    # The videos must be in chronological order, because the LAST match below is
    # taken as the end video; glob returns the files in arbitrary order.
    all_paths = all_paths.sort_values(['video'])

    # get end video: the last video that starts before the track ends
    tracks_ml['video_end_time'] = tracks_ml['track_end_time'].apply(timestamp_to_string)
    tracks_ml['video_end_time'] = tracks_ml[['video_start_time', 'video_end_time']].apply(
        lambda row: get_videos_between(row['video_start_time'], row['video_end_time'], all_paths)[-1],
        axis=1)

    return tracks_ml

def merge_tracks(tracks_ml, verbose = False, max_gap_seconds = 10):
    """
    merge tracks of same bee where start and end timestamps are close together
    assume there can not be overlapping tracks

    in:
        tracks_ml: output from gather_tracks(tracks)
        verbose: print details about every merge
        max_gap_seconds: two consecutive tracks of the same bee are merged if the
            second one starts less than this many seconds after the first one ends
    out:
        same df as in, with merged rows
        [bee_id:Float, xpos:[Float], ypos:[Float], zrotation:[Float], timestamps:[Float], video_start_time: String,
        video_end_time: String, track_start_time:Float, track_end_time:Float]

    The input frame is not modified. Frames with fewer than two rows are returned
    unmerged.
    """

    # 1. sort: bee_id, start_time (sort_values returns a new frame)
    tracks_ml = tracks_ml.sort_values(['bee_id', 'track_start_time'])

    # first convert to timestamp
    tracks_ml['video_start_time'] = tracks_ml['video_start_time'].apply(string_to_timestamp)

    index = 0
    # stop at the last row: it has no successor it could be merged with
    while index < len(tracks_ml) - 1:
        row = tracks_ml.iloc[index]
        next_row = tracks_ml.iloc[index+1]

        # if the tracks are from same bee and the start time of next row is close enough -> merge the rows
        # merge rows means, next_row is deleted
        if (row['bee_id'] == next_row['bee_id']) and ((next_row['track_start_time'] - row['track_end_time']) < max_gap_seconds):

            if verbose:
                print(timestamp_to_string(row['video_start_time']), row['bee_id'], next_row['track_start_time'] - row['track_end_time'])
                print(next_row['video_start_time'] - row['video_start_time'])
                print(timestamp_to_string(row['track_start_time']),timestamp_to_string(row['track_end_time']),timestamp_to_string(row['video_start_time']),row['video_end_time'])
                print(row['timestamps'])
                print(row.name)
                print(timestamp_to_string(next_row['track_start_time']))
                print('----------------------------------')

            # timestamps are relative to the start of their own video: shift the
            # timestamps of next_row so that they are relative to the video of row
            t = next_row['video_start_time'] - row['video_start_time']
            timestamps = list( np.array(next_row['timestamps']) + t)

            # merge xpos, ypos, zrotation, timestamps lists
            tracks_ml.at[row.name,'xpos'] = row['xpos']+next_row['xpos']
            tracks_ml.at[row.name,'ypos'] = row['ypos']+next_row['ypos']
            tracks_ml.at[row.name,'zrotation'] = row['zrotation']+next_row['zrotation']
            tracks_ml.at[row.name,'timestamps'] = row['timestamps']+timestamps

            # update end_time
            tracks_ml.at[row.name,'track_end_time'] = next_row['track_end_time']
            tracks_ml.at[row.name,'video_end_time'] = next_row['video_end_time']

            # delete the merged row (next_row); index is NOT advanced, so the merged
            # row is compared with its new successor in the next iteration
            tracks_ml = tracks_ml.drop(next_row.name).copy()

            # all per-detection lists of the merged row must still have the same length
            assert(len(tracks_ml.at[row.name,'xpos']) == len(tracks_ml.at[row.name,'ypos']) == len(tracks_ml.at[row.name,'timestamps']) == len(tracks_ml.at[row.name,'zrotation']))

        else:
            index += 1

    # convert back to string
    tracks_ml['video_start_time'] = tracks_ml['video_start_time'].apply(timestamp_to_string)

    return tracks_ml

In [None]:
tracks_ml = gather_tracks(tracks)
tracks_ml = merge_tracks(tracks_ml)
tracks_ml = tracks_ml.sort_values(['video_start_time'])

# Accuracy of Tracking

In [None]:
# read in the test data csv (the manually labeled data)
test_data = loadGTD("bees_test.csv")
test_data.drop_duplicates(inplace=True)

In [None]:
# Compare the merged tracks with the manually labelled data: a track counts as
# correct if bee_id and the start video / end video / both agree with a labelled entry.
# NOTE(review): the inner merges return one row per matching (track, label) pair, so
# several tracks matching the same label would each be counted - confirm that the
# ratios below cannot exceed 1. An empty test_data makes the divisions fail.
test_data_len = len(test_data)

# get the total number of tracks where the start_video and bee_id is correct
correct_in_video_start = len(pd.merge(tracks_ml,test_data[['bee_id','in_direction','video_start_time']], on=['bee_id','video_start_time'], how='inner'))

# get the total number of tracks where the end video and bee_id is correct
correct_in_video_end = len(pd.merge(tracks_ml,test_data[['bee_id','out_direction','video_end_time']], on=['bee_id','video_end_time'], how='inner'))

# get all tracks where bee_id, video_start_time and video_end_time are the same
total_correct = len(pd.merge(tracks_ml,test_data[['bee_id','in_direction','out_direction','video_end_time','video_start_time']], how='inner',on=['bee_id','video_start_time','video_end_time']))

print('bee_id and video_start_time correct: %.2f (%d/%d)' % (correct_in_video_start/test_data_len,correct_in_video_start,test_data_len))
print('bee_id and video_end_time correct: %.2f (%d/%d)' % (correct_in_video_end/test_data_len,correct_in_video_end,test_data_len))
print('bee_ids and both videos correct: %.2f (%d/%d)' % (total_correct/test_data_len,total_correct,test_data_len))

# Add labels to tracks data

In [None]:
# get all tracks where bee_id, video_start_time and video_end_time are the same
tracks_ml = pd.merge(tracks_ml,test_data,how='inner',on=['bee_id','video_start_time','video_end_time'])

In [None]:
print(len(tracks_ml))
tracks_ml.head()

In [None]:
# split into training and test data
from sklearn.model_selection import train_test_split

tracks_ml_train, tracks_ml_test = train_test_split(tracks_ml,test_size=0.2, random_state=42)

# Baseline-Algorithmen

## Algorithmus 1: Baseline - Areas as Decider

In [None]:
from math import pi
""" Vorgehen:
Für jedes Bienen-Track Paar
- Prüfe, wo sich die Biene beim ersten erkannten Erscheinen aufhält
    - Ordne die Position in left, right oder middle ein
- Prüfe, wo sich die Biene beim letzten erkannten Erscheinen aufhält
    - Ordne die Position in left, right oder middle ein
Wenn sich Biene in der Mitte befindet, starte neue Routine, die links oder rechts zuordnet
Daraus kann nun abgeleitet werden, wo die Biene reingekommen ist und wo sie rausgegangen ist.
"""
# param: tracks_ml, s. oben
def baseline_alg_classify_bee(tracks_ml):
    """
    Baseline 1: predict the entry and exit side of every track from its position.

    For the first and the last detection of a track the x position is sorted into
    the left area, the right area or the middle (areas are defined by
    config["left_leaving_area"], config["right_leaving_area"] and
    config["px_x_resolution_vid"]). In the middle the z rotation (radians)
    decides instead.

    params
        tracks_ml: DataFrame with list columns 'xpos' and 'zrotation'
    output
        DataFrame with the columns pred_in_direction and pred_out_direction
        ("left"/"right"), one row per track in iteration order, with a fresh index
    """
    def side_of(x, rotation):
        frame_width = config["px_x_resolution_vid"]
        if x <= config["left_leaving_area"]*config["px_x_resolution_vid"]:
            return "left"
        if x >= frame_width - config["right_leaving_area"]*config["px_x_resolution_vid"]:
            return "right"
        # Middle of the frame: fall back to the viewing direction of the bee.
        # NOTE(review): the same rule is applied to the entry and the exit, whereas
        # baseline 2 uses opposite rules for the two - confirm this is intended.
        return "left" if abs(rotation) > pi/2 else "right"

    entry_sides = []
    exit_sides = []
    for _, track in tracks_ml.iterrows():
        xs = track['xpos']
        rotations = track['zrotation']
        entry_sides.append(side_of(xs[0], rotations[0]))
        exit_sides.append(side_of(xs[-1], rotations[-1]))

    return pd.DataFrame({"pred_in_direction":entry_sides, "pred_out_direction":exit_sides})

## Algorithmus 2: Baseline - zrotation as decider

In [None]:
from math import pi
""" Vorgehen:
Unterschied: Hier wird nicht auf die Position abgestellt, sondern ausschließlich auf die
Richtung, in die die Biene schaut
"""
# param: tracks_ml, s. oben
def baseline_2_alg_classify_bee(tracks_ml):
    """
    Baseline 2: predict the entry and exit side of every track from the z rotation only.

    The z rotation (radians) of the first detection decides the entry side, the
    one of the last detection the exit side. A rotation whose absolute value is
    larger than pi/2 means entry "right" / exit "left", otherwise entry "left" /
    exit "right".

    params
        tracks_ml: DataFrame with a list column 'zrotation'
    output
        DataFrame with the columns pred_in_direction and pred_out_direction,
        one row per track in iteration order, with a fresh index
    """
    def beyond_quarter_turn(rotation):
        return abs(rotation) > abs(pi/2)

    entry_sides = []
    exit_sides = []
    for _, track in tracks_ml.iterrows():
        rotations = track['zrotation']
        entry_sides.append("right" if beyond_quarter_turn(rotations[0]) else "left")
        exit_sides.append("left" if beyond_quarter_turn(rotations[-1]) else "right")

    return pd.DataFrame({"pred_in_direction":entry_sides, "pred_out_direction":exit_sides})

# Accuracy and Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def conbine_in_out(y_in, y_out):
    """
    Combine entry and exit side into one label per track.

    'left' is abbreviated to 'L' and 'right' to 'R' (other values are kept), then
    the entry and exit labels are concatenated element-wise, giving
    'LL', 'LR', 'RL' or 'RR'. The inputs are not modified.
    """
    def abbreviate(labels):
        short = labels.copy()
        for word, letter in (('left', 'L'), ('right', 'R')):
            short[labels == word] = letter
        return short

    return abbreviate(y_in) + abbreviate(y_out)
    

def classifier_metrics(y, ypred, verbose = False):
    """
    Compute the accuracy of predicted entry/exit sides.

    y.columns = ['in_direction','out_direction']
    ypred.columns = ['pred_in_direction','pred_out_direction']
    Rows of y and ypred are compared by position, not by index.

    If verbose is True, the confusion matrices and accuracies are printed
    (rows: true label, columns: predicted label).

    returns:
        (in_accuracy, out_accuracy, total_accuracy)
        total_accuracy counts a track as correct only if both sides are correct.
    """

    y_combined = conbine_in_out(y['in_direction'], y['out_direction'])
    y_pred_combined = conbine_in_out(ypred['pred_in_direction'], ypred['pred_out_direction'])

    in_accuracy = accuracy_score(y['in_direction'], ypred['pred_in_direction'])
    out_accuracy = accuracy_score(y['out_direction'], ypred['pred_out_direction'])
    total_accuracy =  accuracy_score(y_combined, y_pred_combined)

    if verbose:
        # The labels are fixed explicitly: without them confusion_matrix only covers
        # the labels that occur in the data, and the 2x2 frame below cannot be built
        # when one side never occurs.
        sides = ['left', 'right']
        short_sides = ['L', 'R']
        combined_labels = ['LL', 'LR', 'RL', 'RR']

        print('Confusion Matrix In-Direction:')
        print(pd.DataFrame(confusion_matrix(y['in_direction'], ypred['pred_in_direction'], labels=sides), columns=short_sides, index=short_sides))
        print('Accuracy In-Direction:',in_accuracy)
        print('\n')
        print('Confusion Matrix Out-Direction:')
        print(pd.DataFrame(confusion_matrix(y['out_direction'], ypred['pred_out_direction'], labels=sides), columns=short_sides, index=short_sides))
        print('Accuracy Out-Direction:',out_accuracy)
        print('\n')
        print('Confusion Matrix Combined:')
        print(pd.DataFrame(confusion_matrix(y_combined,y_pred_combined, labels=combined_labels), columns=combined_labels, index=combined_labels))
        print('Total Accuracy:', total_accuracy)

    return (in_accuracy, out_accuracy, total_accuracy)

In [None]:
# Evaluate baseline 1.
# NOTE(review): this baseline is scored on all labelled tracks (tracks_ml) while
# baseline 2 in the following cells is scored on tracks_ml_test only - confirm that
# the two results are meant to be compared.
results = baseline_alg_classify_bee(tracks_ml)
classifier_metrics(tracks_ml[['in_direction','out_direction']], results[['pred_in_direction','pred_out_direction']], verbose=True)

In [None]:
results = baseline_2_alg_classify_bee(tracks_ml_test)

In [None]:
classifier_metrics(tracks_ml_test[['in_direction','out_direction']], results[['pred_in_direction','pred_out_direction']], verbose=True)

# Algorithmus 3: Machine Learning

## Vorbereiten Lernvideos, Prüfvideos

### Optional: Tracks um 180° drehen; Tracks horizontal invertieren; Tracks vertikal invertieren (Funktioniert, abhängig von der Wichtigkeit der Zeit als Feature, vielleicht nicht!)

In [None]:
from itertools import chain, combinations
from math import pi
import random

def inc_year(video_time_string):
    """
    Return the date string with its year (the part before the first '-') increased by one,
    e.g. '2018-08-19-01-08-13' -> '2019-08-19-01-08-13'.
    """
    year, separator, remainder = video_time_string.partition('-')
    next_year = str(int(year) + 1)
    return next_year + separator + remainder

def opposite(direction):
    """Return 'right' for 'left'; every other value yields 'left'."""
    return 'right' if direction == 'left' else 'left'

"""
invertiert einen Track auf der vertikalen Achse. Bei outer ist die Achse die Achse des Videos.
"""
def invertVertical_outer(tracks_ml):
    """
    Augmentation: mirror every track vertically about the horizontal centre line
    of the video (y -> px_y_resolution_vid - y) and negate the z rotation.

    The year of video_start_time / video_end_time is increased by one;
    track_start_time / track_end_time are recomputed from the original
    video_start_time and the first / last timestamp. The input is not modified.

    params
        tracks_ml: DataFrame with list columns ypos, zrotation, timestamps and
            string columns video_start_time, video_end_time
    output
        new DataFrame with the mirrored tracks
    """
    inverted_tracks = tracks_ml.copy(deep = True)
    frame_height = config['px_y_resolution_vid']
    for index, row in tracks_ml.iterrows():
        # Mirroring about the centre c = frame_height/2 is 2*c - y. The former
        # abs(y - c) folded both halves of the frame onto each other instead.
        inverted_tracks.at[index, 'ypos'] = [frame_height - y for y in row['ypos']]
        inverted_tracks.at[index, 'zrotation'] = [z * -1 for z in row['zrotation']]
        inverted_tracks.at[index, 'video_start_time'] = inc_year(row['video_start_time'])
        inverted_tracks.at[index, 'track_start_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][0]
        inverted_tracks.at[index, 'track_end_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][-1]
        inverted_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
    return inverted_tracks

"""
invertiert einen Track auf der vertikalen Achse. Bei inner ist die Achse die Achse des Trackbereichs.
"""
def invertVertical_inner(tracks_ml):
    """
    Augmentation: mirror every track vertically about the horizontal centre line
    of the track's own y range (y -> max(ypos) + min(ypos) - y) and negate the
    z rotation.

    The year of video_start_time / video_end_time is increased by one;
    track_start_time / track_end_time are recomputed from the original
    video_start_time and the first / last timestamp. The input is not modified.

    params
        tracks_ml: DataFrame with list columns ypos, zrotation, timestamps and
            string columns video_start_time, video_end_time
    output
        new DataFrame with the mirrored tracks
    """
    inverted_tracks = tracks_ml.copy(deep = True)
    for index, row in tracks_ml.iterrows():
        # Centre of the track range is (max + min)/2, so mirroring is max + min - y.
        # The former max - min/2 was not the centre, and abs(y - centre) folded the
        # track instead of mirroring it.
        mirror_sum = max(row['ypos']) + min(row['ypos'])
        inverted_tracks.at[index, 'ypos'] = [mirror_sum - y for y in row['ypos']]
        inverted_tracks.at[index, 'zrotation'] = [z * -1 for z in row['zrotation']]
        inverted_tracks.at[index, 'video_start_time'] = inc_year(row['video_start_time'])
        inverted_tracks.at[index, 'track_start_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][0]
        inverted_tracks.at[index, 'track_end_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][-1]
        inverted_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
    return inverted_tracks

def invertHorizontal(tracks_ml):
    """
    Augmentation: mirror every track horizontally about the vertical centre line
    of the video (x -> px_x_resolution_vid - x), mirror the z rotation
    accordingly and swap the labels in_direction / out_direction.

    The frame centre is used as mirror axis because the labels refer to the
    sides of the frame: a track at the left edge has to end up at the right
    edge for the swapped labels to be correct.

    The year of video_start_time / video_end_time is increased by one;
    track_start_time / track_end_time are recomputed from the original
    video_start_time and the first / last timestamp. The input is not modified.

    params
        tracks_ml: DataFrame with list columns xpos, zrotation, timestamps,
            string columns video_start_time, video_end_time and the labels
            in_direction, out_direction
    output
        new DataFrame with the mirrored tracks
    """
    def mirror_rotation(z):
        # Mirroring a direction at the vertical axis maps the angle z to pi - z;
        # the second branch keeps the result within [-pi, pi].
        return pi - z if z >= 0 else -pi - z

    inverted_tracks = tracks_ml.copy(deep = True)
    frame_width = config['px_x_resolution_vid']
    for index, row in tracks_ml.iterrows():
        inverted_tracks.at[index, 'xpos'] = [frame_width - x for x in row['xpos']]
        inverted_tracks.at[index, 'zrotation'] = [mirror_rotation(z) for z in row['zrotation']]
        inverted_tracks.at[index, 'video_start_time'] = inc_year(row['video_start_time'])
        inverted_tracks.at[index, 'track_start_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][0]
        inverted_tracks.at[index, 'track_end_time'] = string_to_timestamp(row['video_start_time']) + row['timestamps'][-1]
        inverted_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
        inverted_tracks.at[index, 'in_direction'] = opposite(row['in_direction'])
        inverted_tracks.at[index, 'out_direction'] = opposite(row['out_direction'])
    return inverted_tracks

"""
randomisiert y Koordinaten
"""
def randomize_y_coordinates(tracks_ml):
    """
    Augmentation: add uniform random noise to every y coordinate.

    The noise is drawn from +-(threshold_y * config['px_y_resolution_vid']) with
    the global `random` module. The year of video_start_time / video_end_time is
    increased by one; track_start_time / track_end_time are recomputed from the
    original video_start_time and the first / last timestamp. The input is not
    modified.
    """
    threshold_y = 0.02 # Looking at the original videos, 2% equate to about 7px on the y-axis
    jittered_tracks = tracks_ml.copy(deep = True)
    for index, row in tracks_ml.iterrows():
        upper_bound = config['px_y_resolution_vid'] * threshold_y
        lower_bound = -upper_bound
        video_start = row['video_start_time']
        video_start_stamp = string_to_timestamp(video_start)
        stamps = row['timestamps']

        jittered_tracks.at[index, 'ypos'] = [y + random.uniform(lower_bound, upper_bound) for y in row['ypos']]
        jittered_tracks.at[index, 'video_start_time'] = inc_year(video_start)
        jittered_tracks.at[index, 'track_start_time'] = video_start_stamp + stamps[0]
        jittered_tracks.at[index, 'track_end_time'] = video_start_stamp + stamps[-1]
        jittered_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
    return jittered_tracks
    
"""
randomisiert x Koordinaten
"""
def randomize_x_coordinates(tracks_ml):
    """
    Augmentation: add uniform random noise to every x coordinate.

    The noise is drawn from +-(threshold_x * config['px_x_resolution_vid']) with
    the global `random` module. The year of video_start_time / video_end_time is
    increased by one; track_start_time / track_end_time are recomputed from the
    original video_start_time and the first / last timestamp. The input is not
    modified.
    """
    threshold_x = 0.01 # equates to 19 px
    jittered_tracks = tracks_ml.copy(deep = True)
    for index, row in tracks_ml.iterrows():
        upper_bound = config['px_x_resolution_vid'] * threshold_x
        lower_bound = -upper_bound
        video_start = row['video_start_time']
        video_start_stamp = string_to_timestamp(video_start)
        stamps = row['timestamps']

        jittered_tracks.at[index, 'xpos'] = [x + random.uniform(lower_bound, upper_bound) for x in row['xpos']]
        jittered_tracks.at[index, 'video_start_time'] = inc_year(video_start)
        jittered_tracks.at[index, 'track_start_time'] = video_start_stamp + stamps[0]
        jittered_tracks.at[index, 'track_end_time'] = video_start_stamp + stamps[-1]
        jittered_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
    return jittered_tracks

#
# Biene geht andersherum -> koordinaten vertauschen, labels vertauschen
#
def reverse_path(tracks_ml):
    """
    Augmentation: let the bee walk every track in the opposite direction.

    The order of the detections is reversed and the labels in_direction /
    out_direction are swapped to their opposite. To keep every detection
    consistent, all per-detection lists are reversed together:
      - xpos, ypos: reversed
      - zrotation: reversed and turned by pi (the bee now faces the other way),
        kept within (-pi, pi]
      - timestamps: the gaps between detections are reversed, first and last
        timestamp stay the same

    The year of video_start_time / video_end_time is increased by one;
    track_start_time / track_end_time are recomputed from the original
    video_start_time and the first / last timestamp. The input is not modified.
    """
    def turn_around(z):
        turned = z + pi
        return turned - 2 * pi if turned > pi else turned

    reversed_tracks = tracks_ml.copy(deep = True)
    for index, row in tracks_ml.iterrows():
        timestamps = row['timestamps']
        first_stamp, last_stamp = timestamps[0], timestamps[-1]

        reversed_tracks.at[index, 'xpos'] = list(reversed(row['xpos']))
        reversed_tracks.at[index, 'ypos'] = list(reversed(row['ypos']))
        # Previously only the positions were reversed, which paired every position
        # with the rotation and time gap of a different detection.
        reversed_tracks.at[index, 'zrotation'] = [turn_around(z) for z in reversed(row['zrotation'])]
        reversed_tracks.at[index, 'timestamps'] = [first_stamp + (last_stamp - t) for t in reversed(timestamps)]
        reversed_tracks.at[index, 'video_start_time'] = inc_year(row['video_start_time'])
        reversed_tracks.at[index, 'track_start_time'] = string_to_timestamp(row['video_start_time']) + first_stamp
        reversed_tracks.at[index, 'track_end_time'] = string_to_timestamp(row['video_start_time']) + last_stamp
        reversed_tracks.at[index, 'video_end_time'] = inc_year(row['video_end_time'])
        reversed_tracks.at[index, 'in_direction'] = opposite(row['in_direction'])
        reversed_tracks.at[index, 'out_direction'] = opposite(row['out_direction'])
    return reversed_tracks

"""
Using powerset on a list of functions to try all possibilities
param: DataFrame containing the standardized columns
return: new DataFrame, containing also the original
"""
def augment_data(tracks_ml):
    """
    Augment the tracks with every non-empty combination of the augmentation functions.

    Each combination is applied, in the order of the list below, to a fresh copy
    of the input; the results are stacked below the original rows.

    param: DataFrame containing the standardized columns
    return: new DataFrame with a fresh index, containing the original rows
        exactly once, followed by the rows of all 2**6 - 1 augmented variants
    """
    functions = [invertVertical_outer, invertVertical_inner, invertHorizontal,
                 randomize_y_coordinates, randomize_x_coordinates, reverse_path]

    frames = [tracks_ml]
    # Start at size 1: the empty combination would just add the unchanged
    # original a second time.
    for size in range(1, len(functions) + 1):
        for combination in combinations(functions, size):
            augmented = tracks_ml.copy(deep = True)
            for f in combination:
                augmented = f(augmented)
            frames.append(augmented)

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0,
    # and quadratic in the number of frames).
    return pd.concat(frames, ignore_index = True)


# Get Features from data

In [None]:
def normalize(x, min_x, max_x):
    """ linearly rescale x so that min_x maps to 0 and max_x maps to 1 """
    offset = x - min_x
    span = max_x - min_x
    return offset / span

def get_time_from_timestamp(timestamp):
    """
    return the local time of day of a unix timestamp in hours (float),
    i.e. hour + minute/60 (range 0-23.99); seconds are ignored
    """
    local_time = time.localtime(timestamp)
    minute_fraction = normalize(local_time.tm_min, 0, 60)
    return local_time.tm_hour + minute_fraction

In [None]:
def euclidean_distance(p1, p2):
    """ straight-line distance between two points given as numpy arrays """
    difference = p1 - p2
    squared_sum = np.sum(difference**2)
    return np.sqrt(squared_sum)

def get_speed(p1, p2, t1, t2):
    """ distance between the points p1 and p2 divided by the elapsed time t2 - t1 """
    elapsed = t2 - t1
    return euclidean_distance(p1, p2) / elapsed

def get_start_speed(xpos, ypos, timestamps):
    """
    speed between the first two detections of a track;
    0 if the track has fewer than two detections
    """
    assert(len(xpos) == len(ypos) == len(timestamps))

    if len(xpos) >= 2:
        first_point = np.array([xpos[0], ypos[0]])
        second_point = np.array([xpos[1], ypos[1]])
        return get_speed(first_point, second_point, timestamps[0], timestamps[1])

    return 0

def get_end_speed(xpos, ypos, timestamps):
    """
    speed between the last two detections of a track;
    0 if the track has fewer than two detections
    """
    assert(len(xpos) == len(ypos) == len(timestamps))

    if len(xpos) >= 2:
        second_to_last_point = np.array([xpos[-2], ypos[-2]])
        last_point = np.array([xpos[-1], ypos[-1]])
        return get_speed(second_to_last_point, last_point, timestamps[-2], timestamps[-1])

    return 0

def get_avg_speed(xpos, ypos, timestamps):
    """
    returns the mean of the speeds between consecutive points (x_pos, y_pos);
    0 if there are fewer than two points
    """
    if len(xpos) <= 1:
        return 0

    speeds = [
        get_speed(np.array([xpos[i], ypos[i]]),
                  np.array([xpos[i+1], ypos[i+1]]),
                  timestamps[i],
                  timestamps[i+1])
        for i in range(len(xpos)-1)
    ]
    return np.mean(np.array(speeds))

def get_track_distance(xpos, ypos):
    """ total length of the path through the points (xpos[i], ypos[i]) """
    points = np.array([xpos, ypos]).T

    segment_lengths = [euclidean_distance(start, end)
                       for start, end in zip(points[:-1], points[1:])]
    return np.sum(segment_lengths)

In [None]:
from sklearn.preprocessing import normalize as sklearn_normalize

def get_features(X, include=None, exclude=None, normalize=False):
    """
    Compute one row of scalar features per track.

    params
        X: DataFrame with the list columns xpos, ypos, zrotation, timestamps and
            the float columns track_start_time, track_end_time
        include: if not None, return exactly these columns and ignore exclude
        exclude: if not None (and include is None), return all features except these
        normalize: if True, every returned column is scaled to unit L2 norm
            (sklearn.preprocessing.normalize with axis=0). Non-negative features
            therefore end up in [0, 1]. Note that the result then has a fresh
            0..n-1 index instead of the index of X.
    output
        DataFrame with the selected feature columns

    available features:
    'duration',
    'day_time',
    'num_detections',
    'start_xpos',
    'end_xpos',
    'avg_xpos',
    'start_ypos',
    'end_ypos',
    'start_z_rotation',
    'end_z_rotation',
    'avg_zrotation',
    'start_speed',
    'end_speed',
    'avg_speed',
    'track_length'
    """

    tracks = X.copy()
    tracks_features = X.copy()

    tracks_features['duration'] = tracks['track_end_time'] - tracks['track_start_time']
    tracks_features['start_xpos'] = tracks['xpos'].apply(lambda x: x[0])
    tracks_features['end_xpos'] = tracks['xpos'].apply(lambda x: x[-1])
    tracks_features['avg_xpos'] = tracks['xpos'].apply(lambda x: np.mean(x))
    tracks_features['start_ypos'] = tracks['ypos'].apply(lambda x: x[0]) # do we need?
    tracks_features['end_ypos'] = tracks['ypos'].apply(lambda x: x[-1]) # do we need?

    # shift the rotation by pi so that the values are no longer spread around zero
    tracks_features['normalized_zrotation'] =  tracks['zrotation'].apply(lambda x: np.array(x) + math.pi)
    tracks_features['start_z_rotation'] = tracks_features['normalized_zrotation'].apply(lambda x: x[0])
    tracks_features['end_z_rotation'] = tracks_features['normalized_zrotation'].apply(lambda x: x[-1])
    tracks_features['avg_zrotation'] =  tracks_features['normalized_zrotation'].apply(lambda x: np.mean(x))

    # time of day (hours) of the middle of the track
    tracks_features['day_time'] = (tracks['track_start_time'] + tracks['track_end_time']) / 2
    tracks_features['day_time'] = tracks_features['day_time'].apply(get_time_from_timestamp)

    tracks_features['num_detections'] = tracks['xpos'].apply(lambda x: len(x))

    # speed; the row fields are accessed by label because integer keys on a
    # label-indexed Series are deprecated in pandas
    positions_and_times = tracks[['xpos','ypos','timestamps']]
    tracks_features['start_speed'] = positions_and_times.apply(lambda row: get_start_speed(row['xpos'], row['ypos'], row['timestamps']), axis=1)
    tracks_features['end_speed'] = positions_and_times.apply(lambda row: get_end_speed(row['xpos'], row['ypos'], row['timestamps']), axis=1)
    tracks_features['avg_speed'] = positions_and_times.apply(lambda row: get_avg_speed(row['xpos'], row['ypos'], row['timestamps']), axis=1)

    # total distance traveled
    tracks_features['track_length'] = tracks[['xpos','ypos']].apply(lambda row: get_track_distance(row['xpos'], row['ypos']), axis=1)

    all_features = ['duration','day_time','num_detections','start_xpos','end_xpos','avg_xpos','start_ypos','end_ypos','start_z_rotation','end_z_rotation','avg_zrotation','start_speed','end_speed','avg_speed', 'track_length']

    # include takes precedence over exclude, as documented above
    if include is not None:
        columns = include
    elif exclude is not None:
        columns = [x for x in all_features if x not in exclude]
    else:
        columns = all_features

    tracks_features = tracks_features[columns]

    if normalize is True:
        tracks_features = pd.DataFrame(sklearn_normalize(tracks_features, axis=0), columns=tracks_features.columns)

    return tracks_features

In [None]:
X = get_features(tracks_ml, include=['start_xpos','end_xpos'], normalize=True)

In [None]:
X = get_features(tracks_ml)
X.head()

# Feature selection

In [None]:
# select features based on https://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

# Score the normalized track features against the in_direction label with three
# univariate scoring functions and print the five best-ranked columns of each.
X = get_features(tracks_ml, normalize=True)
y = tracks_ml['in_direction']

for message, score_func in (
    ("Important Columns chi2:", chi2),
    ("Important Columns f_classif:", f_classif),
    ("Important Columns mutual info:", mutual_info_classif),
):
    selector = SelectKBest(score_func, k=5)
    selector.fit(X, y)

    # Positional indices of the columns the selector keeps
    cols = selector.get_support(indices=True)

    print(message, list(X.columns[cols]))

In [None]:
class logisic_regression_clf():
    """Pair of logistic-regression models for ``in_direction`` and ``out_direction``.

    Each direction gets its own optional univariate feature selection
    (``SelectKBest``) followed by its own ``LogisticRegression`` model.

    Parameters
    ----------
    k_features : int or None
        Number of features kept per direction. ``None`` disables the
        selection, so every column of the training frame is used.
    feature_selection_method : str
        ``"mutual_info_classif"`` ranks by mutual information; any other
        value falls back to ``f_classif``.
    normalized : bool
        Only stored on the instance; no method of this class reads it.
    verbose : bool
        Print the selected features during ``fit`` and forward the flag to
        ``classifier_metrics`` in ``test``.
    """

    def __init__(self, k_features=1, feature_selection_method="mutual_info_classif", normalized=False, verbose=True):

        self.k_features = k_features
        self.feature_selection_method = feature_selection_method
        self.normalized = normalized
        self.verbose = verbose

    def _select_features(self, train_x, labels, target_name):
        """Return the column labels of ``train_x`` to use for one target column."""
        if self.k_features is None:
            return train_x.columns

        if self.feature_selection_method == "mutual_info_classif":
            score_func = mutual_info_classif
        else:
            score_func = f_classif

        selector = SelectKBest(score_func, k=self.k_features)
        selector.fit(train_x, labels)
        selected = train_x.columns[selector.get_support(indices=True)]
        if self.verbose:
            print('Selected Features for ' + target_name + ':', selected)
        return selected

    def fit(self, train_x, train_y):
        """Select features and fit one model per direction.

        ``train_x`` is the feature frame; ``train_y`` must provide the columns
        ``in_direction`` and ``out_direction``.
        """
        self.features_in_dir = self._select_features(
            train_x, train_y['in_direction'], 'in_direction')
        self.features_out_dir = self._select_features(
            train_x, train_y['out_direction'], 'out_direction')

        # in_dir
        self.clf_in = LogisticRegression()
        self.clf_in.fit(train_x[self.features_in_dir], train_y['in_direction'])

        # out_dir
        self.clf_out = LogisticRegression()
        self.clf_out.fit(train_x[self.features_out_dir], train_y['out_direction'])

    def test(self, test_x, test_y):
        """Predict on ``test_x`` and return ``classifier_metrics`` for ``test_y``."""
        pred_y = self.predict(test_x)
        return classifier_metrics(test_y, pred_y, verbose=self.verbose)

    def predict(self, x):
        """Return a frame with ``pred_in_direction`` and ``pred_out_direction``.

        Each model only sees the feature columns chosen for it in ``fit``.
        The returned frame has a fresh default index, not the index of ``x``.
        """
        pred_y_in = self.clf_in.predict(x[self.features_in_dir])
        pred_y_out = self.clf_out.predict(x[self.features_out_dir])

        return pd.DataFrame({
            'pred_in_direction': pred_y_in,
            'pred_out_direction': pred_y_out,
        })
        

In [None]:
# Imported here so the cell also runs on a fresh kernel: in the visible part of
# the notebook these names are otherwise only imported by later cells.
import random
import sys

from sklearn.linear_model import LogisticRegression  # needed by logisic_regression_clf.fit
from sklearn.model_selection import train_test_split

data = tracks_ml.copy()

n_bootstraps = 10
test_size = 0.5
k_features = 1
feature_selection_method = "mutual_info_classif" # "f_classif"
normalized = False

# Repeated random train/test splits; one classifier_metrics result per split.
accuracies = []
for counter in range(n_bootstraps):
    # one dot per finished split as a minimal progress indicator
    sys.stdout.write('.')
    sys.stdout.flush()

    train_tracks, test_tracks = train_test_split(data, test_size = test_size, random_state=random.randint(0,2**32-1))

    # increase training set (the test split is left as it is)
    train_tracks = augment_data(train_tracks)

    # labels and features of the training split
    train_y = train_tracks[['in_direction','out_direction']]
    train_x = get_features(train_tracks)

    # labels and features of the test split
    test_y = test_tracks[['in_direction','out_direction']]
    test_x = get_features(test_tracks)

    # train
    clf = logisic_regression_clf(k_features = k_features, feature_selection_method=feature_selection_method, normalized=normalized, verbose=False)
    clf.fit(train_x, train_y)

    # test
    accuracies.append(clf.test(test_x, test_y))

In [None]:
# Average the collected classifier_metrics results over the bootstrap runs
# (axis 0 indexes the runs).
np.mean(np.array(accuracies), axis=0)

In [None]:
# random and train_test_split are imported here as well: in the visible part of
# the notebook they are otherwise only imported by a later cell.
import random
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = tracks_ml.copy()

n_bootstraps = 10
test_size = 0.5
k_features = 1
feature_selection_method = "mutual_info_classif" # "f_classif"
normalized = False  # not read anywhere in this cell

# Repeated random train/test splits; one classifier_metrics result per split.
accuracies = []
for counter in range(n_bootstraps):

    # one dot per finished split as a minimal progress indicator
    sys.stdout.write('.')
    sys.stdout.flush()

    train_tracks, test_tracks = train_test_split(data, test_size = test_size, random_state=random.randint(0,2**32-1))

    # increase training set (the test split is left as it is)
    train_tracks = augment_data(train_tracks)
    train_y = train_tracks[['in_direction','out_direction']]
    train_x = get_features(train_tracks)

    test_y = test_tracks[['in_direction','out_direction']]
    test_x = get_features(test_tracks)

    # Without feature selection both models use every feature column.
    features_in_dir = train_x.columns
    features_out_dir = train_x.columns

    if k_features is not None:
        # any method other than "mutual_info_classif" falls back to f_classif
        score_func = mutual_info_classif if feature_selection_method == "mutual_info_classif" else f_classif

        selector_in_dir = SelectKBest(score_func, k=k_features)
        selector_in_dir.fit(train_x, train_y['in_direction'])
        features_in_dir = train_x.columns[selector_in_dir.get_support(indices=True)]

        selector_out_dir = SelectKBest(score_func, k=k_features)
        selector_out_dir.fit(train_x, train_y['out_direction'])
        features_out_dir = train_x.columns[selector_out_dir.get_support(indices=True)]

    ## in_dir: train on the selected columns, predict on the test split
    clf_in = LogisticRegression()
    clf_in.fit(train_x[features_in_dir], train_y['in_direction'])
    pred_y_in = clf_in.predict(test_x[features_in_dir])

    ## out_dir: same procedure with its own feature subset
    clf_out = LogisticRegression()
    clf_out.fit(train_x[features_out_dir], train_y['out_direction'])
    pred_y_out = clf_out.predict(test_x[features_out_dir])

    # get accuracy (test_y and pred_y are read again by a later cell)
    pred_y = pd.DataFrame({
        'pred_in_direction': pred_y_in,
        'pred_out_direction': pred_y_out,
    })

    accuracies.append(classifier_metrics(test_y, pred_y))

In [None]:
# Average the collected classifier_metrics results over the bootstrap runs
# (axis 0 indexes the runs).
np.mean(np.array(accuracies), axis=0)

In [None]:
# test_y / pred_y still hold the values of the last bootstrap iteration, so this
# reports on that single split only.
# NOTE(review): print_output=True presumably makes classifier_metrics print its report - confirm.
classifier_metrics(test_y, pred_y, print_output=True)

## Visualize the Features

## 3.1 Lernen Logistische Regression

In [None]:
from sklearn.model_selection import train_test_split
from math import pi
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# zeug von tschopo
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

def train_log_regression_adv(data, test_size=0.2, k_features=5):
    """Train a logistic regression for ``in_direction`` on one random split and score it.

    The training part is enlarged with ``augment_data``; the ``k_features``
    columns ranked best by mutual information are selected on the training
    part only and then reused unchanged for the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled tracks; must contain ``in_direction`` and whatever
        ``get_features`` reads.
    test_size : float
        Fraction of ``data`` held out for testing.
    k_features : int
        Number of feature columns kept by ``SelectKBest``.

    Returns
    -------
    float
        Accuracy on the held-out part, in the range 0..1.
    """
    # A new random seed for every call, so repeated calls evaluate different splits.
    train_tracks, test_tracks = train_test_split(
        data, test_size=test_size, random_state=random.randint(0, 2**32 - 1))

    # increase training set
    train_tracks = augment_data(train_tracks)
    train_y = train_tracks['in_direction']
    train_x = get_features(train_tracks, normalize=False)

    # feature selection, fitted on the training data only
    selector = SelectKBest(mutual_info_classif, k=k_features)
    selector.fit(train_x, train_y)
    col_names = train_x.columns[selector.get_support(indices=True)]

    # train on training set
    clf = LogisticRegression()
    clf.fit(train_x[col_names], train_y)

    # create test set with the same feature settings and columns as in training
    test_y = test_tracks['in_direction']
    test_x = get_features(test_tracks, normalize=False)[col_names]

    # calculate accuracy
    return float(accuracy_score(test_y, clf.predict(test_x)))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# taken from: https://stackoverflow.com/questions/783897/truncating-floats-in-python
def truncate(f, n):
    '''Format f as a string with exactly n digits after the decimal point.

    Extra digits are cut off without rounding and missing digits are padded
    with zeros. Values whose default string form uses exponent notation are
    formatted with fixed-point formatting instead.
    '''
    text = '{}'.format(f)
    if 'e' in text.lower():
        return '{0:.{1}f}'.format(f, n)
    whole, _sep, fraction = text.partition('.')
    padded = fraction + '0' * n
    return whole + '.' + padded[:n]

# Ten independent train/evaluate runs, accuracies in percent.
# (Without a random_state some results were identical on the MacBook used for testing.)
y = [train_log_regression_adv(tracks_ml) * 100 for _run in range(1, 11)]
x = range(1, len(y) + 1)

avg = sum(y) / float(len(y))
# lowest y tick: one full 10 % step below the worst run
y_tick_start = (min(y) // 10) * 10 - 10
avg_label = 'avg. = ' + str(truncate(avg, 1))

plt.plot(x, y, 'bo')
plt.axhline(y=avg, color='r', linestyle='-')
plt.ylabel("Accuracy in %")
plt.yticks(np.arange(y_tick_start, 110, 10.0))
plt.xticks([])
plt.annotate(avg_label, xy=(2 / 3 * len(x), avg + 2))
plt.savefig('accuracy_basic_features.jpg', dpi=150)

In [None]:
# Display any figure that is still open.
plt.show()

## 3.1 Alg: Logistische Regression

## 3.2 Lernen Decision Tree

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import sys

# 50/50 split of the labelled tracks. The seed is drawn from `random`, so the
# split differs every time this cell runs.
train_x, test_x = train_test_split(tracks_ml, test_size = 0.5, random_state=random.randint(0,2**32-1))

In [None]:
# Pass only the training half through augment_data; test_x is left unchanged.
# NOTE(review): augment_data presumably enlarges the training set - confirm against its definition.
train_x = augment_data(train_x)

In [None]:
# Separate the in_direction label from the training tracks, then replace
# train_x by its (non-normalized) feature matrix.
# NOTE(review): train_x is rebound here, so re-running this cell on its own
# presumably fails - the feature frame does not appear to keep 'in_direction'.
train_y = train_x['in_direction']
train_x = get_features(train_x, normalize=False)

In [None]:
# Random forest with 100 trees. Use ExtraTreesClassifier(n_estimators=100)
# here instead to compare against extremely randomized trees.
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(train_x, train_y)

In [None]:
# Importance score of every feature, in the column order of train_x.
clf.feature_importances_ 

In [None]:
# Feature names ordered from most to least important
# (argsort sorts ascending, [::-1] reverses it).
train_x.columns[np.argsort(clf.feature_importances_, )[::-1]]

In [None]:
# Evaluate on the held-out half. The features are built with the same explicit
# setting (normalize=False) that was used for the training features.
test_y = test_x['in_direction']
test_x = get_features(test_x, normalize=False)
pred_y = clf.predict(test_x)
accuracy_score(test_y, pred_y)

## 3.2 Alg Decision Tree

# Vergleichsfunktion Algorithmus mit Labels

In [None]:
def getAccuracy():
    """Compare the machine-learned directions with the ground truth.

    Input: ``bees_test.csv`` (read through ``loadGTD``) and
    ``MachineLearnedData.csv``.

    Both tables are de-duplicated and matched on ``bee_id``, ``time_in`` and
    ``time_out``. For every matched row the function checks whether
    ``in_direction`` and ``out_direction`` agree, writes that comparison to
    ``result.csv`` and prints summary statistics.

    Returns
    -------
    pandas.DataFrame
        One row per comparable bee with the columns ``bee_id``, ``in_equals``
        and ``out_equals``.
    """
    key_columns = ['bee_id', 'time_in', 'time_out']

    # delete existing result.csv file to prevent errors
    if os.path.isfile('./result.csv'):
        os.remove("result.csv")

    # store data in DataFrames
    gtd = loadGTD('bees_test.csv')
    mld = pd.read_csv('MachineLearnedData.csv')

    # change column names so that both frames share the merge keys
    gtd = gtd.rename(columns={'video_start_time': 'time_in', 'video_end_time': 'time_out'})

    # delete duplicates
    mld = mld.drop_duplicates(key_columns, keep='first')
    gtd = gtd.drop_duplicates(key_columns, keep='first')

    # cast bee_id from float to int so it has an integer dtype like in gtd
    mld = mld.assign(bee_id=mld['bee_id'].astype('int32'))

    # Rows present in both tables; the direction columns of gtd get the
    # suffix _x and those of mld the suffix _y.
    mergeableBees = pd.merge(gtd, mld, on=key_columns, how='inner')

    # per-bee comparison
    result = pd.DataFrame()
    result['bee_id'] = mergeableBees['bee_id']
    result['in_equals'] = mergeableBees['in_direction_x'] == mergeableBees['in_direction_y']
    result['out_equals'] = mergeableBees['out_direction_x'] == mergeableBees['out_direction_y']

    # store to result.csv
    result.to_csv('result.csv', encoding='utf-8', index=False)

    n_ground_truth = len(gtd)
    n_comparable = len(result)

    if n_comparable == 0:
        # Also covers an empty ground truth: the inner merge is then empty too.
        print("Nothing to compare!")
    else:
        # A row is fully correct only if BOTH directions match; comparing the
        # two flags with == would also count rows where both are wrong.
        both_equal = result['in_equals'] & result['out_equals']

        # calculate statistics
        inCorrect = round(result['in_equals'].sum() / n_comparable * 100, 2)
        outCorrect = round(result['out_equals'].sum() / n_comparable * 100, 2)
        allCorrect = round(both_equal.sum() / n_comparable * 100, 2)
        aveCorrect = round((allCorrect + inCorrect + outCorrect) / 3, 2)
        # share of ground-truth rows that could be compared at all, in percent
        coverage = (n_comparable / n_ground_truth) * 100

        # print statistics
        print(n_ground_truth, 'entries in GroundTruthData.', n_comparable, 'entries compareable in MachineLearnedData: ', round(coverage, 2), '%' )
        print(allCorrect, "% total correctness.")
        print(inCorrect, "% Ingoing correctness.")
        print(outCorrect, "% Outgoing correctness.")
        print(aveCorrect, "% Average correctness.")
        print("Accuracy of ",round(coverage*(allCorrect/100),2),"%.")

    # end compare algorithm
    print('\nCheck result.csv file or the returned DataFrame for more details.')

    return result


In [None]:
# Run the comparison: prints the statistics and writes result.csv.
getAccuracy()