# LSTM

In [None]:
from pandas import DataFrame
from sklearn.impute import SimpleImputer
from numpy import nan, arange, array
import seaborn as sns
from matplotlib import pyplot as plt
import tensorflow as tf
from numpy import ndarray, mean, float32, ones, array, concatenate, unique
from pandas import Series, concat
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedGroupKFold
from typing import Tuple
from glob import glob

from src.labels import iterate_valid_labels, find_valid_segments, get_labels_from_video,\
    get_labels_as_dataframe, value_to_name
from src.sampling.landmarks import get_landmark_df_path
from src.common.helpers import read_dataframe
from src.hpe_dnn.helpers import binarize_labels, unbinarize_labels

loaded labels


In [2]:
all_features = read_dataframe("data/df/rnn/cvs_features.pkl")

feature_placeholder = ones(shape=(all_features.shape[0]))
groups = all_features["group"]
labels = all_features["label"]

In [None]:
sgkf1 = StratifiedGroupKFold(n_splits=10, shuffle=False)
for i, (train_temp, test_index) in enumerate(sgkf1.split(feature_placeholder, labels, groups)):
    sgkf2 = StratifiedGroupKFold(n_splits=10, shuffle=False)
    train_index, val_index = list(sgkf2.split(train_temp, labels[train_temp], groups[train_temp]))[i]
    train_index = train_temp[train_index]
    val_index = train_temp[val_index]

    train_groups = unique(groups[train_index])
    val_groups = unique(groups[val_index])
    test_groups = unique(groups[test_index])

    print(f"Fold {i}:")
    print(f"  Train: groups={train_groups}")
    print(f"  Val:  groups={val_groups}")
    print(f"  Test:  groups={test_groups}")

Fold 0:
  Train: groups=[  0   1   2   3   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  25  26  27  28  29  30  31  32  33  34  37  39  40
  41  42  43  44  46  47  48  50  51  52  53  54  55  56  57  58  59  61
  62  63  64  65  67  68  69  70  71  72  73  74  75  76  77  80  81  82
  83  87  88  89  90  91  93  95  96  97  98  99 101 102 104 106 107 108
 109 110 111 113 114 115 116 117 118 119 120 121 122 125 126 127 128 129
 131 133 134 135 136 137 138 139 142 144 146 148 149 151 153 154 155 156
 157 158 159 160 161 163 164 165 167 168 169 170 172 173 174 175 176 177
 178 180 181 182 183 184 186 187 188 189 190 192 193 194 195 196 197 198
 199 200 201 202 204 205 206 209 210 211 212 213 214 216 217 219 220 221
 222 223 224 225 226 227 228 229]
  Val:  groups=[  4  24  35  49  60  79  84  86  92 103 123 140 145 152 171 191 203 208
 215 218]
  Test:  groups=[ 36  38  45  66  78  85  94 100 105 112 124 130 132 141 143 147 150 162
 166 179 185 207]
Fold 1:

In [None]:
def count_label_frames(groups: list) -> dict:
    label_frame_count = {label: 0 for label in iterate_valid_labels()}
    for group in groups:
        segment_df = all_features.query(f"group == {group}")
        labels = segment_df["label"]
        label_count = labels.value_counts()
        
        for key in label_frame_count.keys():
            if key in label_count.keys():
                label_frame_count[key] += label_count[key]
    return label_frame_count

In [None]:
train_count = count_label_frames(train_groups)
val_count = count_label_frames(val_groups)
test_count = count_label_frames(test_groups)

print("Data splits (train/val/test):")
for key in train_count.keys():
    total = train_count[key] + val_count[key] + test_count[key]
    print(f"{key}: {train_count[key] / total:.1%} / {val_count[key] / total:.1%} / {test_count[key] / total:.1%}")

print()
print("Totals (train/val/test):")
for key in train_count.keys():
    total = train_count[key] + val_count[key] + test_count[key]
    print(f"{key}: {train_count[key]} / {val_count[key]} / {test_count[key]}")

Data splits (train/val/test):
NONE: 81.0% / 9.0% / 10.0%
FOOT_SWAP: 81.7% / 8.3% / 10.1%
OUTSIDE_FLAG: 81.3% / 8.7% / 10.0%
BACK_FLAG: 79.1% / 11.4% / 9.5%
INSIDE_FLAG: 80.6% / 9.2% / 10.2%
DROP_KNEE: 81.6% / 8.3% / 10.1%
CROSS_MIDLINE: 84.5% / 7.0% / 8.5%

Totals (train/val/test):
NONE: 30608 / 3405 / 3781
FOOT_SWAP: 1233 / 125 / 152
OUTSIDE_FLAG: 4226 / 454 / 518
BACK_FLAG: 2891 / 415 / 349
INSIDE_FLAG: 2875 / 327 / 363
DROP_KNEE: 3847 / 390 / 478
CROSS_MIDLINE: 2137 / 178 / 215


In [25]:
def impute_features(features: DataFrame) -> DataFrame:
    output = features.copy()
    imp = SimpleImputer(missing_values=nan, strategy='constant', fill_value=0, 
        keep_empty_features=True)
    output = DataFrame(imp.fit_transform(output), columns=output.keys())
    return output

def take_groups(df: DataFrame, groups: list) -> DataFrame:
    filtered = list(map(lambda group: df.query(f"group == {group}"), groups))
    return concat(filtered, axis=0, ignore_index=True)

def normalize_features(features: DataFrame, group: Series, train_group: list) -> DataFrame:
    temp = concat([features.copy(), group], axis=1)
    train_df = take_groups(temp, train_group)
    train_df = train_df.drop("group", axis=1)
    train_mean = train_df.mean()
    train_std = train_df.std()

    return (features - train_mean) / train_std

In [102]:
from src.labels import name_to_value

input_name = Series(list(iterate_valid_labels()))

output = binarize_labels(input_name)

print(output.values)

input_hat = unbinarize_labels(output)
print(all(input_hat == input_name))


[[0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0]]
True


In [35]:
class WindowGenerator():

    @property
    def train_df(self) -> DataFrame:
        return take_groups(self.data, self.train_groups)

    @property
    def val_df(self) -> DataFrame:
        return take_groups(self.data, self.val_groups)
    
    @property
    def test_df(self) -> DataFrame:
        return take_groups(self.data, self.test_groups)

    def __init__(self, input_width: int, label_width: int, shift: int, data: DataFrame,
            train_groups: list, val_groups: list, test_groups: list):
        df = data.copy()
        video = df.pop("video")
        frame_num = df.pop("frame_num")
        group = df.pop("group")

        # Transform labels to model ouputs
        labels_str = df.pop('label')
        labels_bin = binarize_labels(labels_str)
        labels = DataFrame(data=labels_bin, columns=list(iterate_valid_labels()))
        self.label_columns = list(iterate_valid_labels())

        # Remove missing values and normalize features
        df = impute_features(df)
        df = normalize_features(df, group, train_groups)

        # Store full dataset and splits
        self.data = concat([video, frame_num, group, df, labels], axis=1)
        self.train_groups = train_groups
        self.val_groups = val_groups
        self.test_groups = test_groups

        # Work out the label column indices.
        self.column_indices = {name: i for i, name in enumerate(self.data.columns)}
        self.label_columns_indices = {name: i for i, name in enumerate(self.label_columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = arange(self.total_window_size)[self.labels_slice]

    def __count_label_frames(self, df: DataFrame) -> dict:
        label_frame_count = {label: 0 for label in iterate_valid_labels()}
        labels = df["label"]
        label_count = labels.value_counts()
        
        for key in label_frame_count.keys():
            if key in label_count.keys():
                label_frame_count[key] += label_count[key]

        return label_frame_count

    def inspect_fold_split(self):
        train_df = self.train_df
        val_df = self.val_df
        test_df = self.test_df
        print('All shapes are: (frames, features)')
        print(f"Training data: {train_df.shape}")
        print(f"Val data: {val_df.shape}")
        print(f"Test data: {test_df.shape}")

        train_count = self.__count_label_frames(train_df)
        val_count = self.__count_label_frames(val_df)
        test_count = self.__count_label_frames(test_df)

        print("Data splits (train/val/test):")
        for key in train_count.keys():
            total = train_count[key] + val_count[key] + test_count[key]
            print(f"{key}: {train_count[key] / total:.1%} / {val_count[key] / total:.1%} / {test_count[key] / total:.1%}")

        print()
        print("Totals (train/val/test):")
        for key in train_count.keys():
            total = train_count[key] + val_count[key] + test_count[key]
            print(f"{key}: {train_count[key]} / {val_count[key]} / {test_count[key]}")

    def split_window(self, batch: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        inputs = batch[:, self.input_slice, :]
        output = batch[:, self.labels_slice, :]
        output = tf.stack([output[:, :, self.column_indices[name]] for name in self.label_columns],
            axis=-1)
        
        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        output.set_shape([None, self.label_width, None])

        return inputs, output

    def get_example(self):
        # Stack three slices, the length of the total window.
        example_batch = tf.stack([array(self.data[:self.total_window_size]),
            array(self.data[100:100+self.total_window_size]),
            array(self.data[200:200+self.total_window_size])])

        example_inputs, example_ouputs = self.split_window(example_batch)

        print('All shapes are: (batch, time, features)')
        print(f'Window shape: {example_batch.shape}')
        print(f'Inputs shape: {example_inputs.shape}')
        print(f'Labels shape: {example_ouputs.shape}')

        return example_inputs, example_ouputs

    def plot(self, model=None, plot_col='NOSE_x', max_subplots=3):
        inputs, labels = self.get_example()
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        frame_num_index = self.column_indices['frame_num']
        max_n = min(max_subplots, len(inputs))
        for n in range(max_n):
            plt.subplot(max_n, 1, n+1)
            plt.ylabel(f'{plot_col} [act]')
            feature_values = inputs[n, :, plot_col_index]
            x_axis = inputs[n, :, frame_num_index]
            
            plt.xlim((x_axis[0], x_axis[-1]+1))
            plt.plot(x_axis, feature_values,
                label='Inputs', marker='.', zorder=-10)

            label_col_index = plot_col_index
            if label_col_index is None:
                continue
            
            label_name = unbinarize_labels(array(labels[n, :, :]))
            label_x_position = x_axis[-1] + 0.2
            plt.text(label_x_position, mean(feature_values), label_name, c='#2ca02c', 
                label="Label")
            
            # plt.scatter(self.label_indices, labels[n, :, label_col_index],
            # 	edgecolors='k', label='Labels', c='#2ca02c', s=64)
            # if model is not None:
            # 	predictions = model(inputs)
            # 	plt.scatter(self.label_indices, predictions[n, :, label_col_index],
            # 		marker='X', edgecolors='k', label='Predictions',
            # 		c='#ff7f0e', s=64)

            if n == 0:
                plt.legend()

        plt.xlabel('Frame number')

    def make_dataset(self, data):
        data = array(data, dtype=float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=False,
            batch_size=1,)

        ds = ds.map(self.split_window)

        return ds

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}'])

In [36]:
w1 = WindowGenerator(input_width=5, label_width=1, shift=1, data=all_features, 
    train_groups=train_groups, val_groups=val_groups, test_groups=test_groups)
print(w1)

Total window size: 6
Input indices: [0 1 2 3 4]
Label indices: [5]


In [38]:
train_df = w1.train_df
label_frame_count = {label: sum(train_df[label]) for label in iterate_valid_labels()}

In [37]:
w1.inspect_fold_split()

All shapes are: (frames, features)
Training data: (47875, 124)
Val data: (5296, 124)
Test data: (5858, 124)


KeyError: 'label'

In [None]:
ds = w1.make_dataset(w1.data)

In [None]:
array(ds)

In [None]:
w1.plot()

In [None]:
plt.figure(figsize=(16, 6))
features_melted = temp.melt(var_name="Column", value_name="Raw")
ax = sns.violinplot(x="Column", y="Raw", data=features_melted)
_ = ax.set_xticklabels(temp.keys(), rotation=90)

In [None]:
import tensorflow as tf

from src.common.model import ClassificationModel

In [None]:
class LSTM(ClassificationModel):
    
    pass