In [1]:
import numpy as np
import random as rn
import pandas as pd
import pickle
from video_process_utils import *

In [2]:
# Load the pickled mapping that the scan loop later walks with .items().
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
with open('data/processed/all_videos_dict.pickle', mode='rb') as pickle_file:
    all_videos = pickle.load(pickle_file)

In [3]:
# Read the processed metadata table and keep only the first row seen for each videoid.
alldata_processed = (
    pd.read_csv("./data/processed/alldata_processed.csv")
    .groupby(['videoid'], as_index=False)
    .head(1)
)

In [4]:
# Only these columns factor in the calculation of missing data.
COLS_USED = [
    LANK, RANK,
    LKNE, RKNE,
    LHIP, RHIP,
    LBTO, RBTO,
]

In [5]:
# Scan every video and record [videoid, pct_missing, n_segments]:
#   pct_missing - worst-column missing fraction (per max_pct_nan_or_zero_given_cols)
#                 over the first MAX_FRAMES frames, restricted to COLS_USED
#   n_segments  - number of full vid_length-frame windows whose worst-column missing
#                 fraction is at most MAX_PCT_MISSING
MAX_FRAMES = 500        # only the first 500 frames of each video are examined
SEGMENT_STRIDE = 31     # frame offset between consecutive candidate windows
MAX_PCT_MISSING = 0.25  # largest tolerated missing fraction for a usable window

result = []
vid_length = 124
start_idx = 0
for videoid, raw_video in all_videos.items():
    # Skip entries that are not 2-D or that cannot hold even one full window.
    if len(raw_video.shape) != 2 or len(raw_video) < vid_length:
        continue

    raw_video = drop_confidence_cols(raw_video[:MAX_FRAMES, :].copy())
    # Missing-data score for the (truncated) video as a whole.
    pct_missing = max_pct_nan_or_zero_given_cols(raw_video, COLS_USED)

    # Visit only start offsets that leave room for a complete window. Previously every
    # offset was scored, including truncated or empty tail chunks of shorter videos,
    # which were then thrown away by a length check (wasted work, and an empty chunk
    # presumably makes the helper divide by zero - TODO confirm against the helper).
    # The exclusive bound MAX_FRAMES - vid_length is kept so segment counts are unchanged.
    stop_idx = min(len(raw_video) - vid_length + 1, MAX_FRAMES - vid_length)

    n_segments = 0
    for i in range(start_idx, stop_idx, SEGMENT_STRIDE):
        raw_video_chunk = raw_video[i:i + vid_length, :]
        pct_missing_chunk = max_pct_nan_or_zero_given_cols(raw_video_chunk, COLS_USED)
        if pct_missing_chunk <= MAX_PCT_MISSING:
            n_segments += 1

    result.append([videoid, pct_missing, n_segments])

# NOTE: np.array coerces the mixed [id, fraction, count] rows to a single common dtype.
result = np.array(result)

  return np.max(np.sum(mask,axis=0))*1.0/len(A)


In [6]:
# Tabulate the per-video scan, keep videos with at most 25% missing data, and attach
# each video's Patient_ID (inner join: videos without metadata are dropped).
df = (
    pd.DataFrame(np.array(result), columns=['videoid', 'pct_missing', 'n_segments'])
    .loc[lambda frame: frame['pct_missing'] <= 0.25]
    .merge(right=alldata_processed[['videoid', 'Patient_ID']], on=['videoid'], how='inner')
)

In [7]:
# Draw one uniform [0, 1) number per distinct Patient_ID; the fixed seed keeps the
# patient -> number assignment reproducible across runs.
np.random.seed(1)
rand_df = pd.DataFrame({'Patient_ID': alldata_processed['Patient_ID'].unique()})
rand_df['random_num'] = np.random.uniform(low=0, high=1, size=len(rand_df))

In [8]:
# Attach each patient's random number to every one of that patient's videos.
df = df.merge(rand_df, how='left', on='Patient_ID')

In [9]:
#80/10/10 split (so roughly 200 in each of validation/test)
def assign_split(x, train_cutoff=0.80, validation_cutoff=0.9):
    """Map a number to the name of a dataset split.

    Parameters
    ----------
    x : float
        Value to bucket (the notebook passes a per-patient random number).
    train_cutoff : float, default 0.80
        Values less than or equal to this go to "train".
    validation_cutoff : float, default 0.9
        Values above ``train_cutoff`` and less than or equal to this go to
        "validation".

    Returns
    -------
    str
        "train", "validation" or "test".

    Raises
    ------
    ValueError
        If ``train_cutoff`` is greater than ``validation_cutoff``.

    Notes
    -----
    A NaN ``x`` fails both comparisons and is therefore labelled "test".
    """
    if train_cutoff > validation_cutoff:
        raise ValueError("train_cutoff must not exceed validation_cutoff")
    if x <= train_cutoff:
        return "train"
    # Reaching here already implies x > train_cutoff (or x is NaN).
    if x <= validation_cutoff:
        return "validation"
    return "test"
# Label every video with the split implied by its patient's random number.
df['dataset'] = df['random_num'].map(assign_split)

In [10]:
# Verify that videos from each Patient_ID appear in only one split.
splits_per_patient = df.groupby('Patient_ID')['dataset'].nunique()
assert splits_per_patient.max() == 1

In [11]:
# Persist the per-video split assignment (row index omitted from the file).
df.to_csv(path_or_buf='./data/processed/train_test_valid_id_split.csv', index=False)

In [12]:
# Number of videos in each split.
df.loc[:, 'dataset'].value_counts()

train         1768
test           232
validation     212
Name: dataset, dtype: int64

In [13]:
# Total number of usable segments in each split.
df.groupby(by='dataset')['n_segments'].sum()

dataset
test           2690.0
train         20426.0
validation     2469.0
Name: n_segments, dtype: float64

In [14]:
# Total number of usable segments across all splits.
df['n_segments'].sum()

25585.0

In [15]:
# Average number of usable segments per video in each split.
df.groupby(by='dataset')['n_segments'].mean()

dataset
test          11.594828
train         11.553167
validation    11.646226
Name: n_segments, dtype: float64