In [1]:
import os, cv2, glob
import numpy as np
import pandas as pd

## Explore data duration

In [2]:
# Define the dataset folder.
# The location can be overridden with the AUTISM_DATASET_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original hard-coded path is used.
dir_dataset = os.environ.get(
    'AUTISM_DATASET_DIR',
    "C:/Users/36394/Study/GWU/PHD in Biomedical Engineer/Research/FOS/Autism_dataset",
)
# Folder that is walked for the raw video files.
dir_raw = os.path.join(dir_dataset, 'Data')


In [3]:
# Find all the video paths.
# Walk the raw-data tree and keep every file that is neither a spreadsheet
# (.xlsx) nor an archive (.zip). The extension is compared case-insensitively
# so that files such as 'labels.XLSX' or 'backup.ZIP' are excluded as well.
NON_VIDEO_EXTENSIONS = {'.xlsx', '.zip'}
paths_video = [
    os.path.join(path, file_name)
    for path, dir_list, file_list in os.walk(dir_raw)
    for file_name in file_list
    if os.path.splitext(file_name)[1].lower() not in NON_VIDEO_EXTENSIONS
]

In [4]:
# Measure every video and split the set by duration.
# Durations are in minutes: (frame count / frame rate) / 60.
# "Good" videos satisfy 8 < duration < 15.
MIN_GOOD_MINUTES = 8
MAX_GOOD_MINUTES = 15

videos = {'name': [], 'duration': []}
for path_video in paths_video:
    cap = cv2.VideoCapture(path_video)
    try:
        if not cap.isOpened():
            # Not a readable video: leave it out, as before.
            continue
        rate = cap.get(cv2.CAP_PROP_FPS)
        FrameNumber = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release the capture for every file, readable or not, so handles
        # do not pile up while scanning the whole dataset.
        cap.release()

    if rate <= 0:
        # Without a frame rate the duration cannot be computed (the division
        # below would raise ZeroDivisionError).
        print('No frame rate reported, skipped: {}'.format(path_video))
        continue

    duration = (FrameNumber/rate) / 60
    # Path relative to the raw-data folder. Unlike splitting on 'Data\\',
    # this does not depend on the separator style or on folder names.
    name = os.path.relpath(path_video, dir_raw)
    videos['name'].append(name)
    videos['duration'].append(duration)

df_videos = pd.DataFrame(videos)
df_videos = df_videos.sort_values('duration')

# NOTE: the comparisons are strict, so a video lasting exactly 8 or exactly
# 15 minutes lands in none of the three groups.
df_long_videos = df_videos.loc[df_videos['duration'] > MAX_GOOD_MINUTES]
df_short_videos = df_videos.loc[df_videos['duration'] < MIN_GOOD_MINUTES]
df_good_videos = df_videos.loc[
    (MIN_GOOD_MINUTES < df_videos['duration'])
    & (df_videos['duration'] < MAX_GOOD_MINUTES)
]

In [5]:
# Save the three duration groups (short, long and good videos) as CSV files.
# All three are written the same way, without the DataFrame index. The
# good-video file previously used index_label=False, which still writes the
# index column and only drops its header field, so its layout differed from
# the other two files.
df_short_videos.to_csv('short_video.csv', index=False)
df_long_videos.to_csv('long_video.csv', index=False)
df_good_videos.to_csv('good_video.csv', index=False)

## Data preprocessing

In [8]:
# Process each good video.
#
# For every video:
#   1. read the sheet of the sibling .xlsx workbook that matches the session
#      keyword found in the video file name,
#   2. crop the sheet to the table delimited by the two 'Time' cells,
#   3. cut the video into one clip per table row and save the clip once for
#      every column whose cell equals 1, under Data_processed/<label>/...
#
# A video that cannot be processed is reported and skipped. The loop no
# longer stops at the first problem, so one misnamed file does not prevent
# the remaining videos from being processed.

CLIP_SECONDS = 10  # seconds of video covered by one table row
SESSION_KEYWORDS = ('Alone', 'Instruction', 'Playtime')


def _read_session_sheet(path_label, video_basename):
    """Read the workbook sheet that matches the session of a video.

    The first keyword of SESSION_KEYWORDS contained in `video_basename`
    selects the sheet; sheet names are compared case-insensitively.

    Returns the sheet as a DataFrame, or None when the file name contains no
    keyword or the workbook has no matching sheet.
    """
    keyword = next((k for k in SESSION_KEYWORDS if k in video_basename), None)
    if keyword is None:
        return None
    # The context manager closes the workbook handle once the sheet is read.
    with pd.ExcelFile(path_label) as workbook:
        matches = [sheet for sheet in workbook.sheet_names
                   if str(sheet).strip().lower() == keyword.lower()]
        if not matches:
            return None
        # Prefer the all-lower-case spelling when several spellings exist,
        # which is the order the lookup used before.
        matches.sort(key=lambda sheet: sheet != keyword.lower())
        return pd.read_excel(workbook, sheet_name=matches[0])


def _extract_label_table(df_sheet):
    """Crop a raw sheet to its table of labelled intervals.

    The first column containing the string 'Time' must contain it twice: the
    first hit marks the header row, the second the row just after the table.
    The header row must itself contain 'Time' twice; the columns from the
    first (inclusive) to the second (exclusive) are kept. The duplicated
    'Int' and 'Aff' headers become '<name>_parent' and '<name>_child'.

    Returns the cropped DataFrame, or None when the layout is not as expected.
    """
    index_head = 0
    index_tail = 0
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for _, col in df_sheet.items():
        hits = col[col == 'Time'].index
        if len(hits) > 0:
            if len(hits) < 2:
                return None
            index_head, index_tail = hits[0], hits[1]
            break

    # Sanity check kept from the original code: very small indices (including
    # the 0/0 case when 'Time' was never found) are treated as a layout error.
    if index_head + index_tail < 20:
        return None

    table = df_sheet.copy()
    table.columns = table.iloc[index_head].values
    time_columns = np.where(table.columns == 'Time')[0]
    if len(time_columns) != 2:
        return None
    ind_col_head, ind_col_tail = time_columns
    table = table.iloc[index_head + 1:index_tail, ind_col_head:ind_col_tail]

    # Build a fresh list of names instead of writing into `columns.values`,
    # which mutates the Index behind pandas' back.
    columns = list(table.columns)
    for base in ('Int', 'Aff'):
        positions = [i for i, c in enumerate(columns)
                     if isinstance(c, str) and c == base]
        if len(positions) >= 2:
            columns[positions[0]] = base + '_parent'
            columns[positions[1]] = base + '_child'
    table.columns = columns
    return table


def _write_clip(frames, path_save_video, fps, size):
    """Write `frames` to `path_save_video` as an mp4 file."""
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    writer = cv2.VideoWriter(path_save_video, fourcc, fps, size)
    try:
        for frame in frames:
            writer.write(frame)
    finally:
        writer.release()


# Output root next to the raw-data folder. Built with os.path.join rather than
# str.replace('Data', ...), which rewrote every 'Data' substring of the path.
dir_processed = os.path.join(os.path.dirname(os.path.normpath(dir_raw)), 'Data_processed')

for num_video, name in enumerate(df_good_videos.name.tolist()):
    path_video = os.path.join(dir_raw, name)

    # The workbook is the first .xlsx file in the video's folder.
    paths_label = glob.glob(
        os.path.join(glob.escape(os.path.dirname(path_video)), '*.xlsx'))
    df_label = None
    if paths_label:
        df_label = _read_session_sheet(paths_label[0], os.path.basename(path_video))
    if df_label is None:
        print("Bad data here, need to fixed: print {}".format(path_video))
        continue

    df_label = _extract_label_table(df_label)
    if df_label is None:
        print('Something wrong for the head and tail index: {}'.format(path_video))
        continue

    # Preprocess the video.
    cap = cv2.VideoCapture(path_video)
    try:
        if not cap.isOpened():
            print('Cannot open video, skipped: {}'.format(path_video))
            continue
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            print('No frame rate reported, skipped: {}'.format(path_video))
            continue
        size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        num_frame = 0  # index of the next frame that cap.read() will return

        for _, row in df_label.iterrows():
            # Initial set up for the interval.
            start_time = row['Time']
            if not (hasattr(start_time, 'hour') and hasattr(start_time, 'minute')):
                # Blank or malformed time cell: nothing to cut for this row.
                continue
            # The hour field is counted as minutes and the minute field as
            # seconds -- presumably the sheet holds mm:ss offsets that are
            # parsed as hh:mm; TODO confirm against the workbooks.
            frame_start = start_time.minute * fps + start_time.hour * 60 * fps
            frame_end = CLIP_SECONDS * fps + frame_start

            # Read forward up to the end of the interval. Frames before the
            # interval start (a gap between rows) are read but not kept.
            clip_frames = []
            end_of_video = False
            while num_frame <= frame_end:
                success, frame = cap.read()
                if not success:
                    end_of_video = True
                    break
                if num_frame >= frame_start:
                    clip_frames.append(frame)
                num_frame += 1

            # Save the clip once per label set to 1; never write empty files.
            if clip_frames:
                labels = [col for col, value in zip(row.index, row.values) if value == 1]
                for label in labels:
                    dir_save_video = os.path.join(
                        dir_processed, '{}'.format(label), os.path.dirname(name))
                    os.makedirs(dir_save_video, exist_ok=True)
                    name_save_video = '{}_{}.mp4'.format(
                        num_video, len(os.listdir(dir_save_video)))
                    path_save_video = os.path.join(dir_save_video, name_save_video)
                    _write_clip(clip_frames, path_save_video, fps, size)

            if end_of_video:
                # No frames left: the remaining rows cannot produce clips.
                break
    finally:
        # Release the capture whether the video was processed or skipped.
        cap.release()
    


Bad data here, need to fixed: print C:/Users/36394/Study/GWU/PHD in Biomedical Engineer/Research/FOS/Autism_dataset\Data\5th\LJH\5th_LJH_Following Intructions.mp4


## Process the short videos

In [16]:
# Distinct file names (folders stripped) among the short videos.
np.unique(list(map(os.path.basename, df_short_videos['name'])))

array(['Following Instructions1.MP4', 'Following Instructions2.MP4',
       'Home_Following Instructions_Clean Up.MP4',
       'Home_Following Instructions_Clean Up.mp4',
       'Home_Following Instructions_Freely.MP4',
       'Home_Following Instructions_Freely.mpg',
       'Home_Following Instructions_List.MP4', 'Home_Playing Alone.mp4',
       'Home_Playing Alone1.MP4', 'Home_Playing Alone2.MP4',
       'Home_Playtime.mp4', 'Home_Playtime_Casual Interaction.MP4',
       'Home_Playtime_New Toys.MP4',
       'Home_Playtime_Physical Activity.MP4',
       'Home_Playtime_Physical Activity.mpg',
       'Hospital_Following Instructions_Clean Up.MP4',
       'Hospital_Following Instructions_Freely.MP4',
       'Hospital_Following Instructions_List.MP4',
       'Hospital_Playing Alone 1.MP4', 'Hospital_Playing Alone 2.MP4',
       'Hospital_Playtime_Casual Interaction.MP4',
       'Hospital_Playtime_New Toys.MP4', 'Hospital_Playtime_New toys.MP4',
       'Hospital_Playtime_Physical Activity 

In [12]:
# Names of the short videos, as stored in the 'name' column.
df_short_videos['name']

Unnamed: 0,name,duration
92,20th\BAM\Hospital_Playtime_New Toys.MP4,2.013122
86,20th\BAM\Hospital_Following Instructions_Clean...,2.044264
120,21th\KSJ\Hospital_Playtime_New Toys.MP4,2.046489
157,23th\JBH\Hospital_Following Instructions_Clean...,2.064284
115,21th\KSJ\Hospital_Following Instructions_Clean...,2.074851
...,...,...
180,5th\LCD\Home_Playtime.mp4,5.373702
123,21th\LJH\Home_Following Instructions_Freely.MP4,5.761867
87,20th\BAM\Hospital_Following Instructions_Freel...,6.006556
80,19th\KDH\Home_Playing Alone1.MP4,7.558940


In [14]:
# Short videos whose name contains 'Alone'.
list(filter(lambda video_name: 'Alone' in video_name, df_short_videos['name']))

['5th\\LCD\\Home_Playing Alone.mp4',
 '19th\\KDH\\Home_Playing Alone2.MP4',
 '20th\\BAM\\Hospital_Playing Alone 1.MP4',
 '20th\\BAM\\Hospital_Playing Alone 2.MP4',
 '19th\\KDH\\Home_Playing Alone1.MP4']

In [20]:
# Sum of the 'duration' column over all measured videos.
df_videos['duration'].sum()

1567.6955324296296