In [1]:
import os, cv2, glob
import numpy as np
import pandas as pd

## Explore data duration

In [2]:
# Define the dataset folder.
# The location can be overridden with the AUTISM_DATASET_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used.
dir_dataset = os.environ.get(
    "AUTISM_DATASET_DIR",
    "C:/Users/36394/Study/GWU/PHD in Biomedical Engineer/Research/FOS/Autism_dataset",
)
# Root folder that is scanned for the raw recordings.
dir_raw = os.path.join(dir_dataset, 'Data')


In [3]:
# Find all the video paths.
# Walk dir_raw recursively and keep every file that is not a spreadsheet or an
# archive. The extension is compared case-insensitively, so files such as
# "LABEL.XLSX" are skipped as well.
non_video_extensions = ('.xlsx', '.zip')
paths_video = [
    os.path.join(path, file_name)
    for path, dir_list, file_list in os.walk(dir_raw)
    for file_name in file_list
    if not file_name.lower().endswith(non_video_extensions)
]

In [4]:
# Find the good video (8 < duration < 15)
# Durations are in minutes: frame count / fps gives seconds, / 60 gives minutes.
videos = {'name': [], 'duration': []}
for path_video in paths_video:
    cap = cv2.VideoCapture(path_video)
    try:
        if not cap.isOpened():
            # Files that OpenCV cannot open are left out of the table.
            continue
        rate = cap.get(cv2.CAP_PROP_FPS)
        FrameNumber = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release the capture on every path so file handles are not leaked.
        cap.release()
    if rate <= 0:
        # A zero frame rate would raise ZeroDivisionError below.
        print("Cannot read the frame rate of {}, skip it.".format(path_video))
        continue
    duration = (FrameNumber / rate) / 60
    # Path of the video relative to dir_raw (e.g. 14th\JJH\Playtime1.MP4 on Windows).
    name = os.path.relpath(path_video, dir_raw)
    videos['name'].append(name)
    videos['duration'].append(duration)
df_videos = pd.DataFrame(videos)
df_videos = df_videos.sort_values('duration')
# NOTE: the comparisons are strict, so a duration of exactly 8 or 15 minutes
# lands in none of the three tables.
df_long_videos = df_videos.loc[df_videos['duration'] > 15]
df_short_videos = df_videos.loc[df_videos['duration'] < 8]
df_good_videos = df_videos.loc[(8 < df_videos['duration']) & (df_videos['duration'] < 15)]

In [None]:
# Write the three duration tables to CSV files in the working directory.
# NOTE(review): the "good" table is written with index_label=False (the index
# column is still written, only its header is omitted) while the other two use
# index=False -- confirm whether this difference is intended.
tables_to_save = (
    (df_short_videos, 'short_video.csv', {'index': False}),
    (df_long_videos, 'long_video.csv', {'index': False}),
    (df_good_videos, 'good_video.csv', {'index_label': False}),
)
for table, csv_name, csv_options in tables_to_save:
    table.to_csv(csv_name, **csv_options)

In [None]:
# Report the number of good, long and short videos (one count per line).
for table in (df_good_videos, df_long_videos, df_short_videos):
    print(len(table))

107
1
108


In [None]:
# For every subject folder ("<session>/<subject>") that owns at least one short
# video, count how many video files the folder contains.
subject_keys = []
for name in df_short_videos.name.to_list():
    parts = name.split('\\')
    subject_keys.append(parts[0] + '/' + parts[1])
subjects_short_videos = np.unique(subject_keys)

# The previous glob pattern '*[!xlsx]' is a character class: it only rejects
# names whose LAST character is x, l or s, so it counted .zip files and dropped
# any file ending in one of those letters. Filter on the real extension instead,
# using the same exclusions as the video-path scan (.xlsx and .zip).
non_video_extensions = ('.xlsx', '.zip')
dict_subject_short_video = {}
for subject in subjects_short_videos:
    dir_subject = os.path.join(dir_raw, subject)
    list_videos = [
        path_entry
        for path_entry in glob.glob(os.path.join(dir_subject, '*'))
        if not path_entry.lower().endswith(non_video_extensions)
    ]
    dict_subject_short_video[subject] = len(list_videos)

## Data preprocessing

In [7]:
import re
# Find the good videos (duration < 15) and without Korean characters in the name.
# Durations are in minutes: frame count / fps gives seconds, / 60 gives minutes.
videos = {'name': [], 'duration': []}
for path_video in paths_video:
    cap = cv2.VideoCapture(path_video)
    try:
        if not cap.isOpened():
            # Files that OpenCV cannot open are left out of the table.
            continue
        rate = cap.get(cv2.CAP_PROP_FPS)
        FrameNumber = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Release the capture on every path so file handles are not leaked.
        cap.release()
    if rate <= 0:
        # A zero frame rate would raise ZeroDivisionError below.
        print("Cannot read the frame rate of {}, skip it.".format(path_video))
        continue
    duration = (FrameNumber / rate) / 60
    # Path of the video relative to dir_raw (e.g. 14th\JJH\Playtime1.MP4 on Windows).
    name = os.path.relpath(path_video, dir_raw)
    videos['name'].append(name)
    videos['duration'].append(duration)
df_videos = pd.DataFrame(videos)
# Drop all the videos with a duration of 15 minutes or more.
print(len(df_videos))
df_videos = df_videos.loc[df_videos['duration'] < 15]
df_videos = df_videos.sort_values('duration')
print(len(df_videos))
# Drop all the videos with Korean characters in the name.
# The mask is computed once and reused for both the report and the filter.
has_korean = df_videos['name'].str.contains('[ㄱ-ㅎㅏ-ㅣ가-힣]', regex=True, na=False)
print(df_videos.loc[has_korean])
df_videos = df_videos.loc[~has_korean]
print(len(df_videos))
# Sort the index
df_videos = df_videos.reset_index(drop=True)

216
215
                                  name  duration
207  5th\LJH\5th_LJH_Playtime_다른사람.mp4  1.394150
215      6th\SSH\6th_SSH_에서 떨어진 영상.mp4  2.674892
9    12th\LDK\12th_LDK_에서 떨어져나온 영상.mp4  4.644639
212


In [None]:
# Process each good video
# For every video in df_videos: load its label sheet, keep the label rows that
# belong to this video file, cut the video into one clip per label row
# (10 seconds each) and record the labels of every clip in processed_data.csv.
dict_processed = {'path': [], 'labels': []}

# (keyword searched in the video name, label rows covered by that recording).
# The first matching keyword wins, so the order of the entries matters.
rows_by_keyword = (
    ('interaction', slice(0, 30)),  # First 5 minutes
    ('activity', slice(30, 48)),    # 5 - 8 minutes
    ('toy', slice(48, 60)),         # 8 - 10 minutes
    ('clean', slice(0, 12)),        # First 2 minutes
    ('list', slice(12, 30)),        # 2 - 5 minutes
    ('freely', slice(30, 60)),      # 5 - 10 minutes
)
# Sheet keywords looked up in the file name, in order of precedence.
sheet_keywords = ('Alone', 'Instruction', 'Playtime')
names_good_videos = set(df_good_videos.name.tolist())
# The codec does not depend on the video, so it is built once.
fourcc = cv2.VideoWriter_fourcc(*'MP4V')

for name in df_videos.name.tolist():
    path_video = os.path.join(dir_raw, name)
    base_name = os.path.basename(path_video)

    # The label spreadsheet is the first .xlsx file next to the video.
    paths_label = glob.glob(os.path.join(os.path.dirname(path_video), '*.xlsx'))
    if not paths_label:
        print("No label spreadsheet found for: {}".format(path_video))
        break
    path_label = paths_label[0]

    # Pick the sheet that matches the kind of recording named in the file name.
    sheet_keyword = next((keyword for keyword in sheet_keywords if keyword in base_name), None)
    if sheet_keyword is None:
        print("Bad data here, need to fixed: print {}".format(path_video))
        break
    # The context manager closes the workbook handle once the sheet is loaded.
    with pd.ExcelFile(path_label) as workbook:
        names_sheet = workbook.sheet_names
        # Prefer the lower-case sheet name when the workbook has it.
        if sheet_keyword.lower() in names_sheet:
            sheet_name = sheet_keyword.lower()
        else:
            sheet_name = sheet_keyword
        df_label = pd.read_excel(workbook, sheet_name=sheet_name)

    # Find the row that contains all the table head.
    # The first column holding 'Time' at least twice gives the header row
    # (first hit) and the row that closes the table (second hit).
    index_head = 0
    index_tail = 0
    for _, col in df_label.items():
        index_head_tail = col[col == 'Time'].index
        if len(index_head_tail) >= 2:
            index_head = index_head_tail[0]
            index_tail = index_head_tail[1]
            break

    # Check head and tail error:
    if index_head + index_tail < 20:
        print('Something wrong for the head and tail index.')
        break

    # Change the head of the dataframe
    df_label.columns = df_label.iloc[index_head].values

    # Crop the df_label to keep valuable information
    ind_col_head, ind_col_tail = np.where(df_label.columns == 'Time')[0]
    df_label = df_label.iloc[index_head+1:index_tail]
    df_label = df_label.iloc[:, ind_col_head:ind_col_tail]

    # Change the duplicate header names (eg: Int to Int_parent and Int_child)
    # A fresh list of names is assigned instead of mutating the Index buffer.
    column_names = df_label.columns.tolist()
    for base in ('Int', 'Aff'):
        positions = [pos for pos, col_name in enumerate(column_names) if col_name == base]
        column_names[positions[0]] = base + '_parent'
        column_names[positions[1]] = base + '_child'
    df_label.columns = column_names

    # Check if the label is empty (fewer than 10 cells equal to 1).
    if (df_label.drop(columns=['Time']) == 1).values.sum() < 10:
        print("There is no label for this video: {}".format(path_video))
        continue

    # Crop the label to match the video
    match = re.search(r'(alone|playtime|instructions)\s*(\d+)', name, re.IGNORECASE)
    rows_keyword = next(
        (rows for keyword, rows in rows_by_keyword if re.search(keyword, name, re.IGNORECASE)),
        None,
    )
    if rows_keyword is not None:
        df_label = df_label.iloc[rows_keyword]
    elif match: # Crop the label to match the video if the video is like "Alone 1" or "Playtime 2"
        num_part = int(match.group(2))
        duration = df_videos.loc[df_videos['name'] == name, 'duration'].values[0]
        if duration < 4:
            print("The video {} is too short, skip it.".format(name))
            continue
        if num_part == 1:
            df_label = df_label.iloc[0:30] # First 5 minutes
        elif num_part == 2:
            df_label = df_label.iloc[30:60] # 5 - 10 minutes
        else:
            print("Something wrong with the video name: {}, break!!!!!!!!".format(name))
            break
    elif name in names_good_videos: # Good video's label doesn't need to be cropped
        pass
    else:
        print("Something wrong with the video name: {}, can't be processed".format(name))
        continue

    # Preprocess the video:
    cap = cv2.VideoCapture(path_video)
    if not cap.isOpened():
        cap.release()
        print("Cannot open the video: {}".format(path_video))
        break
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        size = (frame_width, frame_height)
        num_frame = 0

        # Clips are stored under <dataset>/Data_processed/<same sub-folders as the video>.
        dir_save_video = os.path.join(dir_dataset, 'Data_processed', os.path.dirname(name))
        os.makedirs(dir_save_video, exist_ok=True)

        # Cut the video into 10 seconds interval and save them.
        frame_offset = None
        for index, row in df_label.iterrows():
            # initial set up for interval
            # The formula treats .minute as seconds and .hour as minutes.
            start_time = row['Time']
            frame_start = start_time.minute * fps + start_time.hour * 60 * fps
            # Make the interval relative to the first kept label row: when the
            # label table was cropped above, its Time values keep their position
            # in the full table while the frame counter of this file starts at 0.
            # NOTE(review): assumes the first kept row corresponds to the first
            # frame of this video file -- TODO confirm against the spreadsheets.
            if frame_offset is None:
                frame_offset = frame_start
            frame_start -= frame_offset
            frame_end = 10 * fps + frame_start

            # Check the labels of this video and save it in the corresponding folder
            labels = [i for i in row.index if row[i]==1]
            name_save_video = '{}_{}.mp4'.format(os.path.basename(name), index)
            path_save_video = os.path.join(dir_save_video, name_save_video)

            # Crop the corresponding video, streaming frames straight to the
            # writer so a whole clip is never held in memory.
            videoWriter = cv2.VideoWriter(path_save_video, fourcc, fps, size)
            try:
                while num_frame <= frame_end:
                    success, frame = cap.read()
                    num_frame += 1
                    if not success:
                        continue
                    videoWriter.write(frame)
            finally:
                videoWriter.release()

            # NOTE(review): only the clip's base file name is stored; the same
            # base name occurs in several subject folders -- confirm that the
            # consumers of processed_data.csv can tell those clips apart.
            dict_processed['path'].append(name_save_video)
            dict_processed['labels'].append(labels)
    finally:
        cap.release()

    # Rewrite the summary after every video so finished work survives an interruption.
    df_processed = pd.DataFrame(dict_processed)
    df_processed.to_csv('processed_data.csv', index=False)
    cv2.destroyAllWindows()
    print("Finish processing video: {}".format(name))
    

In [None]:
# Distinct values found in the AV label column (missing cells counted as 0).
df_label.drop(columns='Time').fillna(0)['AV'].unique()

In [None]:
# Number of short videos whose name mentions clean, list or freely.
sum(
    1
    for video_name in df_short_videos['name'].tolist()
    if re.search(r'clean|list|freely', video_name, re.IGNORECASE)
)

In [None]:
# Number of short videos whose name mentions "following".
sum(
    1
    for video_name in df_short_videos['name'].tolist()
    if re.search(r'following', video_name, re.IGNORECASE)
)

In [None]:
# Short "following" videos whose name has none of clean / list / freely.
short_names = df_short_videos.name.tolist()
names_following = {n for n in short_names if re.search(r'following', n, re.IGNORECASE)}
names_segment = {n for n in short_names if re.search(r'clean|list|freely', n, re.IGNORECASE)}
names_following - names_segment

In [None]:
# Collect the short videos named like "Alone 1" / "Playtime2" together with
# the number that follows the keyword.
numbered_pattern = re.compile(r'(alone|playtime|instructions)\s*(\d+)', re.IGNORECASE)
numbered_hits = [
    (video_name, hit)
    for video_name in df_short_videos.name.tolist()
    for hit in [numbered_pattern.search(video_name)]
    if hit
]
name_12 = [video_name for video_name, _ in numbered_hits]
nums = [hit.group(2) for _, hit in numbered_hits]

In [None]:
# Distinct numbers found after the keyword in the numbered video names.
np.unique(np.array(nums))

In [None]:
# Rows of the duration table that belong to the numbered videos.
df_videos[df_videos['name'].isin(name_12)]

In [None]:
# Short videos whose name mentions toy, activity or interaction.
list(filter(
    lambda video_name: re.search(r'toy|activity|interaction', video_name, re.IGNORECASE),
    df_short_videos.name.tolist(),
))

In [None]:
# Short "following" videos whose name has none of clean / list / freely.
{
    video_name
    for video_name in df_short_videos.name.tolist()
    if re.search(r'following', video_name, re.IGNORECASE)
    and not re.search(r'clean|list|freely', video_name, re.IGNORECASE)
}

{'13th\\LJH\\13th_LJH_Following Instructions.mp4',
 '14th\\JJH\\Following Instructions1.MP4',
 '14th\\JJH\\Following Instructions2.MP4',
 '14th\\LKY\\Following Instructions1.MP4',
 '14th\\LKY\\Following Instructions2.MP4',
 '14th\\NSW\\Following Instructions1.MP4',
 '14th\\NSW\\Following Instructions2.MP4',
 '15th\\HTK\\Following Instructions1.MP4',
 '15th\\HTK\\Following Instructions2.MP4',
 '5th\\KYJ\\5th_KYJ_Following Instructions.mp4'}

In [None]:
# Collect the short videos named like "Alone 1" / "Playtime2" together with
# the number that follows the keyword.
nums, name_12 = [], []
for video_name in df_short_videos['name'].tolist():
    hit = re.search(r'(alone|playtime|instructions)\s*(\d+)', video_name, re.IGNORECASE)
    if hit is None:
        continue
    name_12.append(video_name)
    nums.append(hit.group(2))

In [None]:
# Distinct numbers found after the keyword in the numbered video names.
np.unique(np.array(nums))

array(['1', '2'], dtype='<U1')

In [None]:
# Rows of the duration table that belong to the numbered videos.
df_videos[df_videos['name'].isin(name_12)]

Unnamed: 0,name,duration
36,19th\KDH\Home_Playing Alone2.MP4,3.097261
56,15th\HTK\Following Instructions1.MP4,4.968853
57,14th\NSW\Playtime2.MP4,4.997771
58,14th\JJH\Following Instructions2.MP4,5.002219
59,14th\NSW\Following Instructions1.MP4,5.002776
61,14th\JJH\Following Instructions1.MP4,5.007781
62,14th\JJH\Playtime1.MP4,5.007781
63,14th\NSW\Playtime1.MP4,5.008893
64,14th\LKY\Following Instructions2.MP4,5.022796
65,15th\HTK\Playtime1.MP4,5.027244


In [None]:
# Short videos whose name mentions toy, activity or interaction.
[
    video_name
    for video_name in df_short_videos['name'].tolist()
    if re.search(r'toy|activity|interaction', video_name, re.IGNORECASE) is not None
]

['20th\\BAM\\Hospital_Playtime_New Toys.MP4',
 '21th\\KSJ\\Hospital_Playtime_New Toys.MP4',
 '23th\\JBH\\Hospital_Playtime_New Toys.MP4',
 '21th\\KHW\\Hospital_Playtime_New toys.MP4',
 '22th\\CHT\\Home_Playtime_New Toys.MP4',
 '20th\\SYJ\\Home_Playtime_New Toys.MP4',
 '24th\\UJW\\Home_Playtime_New Toys.MP4',
 '23th\\CEJ\\Home_Playtime_New Toys.MP4',
 '23th\\HSY\\Home_Playtime_New Toys.MP4',
 '24th\\JSM\\Home_Playtime_New Toys.MP4',
 '22th\\LSY\\Home_Playtime_New Toys.MP4',
 '21th\\LJH\\Home_Playtime_New Toys.MP4',
 '21th\\HKJ\\Home_Playtime_New Toys.MP4',
 '20th\\BAM\\Hospital_Playtime_Physical Activity.MP4',
 '21th\\KHW\\Hospital_Playtime_Physical Activity.MP4',
 '21th\\KSJ\\Hospital_Playtime_Physical Activity .MP4',
 '23th\\JBH\\Hospital_Playtime_Physical Activity.MP4',
 '24th\\UJW\\Home_Playtime_Physical Activity.mpg',
 '22th\\CHT\\Home_Playtime_Physical Activity.MP4',
 '20th\\SYJ\\Home_Playtime_Physical Activity.MP4',
 '23th\\CEJ\\Home_Playtime_Physical Activity.MP4',
 '23th\\HSY\\

In [None]:
# Process each good video
# For every video in df_good_videos: load its label sheet, cut the video into
# one clip per label row (10 seconds each) and write each clip once per active
# label into <dataset>/Data_processed/<label>/<same sub-folders as the video>.
# Sheet keywords looked up in the file name, in order of precedence.
sheet_keywords = ('Alone', 'Instruction', 'Playtime')
# The codec does not depend on the video, so it is built once.
fourcc = cv2.VideoWriter_fourcc(*'MP4V')

for num_video, name in enumerate(df_good_videos.name.tolist()):
    path_video = os.path.join(dir_raw, name)
    base_name = os.path.basename(path_video)

    # The label spreadsheet is the first .xlsx file next to the video.
    paths_label = glob.glob(os.path.join(os.path.dirname(path_video), '*.xlsx'))
    if not paths_label:
        print("No label spreadsheet found for: {}".format(path_video))
        break
    path_label = paths_label[0]

    # Pick the sheet that matches the kind of recording named in the file name.
    sheet_keyword = next((keyword for keyword in sheet_keywords if keyword in base_name), None)
    if sheet_keyword is None:
        print("Bad data here, need to fixed: print {}".format(path_video))
        break
    # The context manager closes the workbook handle once the sheet is loaded.
    with pd.ExcelFile(path_label) as workbook:
        names_sheet = workbook.sheet_names
        # Prefer the lower-case sheet name when the workbook has it.
        if sheet_keyword.lower() in names_sheet:
            sheet_name = sheet_keyword.lower()
        else:
            sheet_name = sheet_keyword
        df_label = pd.read_excel(workbook, sheet_name=sheet_name)

    # Find the row that contains all the table head.
    # The first column holding 'Time' at least twice gives the header row
    # (first hit) and the row that closes the table (second hit).
    index_head = 0
    index_tail = 0
    for _, col in df_label.items():
        index_head_tail = col[col == 'Time'].index
        if len(index_head_tail) >= 2:
            index_head = index_head_tail[0]
            index_tail = index_head_tail[1]
            break

    # Check head and tail error:
    if index_head + index_tail < 20:
        print('Something wrong for the head and tail index.')
        break

    # Change the head of the dataframe
    df_label.columns = df_label.iloc[index_head].values

    # Crop the df_label to keep valuable information
    ind_col_head, ind_col_tail = np.where(df_label.columns == 'Time')[0]
    df_label = df_label.iloc[index_head+1:index_tail]
    df_label = df_label.iloc[:, ind_col_head:ind_col_tail]

    # Change the duplicate header names (eg: Int to Int_parent and Int_child)
    # A fresh list of names is assigned instead of mutating the Index buffer.
    column_names = df_label.columns.tolist()
    for base in ('Int', 'Aff'):
        positions = [pos for pos, col_name in enumerate(column_names) if col_name == base]
        column_names[positions[0]] = base + '_parent'
        column_names[positions[1]] = base + '_child'
    df_label.columns = column_names

    # Preprocess the video:
    cap = cv2.VideoCapture(path_video)
    if not cap.isOpened():
        cap.release()
        print("Cannot open the video: {}".format(path_video))
        break
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        size = (frame_width, frame_height)
        num_frame = 0

        for index, row in df_label.iterrows():
            # initial set up for interval
            # The formula treats .minute as seconds and .hour as minutes.
            start_time = row['Time']
            frame_start = start_time.minute * fps + start_time.hour * 60 * fps
            frame_end = 10 * fps + frame_start
            temp_list_frames = []
            # Crop the corresponding video; the frames are buffered because
            # the same clip may be written once per label below.
            while num_frame <= frame_end:
                success, frame = cap.read()
                num_frame += 1
                if not success:
                    break
                temp_list_frames.append(frame)
            # Check the labels of this video and save it in the corresponding folder
            labels = [i for i in row.index if row[i]==1]
            for label in labels:
                dir_save_video = os.path.join(
                    dir_dataset, 'Data_processed', str(label), os.path.dirname(name)
                )
                os.makedirs(dir_save_video, exist_ok=True)
                # The clip number is the count of files already in the folder.
                name_save_video = '{}_{}.mp4'.format(num_video, len(os.listdir(dir_save_video)))
                path_save_video = os.path.join(dir_save_video, name_save_video)
                videoWriter = cv2.VideoWriter(path_save_video, fourcc, fps, size)
                try:
                    for frame in temp_list_frames:
                        videoWriter.write(frame)
                finally:
                    videoWriter.release()
    finally:
        # The capture was never released before; free it after every video.
        cap.release()
    


In [None]:
# Distinct file names (without folders) among the short videos.
np.unique(list(map(os.path.basename, df_short_videos['name'])))

array(['Following Instructions1.MP4', 'Following Instructions2.MP4',
       'Home_Following Instructions_Clean Up.MP4',
       'Home_Following Instructions_Clean Up.mp4',
       'Home_Following Instructions_Freely.MP4',
       'Home_Following Instructions_Freely.mpg',
       'Home_Following Instructions_List.MP4', 'Home_Playing Alone.mp4',
       'Home_Playing Alone1.MP4', 'Home_Playing Alone2.MP4',
       'Home_Playtime.mp4', 'Home_Playtime_Casual Interaction.MP4',
       'Home_Playtime_New Toys.MP4',
       'Home_Playtime_Physical Activity.MP4',
       'Home_Playtime_Physical Activity.mpg',
       'Hospital_Following Instructions_Clean Up.MP4',
       'Hospital_Following Instructions_Freely.MP4',
       'Hospital_Following Instructions_List.MP4',
       'Hospital_Playing Alone 1.MP4', 'Hospital_Playing Alone 2.MP4',
       'Hospital_Playtime_Casual Interaction.MP4',
       'Hospital_Playtime_New Toys.MP4', 'Hospital_Playtime_New toys.MP4',
       'Hospital_Playtime_Physical Activity 

In [None]:
# Names of the short videos.
df_short_videos['name']

Unnamed: 0,name,duration
92,20th\BAM\Hospital_Playtime_New Toys.MP4,2.013122
86,20th\BAM\Hospital_Following Instructions_Clean...,2.044264
120,21th\KSJ\Hospital_Playtime_New Toys.MP4,2.046489
157,23th\JBH\Hospital_Following Instructions_Clean...,2.064284
115,21th\KSJ\Hospital_Following Instructions_Clean...,2.074851
...,...,...
180,5th\LCD\Home_Playtime.mp4,5.373702
123,21th\LJH\Home_Following Instructions_Freely.MP4,5.761867
87,20th\BAM\Hospital_Following Instructions_Freel...,6.006556
80,19th\KDH\Home_Playing Alone1.MP4,7.558940


In [None]:
# Short videos whose name contains "Alone" (case-sensitive).
list(filter(lambda video_name: 'Alone' in video_name, df_short_videos['name']))

['5th\\LCD\\Home_Playing Alone.mp4',
 '19th\\KDH\\Home_Playing Alone2.MP4',
 '20th\\BAM\\Hospital_Playing Alone 1.MP4',
 '20th\\BAM\\Hospital_Playing Alone 2.MP4',
 '19th\\KDH\\Home_Playing Alone1.MP4']

In [None]:
# Sum of the duration column of the filtered video table.
df_videos['duration'].sum()

1567.6955324296296