In [None]:
import os, cv2, copy
import numpy as np
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
from pytorchvideo.data.encoded_video import EncodedVideo

In [None]:
# Define the dataset folder.
# Both directories live under one dataset root. The root can be overridden with the
# AUTISM_DATASET_ROOT environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used.
dataset_root = os.environ.get(
    'AUTISM_DATASET_ROOT',
    r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset',
)
# Source videos, walked recursively with one sub-folder per class name.
dir_10s = os.path.join(dataset_root, 'Data_processed')
# Output location: 'train' and 'val' folders are created under this directory.
dir_vision = os.path.join(dataset_root, 'Vision_dataset')
# Class labels; each one is also the name of a sub-folder of dir_10s.
names_vision = ['C+', 'C-', 'PN', 'EA']

In [None]:
# Index every video under the class folders that pytorchvideo is able to open.
#
# Result:
#   df_vision  - one row per readable video: subject_name, video_path, label
#   bad_videos - paths (relative to 'Data_processed') of files that failed to open
#
# Rows are gathered in a list and the DataFrame is built once at the end; appending
# with df.loc[len(df)] copies the frame on every insert.
vision_columns = ['subject_name', 'video_path', 'label']
vision_records = []
bad_videos = []
for name in names_vision:
    for path, dir_list, file_list in os.walk(os.path.join(dir_10s, name)):
        # Folder path relative to the 'Data_processed' root (keeps the leading separator).
        subject_name = path.split('Data_processed')[1]
        for file_name in file_list:
            video_path = os.path.join(path, file_name)
            try:
                video = EncodedVideo.from_path(video_path)
            except Exception:
                # Only real errors are treated as "bad video"; KeyboardInterrupt and
                # SystemExit still propagate so a long scan can be interrupted.
                path_bad_video = video_path.split('Data_processed')[1]
                print("Bad video: {}, We can't open it!".format(path_bad_video))
                bad_videos.append(path_bad_video)
                continue
            # The video was opened only to validate it; release the decoder right away
            # instead of keeping one open handle per file.
            video.close()
            vision_records.append([subject_name, video_path, name])

df_vision = pd.DataFrame(vision_records, columns=vision_columns)
            

In [None]:
# Sanity check: (number of files that failed to open, number of indexed videos).
(len(bad_videos), len(df_vision))

In [None]:
# Show how many videos each class has, then slice out one DataFrame per class.
print(df_vision['label'].value_counts())
df_C_plus = df_vision.loc[df_vision['label'].eq('C+')]
df_C_minus = df_vision.loc[df_vision['label'].eq('C-')]
df_PN = df_vision.loc[df_vision['label'].eq('PN')]
df_EA = df_vision.loc[df_vision['label'].eq('EA')]

In [None]:
# Handle the data imbalance problem:
#   EA - randomly drop a fixed number of rows (undersampling)
#   C+ - kept unchanged
#   C- - every row repeated C_MINUS_REPEATS times (oversampling)
#   PN - every row repeated PN_REPEATS times (oversampling)
#
# The counts below were hand-tuned for one particular dataset snapshot; check the
# value_counts printed by the previous cell before reusing them.
BALANCE_SEED = 42       # fixed seed so the same EA rows are dropped on every run
EA_REMOVE_N = 1461      # number of EA rows to discard
C_MINUS_REPEATS = 63    # replication factor for C-
PN_REPEATS = 30         # replication factor for PN

# Random drop some samples for EA class.
# A dedicated RandomState keeps this cell reproducible and re-runnable without
# touching NumPy's global random state. choice() raises ValueError if EA has fewer
# than EA_REMOVE_N rows.
balance_rng = np.random.RandomState(BALANCE_SEED)
remove_n = EA_REMOVE_N
drop_indices = balance_rng.choice(df_EA.index, remove_n, replace=False)
df_EA_modified = df_EA.drop(drop_indices)

# Keep the C+ class as it is.
df_C_plus_modified = df_C_plus

# Oversample the C- and PN classes by plain row repetition.
# NOTE(review): the repeated rows are exact duplicates of the same video, so any later
# train/val split must keep all copies of a video on the same side to avoid leakage.
df_C_minus_modified = pd.concat([df_C_minus] * C_MINUS_REPEATS, ignore_index=True)
df_PN_modified = pd.concat([df_PN] * PN_REPEATS, ignore_index=True)

df_vision_modified = pd.concat(
    [df_C_plus_modified, df_C_minus_modified, df_PN_modified, df_EA_modified],
    ignore_index=True,
)
print(df_vision_modified.label.value_counts())

In [None]:
# Create the output folders; makedirs also creates dir_vision itself when missing and
# does nothing when the folders already exist.
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')
os.makedirs(dir_train, exist_ok=True)
os.makedirs(dir_val, exist_ok=True)

# Split the dataset into train and val based on the label.
#
# df_vision_modified contains oversampled (duplicated) rows. Splitting the rows directly
# would put copies of the same video into both train and val, which inflates the
# validation score. So the stratified split is done on the unique videos, and every
# (possibly repeated) row then follows its video into the matching split.
# Stratification needs at least two unique videos per class; train_test_split raises
# ValueError otherwise.
unique_videos = df_vision_modified.drop_duplicates(subset='video_path')
videos_train, videos_val = train_test_split(
    unique_videos['video_path'],
    test_size=0.2,
    stratify=unique_videos['label'],
    random_state=42,
)
in_train = df_vision_modified['video_path'].isin(videos_train)
in_val = df_vision_modified['video_path'].isin(videos_val)
# Shuffle the rows (seeded) so the CSVs are not grouped by class.
df_train = df_vision_modified[in_train].sample(frac=1, random_state=42)
df_val = df_vision_modified[in_val].sample(frac=1, random_state=42)

# Invariants: no video appears in both splits, and no row was lost.
assert set(df_train['video_path']).isdisjoint(df_val['video_path'])
assert len(df_train) + len(df_val) == len(df_vision_modified)

print(df_train.label.value_counts())
print(df_val.label.value_counts())

# Store the train and val dataset (written to the current working directory).
df_train.to_csv('train.csv', index=False)
df_val.to_csv('val.csv', index=False)

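# Old way to split the dataset: the cells below work on the video files directly (copying them into per-class train/val folders) instead of writing CSV lists.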

In [None]:
# Measure the duration (in seconds) of every video that OpenCV can open.
# Result: `durations`, a flat list with one float per readable video across all classes.
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')
os.makedirs(dir_train, exist_ok=True)
os.makedirs(dir_val, exist_ok=True)

durations = []
for name in names_vision:
    # Per-class output folders are created here as well, as in the original cell.
    dir_target_train = os.path.join(dir_train, name)
    dir_target_val = os.path.join(dir_val, name)
    os.makedirs(dir_target_train, exist_ok=True)
    os.makedirs(dir_target_val, exist_ok=True)

    # Every file below the class folder, at any depth.
    paths_video = [
        os.path.join(path, file_name)
        for path, dir_list, file_list in os.walk(os.path.join(dir_10s, name))
        for file_name in file_list
    ]

    for path_video in paths_video:
        cap = cv2.VideoCapture(path_video)
        try:
            if not cap.isOpened():
                continue
            # Named property ids instead of the bare numbers 5 and 7.
            rate = cap.get(cv2.CAP_PROP_FPS)
            frame_number = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            if rate > 0:
                durations.append(frame_number / rate)
            else:
                # Some containers report an FPS of 0; dividing would raise
                # ZeroDivisionError, so the file is skipped and reported.
                print('Skipped (FPS reported as 0): {}'.format(path_video))
        finally:
            # Always free the decoder, otherwise one handle per video stays open.
            cap.release()


In [None]:
from sklearn.model_selection import train_test_split

# Copy-based split: for each class, send 80% of the source videos to
# <dir_vision>/train/<class> and 20% to <dir_vision>/val/<class>.
# Copies are renamed '<class>_<n>.mp4', where n is the position within the split.
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')
os.makedirs(dir_train, exist_ok=True)
os.makedirs(dir_val, exist_ok=True)

for name in names_vision:
    dir_target_train = os.path.join(dir_train, name)
    dir_target_val = os.path.join(dir_val, name)
    os.makedirs(dir_target_train, exist_ok=True)
    os.makedirs(dir_target_val, exist_ok=True)

    # Sorted so the input order (and therefore the seeded split) does not depend on
    # the order in which the file system lists the files.
    paths_video = sorted(
        os.path.join(path, file_name)
        for path, dir_list, file_list in os.walk(os.path.join(dir_10s, name))
        for file_name in file_list
    )
    if len(paths_video) < 2:
        # train_test_split cannot split fewer than two samples.
        print('Skipping class {}: only {} video(s) found'.format(name, len(paths_video)))
        continue

    # Fixed seed: re-running the cell rewrites the same files instead of producing a
    # different train/val assignment on top of the previous one.
    paths_train, paths_val = train_test_split(paths_video, test_size=0.2, random_state=42)
    for num, path_train in enumerate(paths_train):
        shutil.copyfile(path_train, os.path.join(dir_target_train, '{}_{}.mp4'.format(name, num)))
    for num, path_val in enumerate(paths_val):
        shutil.copyfile(path_val, os.path.join(dir_target_val, '{}_{}.mp4'.format(name, num)))

In [3]:
from sklearn.model_selection import train_test_split

# Copy-based split (same procedure as the previous cell): per class, 80% of the source
# videos are copied to <dir_vision>/train/<class> and 20% to <dir_vision>/val/<class>,
# renamed to '<class>_<n>.mp4'.
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')
for split_dir in (dir_train, dir_val):
    os.makedirs(split_dir, exist_ok=True)


def _copy_numbered(paths, target_dir, prefix):
    """Copy each file in `paths` to `target_dir` as '<prefix>_<position>.mp4'."""
    for num, src in enumerate(paths):
        shutil.copyfile(src, os.path.join(target_dir, '{}_{}.mp4'.format(prefix, num)))


for name in names_vision:
    dir_target_train = os.path.join(dir_train, name)
    dir_target_val = os.path.join(dir_val, name)
    os.makedirs(dir_target_train, exist_ok=True)
    os.makedirs(dir_target_val, exist_ok=True)

    # Collect every file below the class folder. Sorting makes the seeded split
    # independent of the directory listing order.
    paths_video = []
    for path, dir_list, file_list in os.walk(os.path.join(dir_10s, name)):
        paths_video.extend(os.path.join(path, file_name) for file_name in file_list)
    paths_video.sort()

    if len(paths_video) < 2:
        # train_test_split needs at least two samples to form both splits.
        print('Skipping class {}: only {} video(s) found'.format(name, len(paths_video)))
        continue

    # Seeded, so running this cell again reproduces exactly the same assignment and
    # simply overwrites the existing copies.
    paths_train, paths_val = train_test_split(paths_video, test_size=0.2, random_state=42)
    _copy_numbered(paths_train, dir_target_train, name)
    _copy_numbered(paths_val, dir_target_val, name)