# Step 0: Segment original FOS dataset to the 10s video clips

I have modeified "seg_to_10s_av_clips.py", just run the code "seg_to_10s_av_clips.py" to segment the original FOS dataset to the 10s video clips. Keep the corresponding labels in the CSV file.

# Step 1: Extract audio track and image frames from the video

## Step 1.1: build the 10s video clips list in CSV format

In [1]:
import numpy as np
import pandas as pd
import os
from ast import literal_eval

In [2]:
df_10s = pd.read_csv(r'10s_av_clips_with_labels.csv')
df_paths_clip = pd.DataFrame([os.path.normpath(path) for path in df_10s['path'].values], columns=['path'])
df_paths_clip.to_csv('paths_clip.csv', header=False, index=False)
df_paths_clip = pd.read_csv('paths_clip.csv')
df_paths_clip

Unnamed: 0,C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\Data_10s_clips\20th_BAM_Hospital_Playtime_New Toys_0.mp4
0,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
1,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
2,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
3,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
4,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
...,...
8102,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
8103,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
8104,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...
8105,C:\Users\36394\Study\GWU\PHD in Biomedical Eng...


## Step 1.2: extract audio track and image frames from the videos dataset
I have modified the
"\src\preprocess\extract_audio.py" and "\src\preprocess\extract_video_frame.py"
to fit our FOS dataset.
In the terminal, run the following command to extract audio track and image frames from the video:
```bash
python src/preprocess/extract_audio.py
  -input_file_list "C:\Users\36394\PycharmProjects\cav-mae\zzh_code\paths_clip.csv"
  -target_fold "C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\cav_mae_processed_data\audio_10s"

python src/preprocess/extract_video_frame.py
  -input_file_list "C:\Users\36394\PycharmProjects\cav-mae\zzh_code\paths_clip.csv"
  -target_fold "C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\cav_mae_processed_data\RGB_10_frames"
```

# Step 2: Determine the used labels and segment to the train adn validation sets

## Step 2.1: Determine the used labels

In [3]:
# The whole FOS labels
dict_FOS_labels = {
    'AD': 'Adersive demand',
    'AV': 'Appropriate verbal interactions',
    'Aff_child': 'Children affection',
    'Aff_parent': 'Parent affection',
    'C+': 'Positive contact',
    'C-': 'Negative contact',
    'CP': 'Complaint',
    'EA': 'Engaged activity of play',
    'Int_child': 'Children interrupt',
    'Int_parent': 'Parent interrupt',
    'MI': 'Mild instruction',
    'NC': 'Non-compliance',
    'O': 'Oppositional',
    'P': 'Praise',
    'PN': 'Physical negative',
    'Q+': 'Positive question',
    'Q-': 'Negative question',
    'S+': 'Positive social attention',
    'S-': 'Negative social attention',
    'SI+': 'Positive specific instruction',
    'SI-': 'Negative specific instruction',
    'VI+': 'Positive vague instruction',
    'VI-': 'Negative vague instruction'
}
labels = list(dict_FOS_labels.keys())
# Create label dataframe
df_FOS_label = pd.DataFrame({
    'index': list(range(0, len(dict_FOS_labels))),  # 0-based index
    'mid': list(dict_FOS_labels.keys()),  # The IS code in the FOS dataset
    'display_name': list(dict_FOS_labels.values())  # The IS in the FOS dataset
})
# Store and display the label dataframe
df_FOS_label.to_csv('FOS_label.csv', index=False)
df_FOS_label

Unnamed: 0,index,mid,display_name
0,0,AD,Adersive demand
1,1,AV,Appropriate verbal interactions
2,2,Aff_child,Children affection
3,3,Aff_parent,Parent affection
4,4,C+,Positive contact
5,5,C-,Negative contact
6,6,CP,Complaint
7,7,EA,Engaged activity of play
8,8,Int_child,Children interrupt
9,9,Int_parent,Parent interrupt


In [4]:
# Only keep the labels that have more than 100 occurrences
labels_used = [] # store the labels that have more than 100 occurrences
print("The number of labels we have is {}.".format(len(labels)))
print()

print("The counts of each label are:")
for index, label in enumerate(labels):
    df_10s[label] = df_10s.apply(lambda row: 1 if label in literal_eval(row['labels']) else 0, axis=1)
    occurrences = np.sum(df_10s[label] == 1)
    if occurrences > 100:
        labels_used.append(label)

    if (index+1) % 4 == 0:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\n")
    else:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\t")

print("\n")
print("The number of labels we used is {}.".format(len(labels_used)))
print("The labels we used are {}.".format(labels_used))
print("The number of labels we dropped is {}.".format(len(labels) - len(labels_used)))
print("The labels we dropped are {}.".format(list(set(labels) - set(labels_used))))

The number of labels we have is 23.

The counts of each label are:
        AD: 41        	        AV: 1464      	 Aff_child: 24        	Aff_parent: 329       
        C+: 2223      	        C-: 15        	        CP: 178       	        EA: 3630      
 Int_child: 1         	Int_parent: 1         	        MI: 185       	        NC: 150       
         O: 2511      	         P: 332       	        PN: 72        	        Q+: 1586      
        Q-: 4         	        S+: 5086      	        S-: 13        	       SI+: 799       
       SI-: 13        	       VI+: 2983      	       VI-: 20        	

The number of labels we used is 13.
The labels we used are ['AV', 'Aff_parent', 'C+', 'CP', 'EA', 'MI', 'NC', 'O', 'P', 'Q+', 'S+', 'SI+', 'VI+'].
The number of labels we dropped is 10.
The labels we dropped are ['SI-', 'Int_child', 'AD', 'C-', 'Q-', 'Int_parent', 'Aff_child', 'S-', 'PN', 'VI-'].


## Step 2.2: Segment the dataset to the train and validation sets

In [5]:
# Split the dataset into training and testing dataset. multi-label iterative stratified sampling
from skmultilearn.model_selection import iterative_train_test_split

# extract the features and used labels from the original csv to numpy arrays
Y = df_10s[labels_used].to_numpy()
X = df_10s.iloc[:, :2].to_numpy()

# split the dataset into training and testing dataset
X_train, Y_train, X_test, Y_test = iterative_train_test_split(X, Y, test_size = 0.2)

# save the training and testing dataset into csv files
head = df_10s.columns[:2].tolist()+labels_used
df_train = pd.DataFrame(np.concatenate((X_train, Y_train), axis=1), columns=head)
df_train['used_label'] = df_train.apply(lambda row: [col for col in labels_used if row[col] == 1], axis=1)
df_validation = pd.DataFrame(np.concatenate((X_test, Y_test), axis=1), columns=head)
df_validation['used_label'] = df_validation.apply(lambda row: [col for col in labels_used if row[col] == 1], axis=1)
df_train.to_csv(r'train.csv', index=False)
df_validation.to_csv(r'validation.csv', index=False)

# print the counts of each label in the training and testing dataset
print("The counts of each label in the are train set are: ")
for index, label in enumerate(labels_used):
    occurrences = np.sum(df_train[label] == 1)

    if (index+1) % 4 == 0:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\n")
    else:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\t")
print()
print("The counts of each label in the are validation set are: ")
for index, label in enumerate(labels_used):
    occurrences = np.sum(df_validation[label] == 1)

    if (index+1) % 4 == 0:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\n")
    else:
        print("{:>10}: {:<10}".format(label, occurrences), end = "\t")

The counts of each label in the are train set are: 
        AV: 1171      	Aff_parent: 264       	        C+: 1791      	        CP: 142       
        EA: 2904      	        MI: 156       	        NC: 124       	         O: 2009      
         P: 265       	        Q+: 1264      	        S+: 4069      	       SI+: 644       
       VI+: 2396      	
The counts of each label in the are validation set are: 
        AV: 293       	Aff_parent: 65        	        C+: 432       	        CP: 36        
        EA: 726       	        MI: 29        	        NC: 26        	         O: 502       
         P: 67        	        Q+: 322       	        S+: 1017      	       SI+: 155       
       VI+: 587       	

# Step 3: Build a FOS label csv and json files for the FOS dataset.

## Step 3.1: Build a FOS label csv.

In [6]:
dict_used_label = {key: dict_FOS_labels[key] for key in labels_used if key in dict_FOS_labels}
dict_used_label

{'AV': 'Appropriate verbal interactions',
 'Aff_parent': 'Parent affection',
 'C+': 'Positive contact',
 'CP': 'Complaint',
 'EA': 'Engaged activity of play',
 'MI': 'Mild instruction',
 'NC': 'Non-compliance',
 'O': 'Oppositional',
 'P': 'Praise',
 'Q+': 'Positive question',
 'S+': 'Positive social attention',
 'SI+': 'Positive specific instruction',
 'VI+': 'Positive vague instruction'}

In [7]:
# Create label dataframe
df_FOS_used_label = pd.DataFrame({
    'index': list(range(0, len(dict_used_label))),  # 0-based index
    'mid': list(dict_used_label.keys()),  # The IS code in the FOS dataset
    'display_name': list(dict_used_label.values())  # The IS in the FOS dataset
})
# Store and display the label dataframe
df_FOS_used_label.to_csv('FOS_used_label.csv', index=False)
df_FOS_used_label

Unnamed: 0,index,mid,display_name
0,0,AV,Appropriate verbal interactions
1,1,Aff_parent,Parent affection
2,2,C+,Positive contact
3,3,CP,Complaint
4,4,EA,Engaged activity of play
5,5,MI,Mild instruction
6,6,NC,Non-compliance
7,7,O,Oppositional
8,8,P,Praise
9,9,Q+,Positive question


## Step 2.2: Build the FOS train and validation label json

In [8]:
import json, os

# Change the data dictionary if we used server to run the code
dir_10_frames = r"C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\cav_mae_processed_data\RGB_10_frames"
dir_audio = r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\cav_mae_processed_data\audio_10s'

def build_json_FOS_dataset(dir_10_frames, dir_audio, df_FOS_10s_with_label, save_path):
    # Create the empty json dataset
    json_FOS_dataset = []

    # Convert the 10s video clips with labels to the json format
    for index, row in df_FOS_10s_with_label.iterrows():
        # Get the video id, labels and the path of the audio track
        video_id = os.path.basename(row['path'][:-4])
        labels = literal_eval(row['used_label'])
        labels = ', '.join(labels)
        path_audio = os.path.join(dir_audio, video_id + '.wav')

        # Create the new entry for the json file
        new_entry = {
            'video_id': video_id,
            'wav': path_audio,
            'video_path': dir_10_frames,
            'labels': labels
        }

        # Append the new entry to the json dataset
        json_FOS_dataset.append(new_entry)
    json_FOS_dataset = {'data': json_FOS_dataset}

    # save the json file
    with open(save_path, 'w') as file:
        json.dump(json_FOS_dataset, file, indent=4)


In [9]:
# Save the FOS train and validation dataset to json files
save_path = 'FOS_train_dataset.json'
df_train = pd.read_csv(r'train.csv')
build_json_FOS_dataset(dir_10_frames, dir_audio, df_train, save_path)
save_path = 'FOS_validation_dataset.json'
df_validation = pd.read_csv(r'validation.csv')
build_json_FOS_dataset(dir_10_frames, dir_audio, df_validation, save_path)