Objective: convert the data previously used for BERT encoding into one-hot encoded data for the following features:

1. Role
2. Talking
3. Seating position

In [1]:
# Print the kernel's current working directory (shell command via IPython `!`).
!pwd

/c/Users/990924


In [2]:
import os

# Root folder of the dataset. The notebook later changes into this directory
# and addresses every session folder relative to it.
# The path is machine-specific, so it can be overridden by setting the
# DATASET_DIR environment variable; when unset (or empty) the original
# location is used.
DATASET_DIR = os.environ.get("DATASET_DIR") or "C:/Users/990924/OneDrive/code/GNN_TGN_V3/data/DataV3"

In [3]:
import os
# Work from inside the dataset folder: the cells below list session folders
# with os.listdir(".") and build relative paths from them.
os.chdir(DATASET_DIR)
# Confirm the working directory actually changed.
!pwd

/c/Users/990924/OneDrive/code/GNN_TGN_V3/data/DataV3


In [None]:
import numpy as np
import pandas as pd
# from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import OneHotEncoder

from _date_prep_onehot import *

In [176]:
# one hot encoding
# The position of a label in each list is the index of its indicator column
# produced by onehot_t1 (e.g. 'student' -> `src_type-0`), so do not reorder.
ROLES = [
    'student',
    'teacher',
    'musician',
    'music teacher',
]
POSITION = [
    'nearby',
    'away from',
    'opposite to',
]

In [149]:
def itemize_df(session_df):
    """Run a session frame through the itemizing helpers, in a fixed order.

    The frame is passed to ``_role_str2list``, ``_talking_str2list``,
    ``_position_str2list`` and finally ``_rename_df``; each helper receives the
    previous helper's return value.

    NOTE(review): the helpers are not defined in this notebook. Judging by
    their names they presumably parse the role/talking/position text fields
    and rename columns -- confirm against ``_date_prep_onehot``. Also note that
    ``from _date_prep_onehot import *`` only exports underscore-prefixed names
    if that module lists them in ``__all__`` -- verify on a fresh kernel.

    Args:
        session_df: Frame loaded from a session's raw feature CSV.

    Returns:
        The value returned by ``_rename_df`` at the end of the chain.
    """
    pipeline = (
        _role_str2list,
        _talking_str2list,
        _position_str2list,
        _rename_df,
    )
    for step in pipeline:
        session_df = step(session_df)
    return session_df

In [153]:
def onehot_t1(session_df):
    """One-hot encode the role and position columns of an itemized session frame.

    Steps:
      1. Drop the unused ``angle_w`` column.
      2. Encode ``src_type`` and ``dst_type`` against ``ROLES`` and
         ``position_type`` against ``POSITION``. Each source column is replaced
         by indicator columns named ``<column>-<k>``, where ``k`` is the
         category's position in the corresponding list.
      3. Pass the resulting frame through ``_talking_enc_t1`` (defined outside
         this notebook).

    Args:
        session_df: DataFrame containing at least the columns ``angle_w``,
            ``src_type``, ``dst_type`` and ``position_type``.

    Returns:
        Whatever ``_talking_enc_t1`` returns for the encoded frame.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: Raised by ``OneHotEncoder`` when a column holds a value
            that is not in its category list (default ``handle_unknown``).
    """
    session_df = session_df.drop(['angle_w'], axis=1)  # drop unused cols

    column_categories = (
        ('src_type', ROLES),
        ('dst_type', ROLES),
        ('position_type', POSITION),
    )

    indicator_frames = []
    for col, categories in column_categories:
        encoder = OneHotEncoder(categories=[categories])
        indicators = encoder.fit_transform(session_df[[col]]).toarray()
        indicator_frames.append(
            pd.DataFrame(
                indicators,
                columns=[f'{col}-{k}' for k in range(indicators.shape[1])],
                # Reuse the input's index: pd.concat(axis=1) aligns on index
                # labels, so a default RangeIndex here would misalign rows and
                # introduce NaN whenever session_df is not 0..n-1 indexed.
                index=session_df.index,
            )
        )

    # drop converted cols
    session_df = session_df.drop([col for col, _ in column_categories], axis=1)

    session_df = pd.concat([session_df, *indicator_frames], axis=1)
    session_df = _talking_enc_t1(session_df)

    return session_df

In [148]:
# Session folders sit directly under the working directory and carry "_D" in
# their name. Keep directories only (a stray file whose name contains "_D"
# would break the per-session loop) and sort the result, because os.listdir()
# returns entries in arbitrary order.
session_dirs = sorted(
    entry
    for entry in os.listdir(".")
    if "_D" in entry and os.path.isdir(entry)
)
session_dirs

['L1_D1_20220621_PM',
 'L1_D1_20220626_AM',
 'L1_D1_20220626_PM',
 'L1_D3_20220626_AM',
 'L1_D3_20220626_PM',
 'L2_D2_20220620_PM',
 'L2_D2_20220622_PM',
 'L2_D2_20220624_PM',
 'L2_D4_20220622_PM',
 'L2_D4_20220624_PM',
 'L3_D5_20220619_AM',
 'L3_D5_20220619_PM',
 'L3_D5_20220623_PM',
 'L3_D7_20220619_AM',
 'L3_D7_20220619_PM',
 'L3_D7_20220623_PM',
 'L4_D6_20220627_PM',
 'L4_D6_20220630_PM',
 'L4_D8_20220627_PM',
 'L4_D8_20220629_PM',
 'L4_D8_20220630_PM']

In [159]:
# For every session: read the raw feature CSV, write the itemized version,
# then write the one-hot (type 1) version next to it.
for sidx, session_dir in enumerate(session_dirs):
    print(f"#{sidx}: {session_dir}")

    # Input and both outputs live side by side in the session's raw folder.
    raw_dir = os.path.join(session_dir, "feat-desc_non-negative", "raw")
    csv_path = os.path.join(raw_dir, "feat-desc_non-negative.csv")
    itemized_csv = os.path.join(raw_dir, "feat-desc_non-negative-itemized.csv")
    onehot_t1_csv = os.path.join(raw_dir, "feat-desc_non-negative-onehot_t1.csv")

    # Stage 1: itemize and save.
    df = itemize_df(pd.read_csv(csv_path))
    df.to_csv(itemized_csv, index=False)

    # Stage 2: one-hot encode the itemized frame and save.
    df = onehot_t1(df)
    df.to_csv(onehot_t1_csv, index=False)

# Preview the one-hot frame of the last processed session.
df.head()

#0: L1_D1_20220621_PM
#1: L1_D1_20220626_AM
#2: L1_D1_20220626_PM
#3: L1_D3_20220626_AM
#4: L1_D3_20220626_PM
#5: L2_D2_20220620_PM
#6: L2_D2_20220622_PM
#7: L2_D2_20220624_PM
#8: L2_D4_20220622_PM
#9: L2_D4_20220624_PM
#10: L3_D5_20220619_AM
#11: L3_D5_20220619_PM
#12: L3_D5_20220623_PM
#13: L3_D7_20220619_AM
#14: L3_D7_20220619_PM
#15: L3_D7_20220623_PM
#16: L4_D6_20220627_PM
#17: L4_D6_20220630_PM
#18: L4_D8_20220627_PM
#19: L4_D8_20220629_PM
#20: L4_D8_20220630_PM


Unnamed: 0,recording,second,src_id,dst_id,speakers,src_type-0,src_type-1,src_type-2,src_type-3,dst_type-0,dst_type-1,dst_type-2,dst_type-3,position_type-0,position_type-1,position_type-2,src_talking_status,dst_talking_status,others_talking_status
0,L4_D8_20220630_PM,0.36,1,10,"[1, 0]",1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0,1
1,L4_D8_20220630_PM,0.37,1,10,"[1, 0]",1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0,1
2,L4_D8_20220630_PM,0.38,1,10,"[1, 0]",1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0,1
3,L4_D8_20220630_PM,0.64,1,10,[],1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0,0
4,L4_D8_20220630_PM,1.65,1,10,[],1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0,0


# Architecture

- `preProcess.py`: generates torch data types through `MyOwnDataset` from `dataset_new_features.py`
- `dataset_new_features.py`: defines `MyOwnDataset`; raw message generation happens in `self.process()`