In [1]:
import pandas as pd
import glob
import numpy as np
from tsfresh import extract_features, extract_relevant_features
%matplotlib inline

In [2]:
import os
cdir = os.getcwd()
print(cdir)
os.chdir(cdir + '/train/train')
print(os.getcwd())

/home/nabil/ml_contests/cooking_activity/cook2020_tutorials
/home/nabil/ml_contests/cooking_activity/cook2020_tutorials/train/train


In [3]:
# we will do a separate analysis for each folders, to understand which timeseries are more reliable
# we will extract as many features as possible, then try to visualize them across subjects to find correlation
# we will do a subject wise nested CV to understand, if the models generalize across subjects

In [4]:
# there are 3 subjects
subject1 = {}
subject2 = {}
subject3 = {}

In [5]:
# we will load every single right arm data, separate based on subject id, do feature extraction, run t-SNE

data_folder = 'right_arm/*.csv' 
files = glob.glob(data_folder)

subject1['right_arm'] = pd.DataFrame()
subject2['right_arm'] = pd.DataFrame()
subject3['right_arm'] = pd.DataFrame()


# id is generated by concatenating INT(subject+trial)

for f in files:
    #print(f)
    if f.split('/')[1].split('_')[0] == 'subject1':
        
        ra = pd.read_csv(f)
        ra['id'] = int(f.split('/')[1].split('_')[0][-1] + f.split('/')[1].split('_')[2].split('.')[0])
        #print(ra['id'])
        
        if len(subject1['right_arm']) > 0:
            subject1['right_arm'] = subject1['right_arm'].append(ra)
        else:
            subject1['right_arm'] = ra
            
    elif f.split('/')[1].split('_')[0] == 'subject2':
        
        ra = pd.read_csv(f)
        ra['id'] = int(f.split('/')[1].split('_')[0][-1] + f.split('/')[1].split('_')[2].split('.')[0])
        #print(ra['id'])
        
        if len(subject2['right_arm']) > 0:
            subject2['right_arm'] = subject2['right_arm'].append(ra)
        else:
            subject2['right_arm'] = ra
            
    elif f.split('/')[1].split('_')[0] == 'subject3':
        
        ra = pd.read_csv(f)
        ra['id'] = int(f.split('/')[1].split('_')[0][-1] + f.split('/')[1].split('_')[2].split('.')[0])
        #print(ra['id'])

        
        if len(subject3['right_arm']) > 0:
            subject3['right_arm'] = subject3['right_arm'].append(ra)
        else:
            subject3['right_arm'] = ra
    else:
        print('either new subject or a bug')

In [6]:
# let's see what have we got
print(subject1['right_arm'].head())
print(subject2['right_arm'].head())
print(subject3['right_arm'].head())

print(subject1['right_arm'].tail())
print(subject2['right_arm'].tail())
print(subject3['right_arm'].tail())

        X       Y       Z  timestamp    id
0  0.1169  0.0682 -0.1362        118  1320
1  0.0331 -0.0348 -0.0791        138  1320
2 -0.0002 -0.1512 -0.1131        159  1320
3 -0.0360  0.0065 -0.1294        178  1320
4 -0.1051  0.0081 -0.0464        198  1320
        X       Y       Z  timestamp    id
0 -0.1340 -0.0097  0.2692         13  2650
1  0.0661  0.1543  0.0914         33  2650
2  0.2757  0.1176 -0.0744         54  2650
3  0.2641 -0.0109  0.0946         73  2650
4  0.2336 -0.1483  0.3131         93  2650
        X       Y       Z  timestamp    id
0  0.8023  0.1439 -0.5614       1023  3949
1 -0.0208 -0.3623 -0.8656       1024  3949
2 -0.3820 -0.1537 -1.3098       1025  3949
3 -0.0498 -0.7276 -1.1890       1026  3949
4  0.1985 -0.4648 -0.8981       1027  3949
           X       Y       Z  timestamp    id
1445 -0.0725  0.4003 -0.3416      29899  1243
1446  0.1007  0.4282 -0.3135      29920  1243
1447 -0.0187  0.4065 -0.1889      29939  1243
1448 -0.2578  0.3971 -0.0402      29960  1

In [7]:
labels = pd.read_csv("labels.txt", sep=' ', header=None)
print(labels.head())
labels = labels[0].str.split(",", n=2, expand=True)
labels.columns = ['file_id', 'macro', 'micro'] #give names to the columns
labels.index = labels['file_id'] #use the file id as index to make it searchable by file_id
print(labels.head())

                                           0
0           subject2_file_457,sandwich,Take,
1      subject2_file_679,sandwich,Wash,Take,
2        subject2_file_95,sandwich,Cut,Wash,
3  subject2_file_899,sandwich,other,Cut,Put,
4            subject2_file_368,sandwich,Put,
                             file_id     macro           micro
file_id                                                       
subject2_file_457  subject2_file_457  sandwich           Take,
subject2_file_679  subject2_file_679  sandwich      Wash,Take,
subject2_file_95    subject2_file_95  sandwich       Cut,Wash,
subject2_file_899  subject2_file_899  sandwich  other,Cut,Put,
subject2_file_368  subject2_file_368  sandwich            Put,


In [8]:
labels[labels['file_id']=='subject1_file_11']

Unnamed: 0_level_0,file_id,macro,micro
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
subject1_file_11,subject1_file_11,sandwich,"other,Put,"


In [9]:
# label generation
subject1['label_mac'] = pd.DataFrame()
subject2['label_mac'] = pd.DataFrame()
subject3['label_mac'] = pd.DataFrame()

for i in range(len(labels)):
    #print(labels.iloc[i]['file_id'])
    #print(labels.iloc[i]['file_id'].split('_')[0][-1])
    tid = int(labels.iloc[i]['file_id'].split('_')[0][-1] + labels.iloc[i]['file_id'].split('_')[-1])
    #print(tid)
    label = labels.iloc[i]['macro']
    #print(label)
    if labels.iloc[i]['file_id'].split('_')[0][-1] == '1':
        subject1['label_mac'] = subject1['label_mac'].append(pd.DataFrame({"id":[tid], "label":[label]}))
    elif labels.iloc[i]['file_id'].split('_')[0][-1] == '2':
        subject2['label_mac'] = subject2['label_mac'].append(pd.DataFrame({"id":[tid], "label":[label]}))
    elif labels.iloc[i]['file_id'].split('_')[0][-1] == '3':
        subject3['label_mac'] = subject3['label_mac'].append(pd.DataFrame({"id":[tid], "label":[label]}))
    else:
        print('some bug')

In [10]:
subject1['label_mac']

Unnamed: 0,id,label
0,1622,sandwich
0,1869,sandwich
0,1807,sandwich
0,1463,sandwich
0,1830,sandwich
...,...,...
0,1770,cereal
0,1211,cereal
0,1713,cereal
0,1462,cereal


In [11]:
subject2['label_mac']

Unnamed: 0,id,label
0,2457,sandwich
0,2679,sandwich
0,295,sandwich
0,2899,sandwich
0,2368,sandwich
...,...,...
0,2616,cereal
0,2642,cereal
0,2721,cereal
0,2199,cereal


In [12]:
subject3['label_mac']

Unnamed: 0,id,label
0,3867,sandwich
0,3537,sandwich
0,3753,sandwich
0,3335,sandwich
0,3718,sandwich
...,...,...
0,3557,cereal
0,3977,cereal
0,3402,cereal
0,3854,cereal


In [13]:
# subject wise distribution
print(len(set(subject1['right_arm']['id'])))
print(len(set(subject2['right_arm']['id'])))
print(len(set(subject3['right_arm']['id'])))

80
105
103


### TO-DO
placing missing values

In [14]:
subject1['right_arm']

Unnamed: 0,X,Y,Z,timestamp,id
0,0.1169,0.0682,-0.1362,118,1320
1,0.0331,-0.0348,-0.0791,138,1320
2,-0.0002,-0.1512,-0.1131,159,1320
3,-0.0360,0.0065,-0.1294,178,1320
4,-0.1051,0.0081,-0.0464,198,1320
...,...,...,...,...,...
1445,-0.0725,0.4003,-0.3416,29899,1243
1446,0.1007,0.4282,-0.3135,29920,1243
1447,-0.0187,0.4065,-0.1889,29939,1243
1448,-0.2578,0.3971,-0.0402,29960,1243


In [15]:
sub1_label_dict = {}

for i in range(len(subject1['label_mac'])):
    tid = subject1['label_mac'].iloc[i]['id']
    label = subject1['label_mac'].iloc[i]['label']
    sub1_label_dict[tid] = label
print(sub1_label_dict)

{1622: 'sandwich', 1869: 'sandwich', 1807: 'sandwich', 1463: 'sandwich', 1830: 'sandwich', 1272: 'sandwich', 1382: 'sandwich', 1588: 'sandwich', 1140: 'sandwich', 1213: 'sandwich', 1685: 'sandwich', 1389: 'sandwich', 1247: 'sandwich', 1808: 'sandwich', 1238: 'sandwich', 1985: 'sandwich', 1499: 'sandwich', 1980: 'sandwich', 1310: 'sandwich', 1793: 'sandwich', 1618: 'sandwich', 1780: 'sandwich', 1313: 'sandwich', 1958: 'sandwich', 198: 'sandwich', 1424: 'sandwich', 1964: 'sandwich', 1884: 'sandwich', 1263: 'sandwich', 1784: 'sandwich', 1684: 'sandwich', 1968: 'sandwich', 1786: 'sandwich', 1243: 'sandwich', 1957: 'sandwich', 1319: 'sandwich', 1853: 'sandwich', 111: 'sandwich', 1814: 'sandwich', 1873: 'fruitsalad', 1513: 'fruitsalad', 1721: 'fruitsalad', 1204: 'fruitsalad', 1768: 'fruitsalad', 1254: 'fruitsalad', 1536: 'fruitsalad', 196: 'fruitsalad', 137: 'fruitsalad', 1281: 'fruitsalad', 1849: 'fruitsalad', 1485: 'fruitsalad', 1496: 'fruitsalad', 1564: 'fruitsalad', 1871: 'fruitsalad', 1

In [16]:
macro_map = {'sandwich':0, 'fruitsalad':1, 'cereal':2}

In [17]:
# conversion phase for getting relevant features
subject1['right_arm'] = subject1['right_arm'].sort_values(by=['id', 'timestamp'])
print(subject1['right_arm'])

id_set = sorted(list(set(subject1['right_arm']['id'])))


#print(id_set)
sub1_lab_mac = []

cnt = 0

for tid in id_set:
    subject1['right_arm']['id'].replace(tid, cnt, inplace = True)
    sub1_lab_mac.append(macro_map[sub1_label_dict[tid]])
    cnt+=1

sub1_lab_mac = pd.Series(sub1_lab_mac)
print(sub1_lab_mac)


sub1_features = extract_relevant_features(subject1['right_arm'], sub1_lab_mac,
                                          column_id='id', column_sort='timestamp')

           X       Y       Z  timestamp    id
0     0.0816 -0.3178  0.4107         18   111
1     0.1532 -0.2166  0.3348         40   111
2     0.2534 -0.1508  0.2158         59   111
3     0.2679 -0.2268  0.3008         79   111
4     0.2211 -0.0548  0.2310         99   111
...      ...     ...     ...        ...   ...
1373 -0.1092 -0.0142 -0.0118      29900  1985
1374  0.2264  0.1388 -0.0673      29920  1985
1375  0.5733  0.2901 -0.2565      29941  1985
1376  0.6581  0.3253 -0.2315      29960  1985
1377  0.7201  0.1352 -0.1009      29980  1985

[104701 rows x 5 columns]
0     0
1     1
2     2
3     2
4     2
     ..
75    2
76    2
77    2
78    0
79    0
Length: 80, dtype: int64


Feature Extraction: 100%|██████████| 30/30 [02:29<00:00,  4.97s/it]


In [21]:
sub1_features

variable
id
0
1
2
3
4
...
75
76
77
78


In [20]:
len(subject3['right_arm'])

121732