In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.svm import SVC
import os.path

norm = preprocessing.MinMaxScaler()

# Users

Gender has missing values (represented as '-'); they are encoded as 0.5, halfway between 'M' (1) and 'F' (0). I then min-max normalize the columns and compute the pairwise distance matrix between users.


In [2]:
USER_FILE = "../datasets/Mobi_Users.csv"

# Load the Age/Height/Weight/Gender columns and turn the gender codes into numbers;
# '-' marks a missing gender and is mapped to the midpoint 0.5.
gender_codes, gender_values = ('M', 'F', '-'), (1, 0, 0.5)
users = pd.read_csv(USER_FILE, usecols=(3, 4, 5, 6)).replace(gender_codes, gender_values)
users.head()

Unnamed: 0,Age,Height,Weight,Gender
0,32,180,85,1.0
1,26,169,64,1.0
2,26,164,55,0.0
3,32,186,93,1.0
4,36,160,50,0.0


In [3]:
# Min-max scale the four user attributes, then compute the pairwise distance matrix between users.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_users = min_max_scaler.fit_transform(users[['Age', 'Height', 'Weight', 'Gender']])
user_distances = pairwise_distances(scaled_users)

# Data

Using the 'fall' data: 

    | 10 | FOL   | Forward-lying      | 3      | 10s      | Fall Forward from standing, use of hands to dampen fall |
    | 11 | FKL   | Front-knees-lying  | 3      | 10s      | Fall forward from standing, first impact on knees       |
    | 12 | BSC   | Back-sitting-chair | 3      | 10s      | Fall backward while trying to sit on a chair            |
    | 13 | SDL   | Sideward-lying     | 3      | 10s      | Fall sidewards from standing, bending legs              |

It seems that there are 3 files (trials) per user for each fall type, perhaps meant as train/validation/test splits.

In [4]:
# Root folder of the annotated recordings; get_sample appends "<fall>/<file name>" to it.
MOBI_PATH = "../datasets/MobiAct_Dataset_v2.0/Annotated Data/"
# Codes of the four fall types used in this notebook.
FALLS = ("FOL", "FKL", "BSC", "SDL")
# User numbers 1..67 (the upper bound of range is exclusive).
USERS = range(1,68) 

In [5]:
def get_sample(user_nbr=1, fall="FOL", trial=1):
    """Load one annotated recording and split it into features and labels.

    The file read is ``<MOBI_PATH><fall>/<fall>_<user_nbr>_<trial>_annotated.csv``.
    Only columns 2 to 11 are loaded, which skips the leading timestamp columns
    (they are identifiers, not features).

    Returns a ``(features, labels)`` pair: every loaded column except the last one,
    and the last loaded column.
    """
    file_name = "{}_{}_{}_annotated.csv".format(fall, user_nbr, trial)
    recording = pd.read_csv(MOBI_PATH + fall + '/' + file_name, usecols=range(2, 12))
    features = recording.iloc[:, :-1]
    labels = recording.iloc[:, -1]
    return features, labels

## Example for user 1, type FOL

In [6]:
X, Y = get_sample()

In [7]:
Y.head()

0    STD
1    STD
2    STD
3    STD
4    STD
Name: label, dtype: object

## tentative de classif par timestamp, juste pour vérifier

(un exemple correspond à un relevé à un instant, c'est-à-dire un item dans la série...)

In [8]:
class_svc = SVC()
class_svc.fit(X,Y)
class_svc.score(X,Y)

0.99949315762797775

In [9]:
class_svc.score(*get_sample(trial=2)), class_svc.score(*get_sample(trial=3)) 

(0.24632539280283833, 0.27456940222897669)

## Let's subsample 

On prend aléatoirement, de façon uniforme, 30 items dans [STD], 30 items dans [LYI], et 30 items dans [STD,FOL,LYI] (10 dans chaque partie).
On calcule mean, median, std, min, max, kurtosis, skew.

In [10]:
def subsample_one(data, list_index, part="STD", subsize=30):
    """Summarise a random draw of rows that all carry the label ``part``.

    ``list_index[part]`` holds the row positions to draw from; ``subsize`` of them are
    picked without replacement (numpy raises ``ValueError`` if fewer are available).

    Returns a one-row DataFrame whose columns are (statistic, original column) pairs for
    mean, median, std, min, max, kurtosis and skew, followed by a final "label" column
    holding ``part``.
    """
    picked = np.random.choice(list_index[part], subsize, replace=False)
    window = data.iloc[picked]
    summaries = [
        ("mean", window.mean()),
        ("median", window.median()),
        ("std", window.std()),
        ("min", window.min()),
        ("max", window.max()),
        ("kurtosis", window.kurtosis()),
        ("skew", window.skew()),
    ]
    names = [name for name, _ in summaries]
    values = [value for _, value in summaries]
    # Stack the per-statistic series under their names, then lay them out as a single row.
    row = pd.DataFrame(pd.concat(values, keys=names)).T
    row.insert(len(row.columns), "label", part)
    return row

In [11]:
# For each label of the example recording, collect the row positions where Y has that label.
list_index = dict()
parts = ("STD", "FOL", "LYI")
for p in parts:
    list_index[p] = np.argwhere(Y==p)[:,0]
# One summary row computed from a random draw of "STD" rows (the default part).
subsample_one(X, list_index)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.958048,-9.551108,-1.339026,0.002026,0.027326,0.022633,14.291038,24.694449,-34.133662,0.985519,...,-1.489664,-1.420009,0.612114,-0.37614,0.232785,0.512152,-1.067463,-0.858359,-0.842114,STD


In [12]:
def subsample_one_mixed(data, list_index, parts=("STD", "FOL", "LYI"), subsize=30):
    """Summarise a random draw that mixes rows from every part of a fall.

    ``subsize // 3`` row positions are drawn without replacement from
    ``list_index[part]`` for each entry of ``parts`` (numpy raises ``ValueError`` if a
    part has fewer rows than that), and the drawn rows are pooled.

    Returns a one-row DataFrame whose columns are (statistic, original column) pairs for
    mean, median, std, min, max, kurtosis and skew, followed by a final "label" column
    holding ``parts[1]`` (the fall type).
    """
    per_part = subsize // 3
    # Draw in the order of `parts`, then pool once with pd.concat
    # (DataFrame.append no longer exists in pandas >= 2.0).
    chunks = [
        data.iloc[np.random.choice(list_index[part], per_part, replace=False)]
        for part in parts
    ]
    sub = pd.concat(chunks)
    df = pd.DataFrame(pd.concat([sub.mean(), sub.median(), sub.std(), sub.min(), sub.max(), sub.kurtosis(), sub.skew()],
                                keys=["mean", "median", "std", "min", "max", "kurtosis", "skew"])).transpose()
    df.insert(len(df.columns), "label", parts[1])
    return df

In [13]:
def subsample_3_status(X, Y, nb=20, part="FOL", subsize=30):
    """Sub-sample one recording and compute summary statistics for each phase of the fall.

    On each of the ``nb`` rounds three rows are produced, in this order: one drawn from the
    "STD" rows, one drawn from the "LYI" rows, and one mixed row labelled ``part`` that
    pools 1/3 "STD", 1/3 ``part`` and 1/3 "LYI" rows.

    X       -- feature rows of the recording
    Y       -- label of each row of X
    nb      -- number of rounds (the result has ``3 * nb`` rows)
    part    -- label of the fall type present in Y
    subsize -- number of rows drawn for each summary row

    Returns a DataFrame with a fresh 0..n-1 index; it is empty when ``nb`` is 0.
    """
    parts = ("STD", part, "LYI")
    # Row positions per label, computed on a plain boolean array so the lookup does not depend
    # on how numpy handles a pandas Series.
    list_index = {p: np.flatnonzero(np.asarray(Y == p)) for p in parts}

    # Collect the rows and concatenate once: DataFrame.append no longer exists in pandas >= 2.0,
    # and growing a frame inside the loop copies it on every round.
    rows = []
    for _ in range(nb):
        rows.append(subsample_one(X, list_index, parts[0], subsize))
        rows.append(subsample_one(X, list_index, parts[2], subsize))
        rows.append(subsample_one_mixed(X, list_index, parts, subsize))
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)

In [14]:
df = subsample_3_status(X, Y, nb=10)
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.919897,-9.561474,-1.30486,0.005742,0.009672,0.01854,13.969071,24.093475,-34.952802,0.981165,...,-0.638852,-1.564287,0.188629,-0.469203,0.845086,0.479874,-0.59113,-0.265491,-0.198435,STD
1,7.913071,-1.236224,5.410575,-0.000173,0.003523,0.008695,185.871774,6.761782,54.481111,7.923449,...,-0.504441,-0.315307,-0.491487,1.077394,-1.659816,-2.433636,-3.568497,0.105212,-4.463483,LYI
2,3.886621,-5.307311,1.889923,0.233452,0.040429,-0.130104,105.081594,21.566012,10.882259,2.017405,...,0.065525,0.072374,-0.162965,3.208071,3.818755,-2.890018,-0.151657,1.151875,-0.016778,FOL
3,0.913169,-9.542331,-1.315586,0.003187,0.018326,0.022134,13.849967,23.915461,-35.153427,0.978635,...,-0.569072,-2.234386,0.354858,-0.288348,0.514191,0.452181,-0.782806,-0.375171,-0.277697,STD
4,7.901692,-1.224928,5.426543,0.000998,-1e-05,0.009652,185.877197,6.770661,54.361232,7.908007,...,-3.338492,-0.253306,0.625435,-0.291009,-1.954634,-1.760711,-3.452433,0.099917,-3.341676,LYI


### Learn a SVC without normalization

In [15]:
class_svc = SVC()
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [16]:
df = subsample_3_status(X, Y, subsize=9)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.91666666666666663

In [17]:
df = subsample_3_status(*get_sample(trial=2), subsize=30)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.33333333333333331

### Normalize

In [18]:
def get_normalized_XY(df, scaler=None, fit=True):
    """Split ``df`` into scaled features and labels.

    The last column of ``df`` is the label; every other column is a feature.

    df     -- frame of features followed by one label column
    scaler -- object exposing ``fit_transform`` / ``transform``; defaults to the
              notebook-level ``norm`` scaler
    fit    -- when True (default) the scaler is re-fitted on these features, which changes
              the scaler's state; pass False to apply an already fitted scaler, e.g. to
              scale test data with the statistics of the training data

    Returns ``(X, Y)``: the scaler's output for the features, and the label column.
    """
    if scaler is None:
        scaler = norm
    features, labels = df.iloc[:, :-1], df.iloc[:, -1]
    if fit:
        scaled = scaler.fit_transform(features)
    else:
        scaled = scaler.transform(features)
    return scaled, labels

In [19]:
# Fresh scaler and classifier for the normalised experiment.
norm = preprocessing.MinMaxScaler()
class_svc = SVC()

df = subsample_3_status(*get_sample(), nb=10)
X1, Y1 = get_normalized_XY(df)
# NOTE(review): X1 was already min-max scaled by get_normalized_XY and X1_scaled is not used
# afterwards in this notebook -- this second scaling looks redundant.
X1_scaled = norm.fit_transform(X1)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [20]:
df = subsample_3_status(*get_sample(trial=2), nb=10)
X1, Y1 = get_normalized_XY(df)
# NOTE(review): the classifier is re-fitted on trial 2 right before being scored on the same
# data, so the value below is a training score, not a check of the model fitted on trial 1
# -- confirm this is intended.
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

## Sub-sampling for different fall types

In [21]:
def subsample_4_types(user=1, nb=20, subsize=30, trials=(1, 2), dist=None):
    """Build mixed sub-samples for every fall type in ``FALLS`` for one user.

    user    -- user number whose recordings are read
    nb      -- number of sub-samples drawn per fall type and per trial when ``dist`` is None
    subsize -- number of rows pooled in each sub-sample
    trials  -- trial numbers to read for each fall type
    dist    -- optional dict mapping each fall type to the number of sub-samples to draw from
               each of its trials; it must contain every entry of ``FALLS``

    Returns a DataFrame with one summary row per sub-sample and a fresh 0..n-1 index; it is
    empty when nothing was drawn. Errors raised while reading a recording or while drawing
    (e.g. a missing file, or too few rows for a label) propagate to the caller.
    """
    if dist is None:
        dist = {f: nb for f in FALLS}

    # Collect the rows and concatenate once: DataFrame.append no longer exists in pandas >= 2.0.
    rows = []
    for fall in FALLS:
        parts = ("STD", fall, "LYI")
        for trial in trials:
            X, Y = get_sample(user_nbr=user, fall=fall, trial=trial)
            # Row positions per label, computed on a plain boolean array so the lookup does
            # not depend on how numpy handles a pandas Series.
            list_index = {p: np.flatnonzero(np.asarray(Y == p)) for p in parts}
            for _ in range(dist[fall]):
                rows.append(subsample_one_mixed(X, list_index, parts, subsize))
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)

### Example of Sub-sampling

In [22]:
subsample_4_types(dist={'BSC': 5, 'FKL': 3, 'FOL': 10, 'SDL': 2}, trials=[2])

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,2.759072,-5.946145,3.503745,0.21137,0.406551,-0.020159,148.28125,50.541142,22.811359,3.454566,...,0.079908,0.609706,0.287229,2.754278,2.342445,1.35346,-0.256434,0.882517,-0.471886,FOL
1,3.020274,-7.20223,3.171492,0.249121,-0.300159,-0.092678,150.274027,56.425775,21.33673,1.099481,...,1.622343,0.228738,1.088346,2.936139,-3.874773,1.092403,-0.509952,0.609454,-0.500034,FOL
2,2.117222,-6.72885,2.317374,0.237362,0.104214,-0.153337,149.272751,56.585561,21.491008,-0.111673,...,0.176086,0.073096,0.213324,2.20506,4.674954,-2.556698,-0.806731,0.511565,-0.760922,FOL
3,2.671019,-6.328791,3.05196,-0.096792,0.488896,0.111076,148.908138,56.761798,21.142351,2.302552,...,0.655683,2.689447,-0.147705,-4.297621,4.679743,5.239328,-0.211842,0.479247,-0.556566,FOL
4,2.594136,-7.331604,2.65039,0.36601,0.098685,-0.494994,149.030291,60.522922,19.910041,0.87446,...,1.946567,-0.296174,0.867871,2.372118,4.622237,-5.007633,-0.466572,0.20913,-0.233215,FOL
5,4.310217,-5.479736,3.916674,0.283635,-0.184023,0.090001,149.080027,51.096835,23.812173,5.289357,...,1.271623,1.221213,0.432883,3.592997,-0.896729,1.277319,-0.207115,0.985446,-0.822117,FOL
6,1.722456,-7.052262,2.759202,0.119536,0.163916,-0.061779,149.71351,57.835068,19.983125,-0.051632,...,-1.335569,0.129875,0.611586,2.9156,3.360706,-3.341855,-0.539731,0.511467,-0.503506,FOL
7,2.193535,-6.566452,1.910903,0.242412,-0.156819,-0.099215,151.123795,59.698064,20.555553,0.337806,...,0.695092,2.61718,0.366257,2.301095,-4.071421,1.727441,-1.056995,0.357082,-0.282259,FOL
8,2.298473,-7.213874,2.375379,0.10741,-0.041651,-0.083383,150.224048,61.379152,19.614184,0.467746,...,0.241929,0.19462,0.225371,2.395483,2.605925,-2.433596,-0.864884,0.240009,-0.179939,FOL
9,2.714751,-6.683873,2.84192,0.171052,-0.199702,-0.133841,150.018884,60.710677,17.813266,2.956287,...,-0.280233,-0.205038,-0.037737,1.646332,-3.208558,-1.354844,-0.93642,0.359664,-0.525579,FOL


### Example of training a SVC

In [23]:
df = subsample_4_types(user=1, trials=[3])
X1, Y1 = get_normalized_XY(df)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

0.98750000000000004

In [24]:
index = df.columns

## Create a dataset

In [25]:
class Dataset():
    """Normalised train/test data for one user.

    Attributes set by the constructor:
      user    -- identifier passed in by the caller
      index   -- column labels of ``df_train`` (feature columns followed by the label column)
      X_train, Y_train -- output of ``get_normalized_XY(df_train)``
      X_test, Y_test   -- output of ``get_normalized_XY(df_test)``
    """

    def __init__(self, user, df_train, df_test):
        self.user = user
        self.index = df_train.columns
        # Extract X, Y and normalize.
        # NOTE(review): each call re-fits the scaler, so the test features are scaled with
        # their own min/max rather than with the train ones -- confirm this is intended.
        self.X_train, self.Y_train = get_normalized_XY(df_train)
        self.X_test, self.Y_test = get_normalized_XY(df_test)

    def _to_dataframe(self, features, labels):
        """Rebuild a frame carrying the original column labels from features and their labels."""
        df = pd.DataFrame(features)
        # Attach the labels by position: the frame above has a fresh 0..n-1 index, so aligning
        # on the labels' own index would yield NaN whenever that index is not 0..n-1.
        df['label'] = np.asarray(labels)
        df.columns = self.index
        return df

    def test_to_dataframe(self):
        """Return the test features and labels as one DataFrame with the original columns."""
        return self._to_dataframe(self.X_test, self.Y_test)

    def train_to_dataframe(self):
        """Return the train features and labels as one DataFrame with the original columns."""
        return self._to_dataframe(self.X_train, self.Y_train)

    def save(self, dest_dir):
        """Write ``user<user>_train.csv`` and ``user<user>_test.csv`` into ``dest_dir``."""
        self.train_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_train.csv".format(self.user)))
        self.test_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_test.csv".format(self.user)))

    def __repr__(self):
        return "user {}: train size is {}, test size is {}".format(self.user, len(self.X_train), len(self.X_test))

In [26]:
def create_dataset(low=2, high=10, nb_test=20):
    """Build one ``Dataset`` per user in ``USERS`` and return them as a list.

    For each user the number of train sub-samples per fall type and trial is drawn with
    ``np.random.randint(low, high)``; train data comes from trials 1 and 2, test data from
    trial 3 with ``nb_test`` sub-samples per fall type. A user whose processing raises
    ``FileNotFoundError`` or ``ValueError`` is reported on stdout and left out of the result.
    """
    built = []

    for user in USERS:
        try:
            train_count = np.random.randint(low, high)
            # sample train and test
            train_frame = subsample_4_types(user=user, nb=train_count, trials=[1,2])
            test_frame = subsample_4_types(user=user, nb=nb_test, trials=[3])
            # keep everything
            user_dataset = Dataset(user, train_frame, test_frame)
            built.append(user_dataset)
            print(user_dataset)
        except (FileNotFoundError, ValueError) as detail:
            print('Problem with user {}:  {}'.format(user, detail))
    return built

In [27]:
datasets = create_dataset()

user 1: train size is 72, test size is 80
user 2: train size is 40, test size is 80
Problem with user 3:  File b'../datasets/MobiAct_Dataset_v2.0/Annotated Data/BSC/BSC_3_2_annotated.csv' does not exist
user 4: train size is 72, test size is 80
user 5: train size is 48, test size is 80
user 6: train size is 48, test size is 80
user 7: train size is 72, test size is 80
user 8: train size is 56, test size is 80
user 9: train size is 16, test size is 80
user 10: train size is 16, test size is 80
user 11: train size is 16, test size is 80
user 12: train size is 72, test size is 80
user 13: train size is 16, test size is 80
user 14: train size is 72, test size is 80
user 15: train size is 32, test size is 80
user 16: train size is 32, test size is 80
user 17: train size is 72, test size is 80
user 18: train size is 56, test size is 80
user 19: train size is 48, test size is 80
user 20: train size is 48, test size is 80
user 21: train size is 72, test size is 80
user 22: train size is 64, te

## Performance with a linear SVC

### Local models

In [28]:
class_svc = SVC(kernel="linear")
# Training sets of the users that were processed without error, gathered for the global model.
train_features = []
train_labels = []

for dataset in datasets:
    try:
        # train
        class_svc.fit(dataset.X_train, dataset.Y_train)
        train_acc = class_svc.score(dataset.X_train, dataset.Y_train)
        # test
        test_acc = class_svc.score(dataset.X_test, dataset.Y_test)
        print("user {}, train:{}; test: {}".format(dataset.user,  train_acc, test_acc))
        # gather the train datasets for a global model
        train_features.append(dataset.X_train)
        train_labels.append(dataset.Y_train)
    except Exception as detail:
        # Keep going with the other users, but say what went wrong (a bare `except:` would
        # also swallow KeyboardInterrupt and hide the cause).
        print('user {}: problem ({})'.format(dataset.user, detail))

# Concatenate once after the loop (Series.append no longer exists in pandas >= 2.0).
# 63 = 7 statistics x 9 sensor columns; it is only the width of the empty fallback.
allX_train = np.concatenate(train_features) if train_features else np.empty((0, 63))
allY_train = pd.concat(train_labels) if train_labels else pd.Series(dtype=object)

user 1, train:1.0; test: 0.85
user 2, train:0.975; test: 0.8875
user 4, train:1.0; test: 0.9
user 5, train:1.0; test: 0.95
user 6, train:1.0; test: 0.7125
user 7, train:1.0; test: 0.725
user 8, train:1.0; test: 0.8375
user 9, train:1.0; test: 0.85
user 10, train:1.0; test: 0.7375
user 11, train:1.0; test: 0.825
user 12, train:1.0; test: 0.75
user 13, train:1.0; test: 0.8375
user 14, train:1.0; test: 1.0
user 15, train:1.0; test: 0.8125
user 16, train:1.0; test: 0.825
user 17, train:1.0; test: 0.8875
user 18, train:1.0; test: 0.75
user 19, train:1.0; test: 0.75
user 20, train:1.0; test: 0.7
user 21, train:1.0; test: 0.55
user 22, train:1.0; test: 0.6625
user 23, train:1.0; test: 0.5875
user 25, train:1.0; test: 1.0
user 26, train:1.0; test: 0.8
user 27, train:1.0; test: 0.7625
user 28, train:1.0; test: 0.675
user 29, train:1.0; test: 0.7625
user 30, train:1.0; test: 0.9125
user 31, train:1.0; test: 0.5125
user 32, train:1.0; test: 0.7875
user 33, train:1.0; test: 0.975
user 34, train:1.

### Global model

In [29]:
# Fit one model on the pooled training data of all users, then score it on each user's test set.
class_svc.fit(allX_train, allY_train)
global_train_score = class_svc.score(allX_train, allY_train)
print("train: ", global_train_score)
for dataset in datasets:
    try:
        global_test_score = class_svc.score(dataset.X_test, dataset.Y_test)
        print("test for user {}: {}".format(dataset.user, global_test_score))
    except Exception as detail:
        # Keep going with the other users, but say what went wrong (a bare `except:` would
        # also swallow KeyboardInterrupt and hide the cause).
        print('user {}: problem ({})'.format(dataset.user, detail))

train:  0.801880222841
test for user 1: 0.6375
test for user 2: 0.65
test for user 4: 0.625
test for user 5: 0.65
test for user 6: 0.75
test for user 7: 0.775
test for user 8: 0.6125
test for user 9: 0.675
test for user 10: 0.3375
test for user 11: 0.825
test for user 12: 0.8875
test for user 13: 0.725
test for user 14: 0.9375
test for user 15: 0.175
test for user 16: 0.8375
test for user 17: 0.925
test for user 18: 0.7125
test for user 19: 0.7875
test for user 20: 0.825
test for user 21: 0.7625
test for user 22: 0.725
test for user 23: 0.4375
test for user 25: 0.925
test for user 26: 0.3
test for user 27: 0.425
test for user 28: 0.5
test for user 29: 0.6625
test for user 30: 0.9125
test for user 31: 0.9
test for user 32: 0.925
test for user 33: 0.7
test for user 34: 0.7
test for user 35: 0.5125
test for user 36: 0.775
test for user 37: 0.8375
test for user 38: 0.825
test for user 40: 0.8
test for user 42: 0.8875
test for user 43: 0.7375
test for user 44: 0.7375
test for user 45: 0.95


# Save the dataset

In [30]:
import sys
import os
import shutil

In [31]:
dest_dir = "../datasets/Mobi_Generated"
# Start from an empty output directory: anything already under dest_dir is deleted first.
if os.path.exists(dest_dir):
    shutil.rmtree(dest_dir)
os.makedirs(dest_dir)

# Each dataset writes its own train and test CSV files into dest_dir.
for dataset in datasets:
    dataset.save(dest_dir)