In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.svm import SVC
import os.path

norm = preprocessing.MinMaxScaler()

# Users

There are missing values for the gender (represented as '-'). I normalize the columns and compute the distance matrix.


In [2]:
USER_FILE = "../datasets/Mobi_Users.csv"

users = pd.read_csv(USER_FILE,usecols=(3, 4, 5, 6))
users.replace(('M', 'F', '-'),(1,0,0.5), inplace=True)
users.head()

Unnamed: 0,Age,Height,Weight,Gender
0,32,180,85,1
1,26,169,64,1
2,26,164,55,0
3,32,186,93,1
4,36,160,50,0


In [3]:
min_max_scaler = preprocessing.MinMaxScaler()
scaled_users = min_max_scaler.fit_transform(users[['Age', 'Height', 'Weight', 'Gender']])
user_distances = pairwise_distances(scaled_users)

# Data

Using the 'fall' data: 

    | 10 | FOL   | Forward-lying      | 3      | 10s      | Fall Forward from standing, use of hands to dampen fall |
    | 11 | FKL   | Front-knees-lying  | 3      | 10s      | Fall forward from standing, first impact on knees       |
    | 12 | BSC   | Back-sitting-chair | 3      | 10s      | Fall backward while trying to sit on a chair            |
    | 13 | SDL   | Sideward-lying     | 3      | 10s      | Fall sidewards from standing, bending legs              |

Seems that there are 3 files per user. Maybe for train/test/valid

In [4]:
MOBI_PATH = "../datasets/MobiAct_Dataset_v2.0/Annotated Data/"
FALLS = ("FOL", "FKL", "BSC", "SDL")
USERS = range(1,68) 

In [5]:
def get_sample(user_nbr=1, fall="FOL", trial=1):
    path = MOBI_PATH+fall+'/'+fall+'_'+str(user_nbr)+'_'+str(trial)+'_annotated.csv'
    data = pd.read_csv(path, usecols=range(2, 12)) # I skipp the timestamps (ie identifiers)
    return data.iloc[:,:-1], data.iloc[:,-1] 

## Example for user 1, type FOL

In [6]:
X, Y = get_sample()

In [7]:
Y.head()

0    STD
1    STD
2    STD
3    STD
4    STD
Name: label, dtype: object

## tentative de classif par timestamp, juste pour vérifier

(un example correspond à un relevé à un instant, c'est un item dans la série...)

In [8]:
class_svc = SVC()
class_svc.fit(X,Y)
class_svc.score(X,Y)

0.99949315762797775

In [9]:
class_svc.score(*get_sample(trial=2)), class_svc.score(*get_sample(trial=3)) 

(0.24632539280283833, 0.27456940222897669)

## Let's subsample 

On prend aléatoirement uniformément 30 items dans [STD], dans [LYI], et 30 items dans [STD,FOL,LYI].
On calcule, min, max, median, mean, kurtosis, skew

In [10]:
def subsample_one(data, list_index, part="STD", subsize=30): 
    r = np.random.choice(list_index[part], subsize, replace=False)
    sub = data.iloc[r]
    df = pd.DataFrame(pd.concat([sub.mean(), sub.median(), sub.std(), sub.min(), sub.max(), sub.kurtosis(), sub.skew()], 
                                keys=["mean", "median", "std", "min", "max", "kurtosis", "skew"])).transpose()
    df.insert(len(df.columns), "label", part)
    return df

In [11]:
list_index = dict()
parts = ("STD", "FOL", "LYI")
for p in parts:
    list_index[p] = np.argwhere(Y==p)[:,0]
subsample_one(X, list_index)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.907795,-9.532425,-1.330183,0.007646,0.015954,0.016554,13.258369,23.060572,-36.154591,0.893262,...,-0.224251,-1.119896,0.259626,-0.185773,0.668342,0.941835,-0.741449,-0.274564,-0.094565,STD


In [12]:
def subsample_one_mixed(data, list_index, parts=("STD", "FOL", "LYI"), subsize=30):
    sub = pd.DataFrame()
    for part in parts:
        r = np.random.choice(list_index[part], subsize//3, replace=False)
        sub = sub.append(data.iloc[r])
    df = pd.DataFrame(pd.concat([sub.mean(), sub.median(), sub.std(), sub.min(), sub.max(), sub.kurtosis(), sub.skew()], 
                                keys=["mean", "median", "std", "min", "max", "kurtosis", "skew"])).transpose()
    df.insert(len(df.columns), "label", parts[1])
    return df

In [13]:
def subsample_3_status(X, Y,nb=20, part="FOL", subsize=30):
    """ Subsample and compute stats in each part of the fall. Label each subsample with STD, LYI or the fall type. 
    For the fall type we take 1/3 of STD, 1/3 of LYI and 1/3 of the fall type. 
    """
    list_index = dict()
    df = pd.DataFrame()
    parts = ("STD", part, "LYI")
    for p in parts:
        list_index[p] = np.argwhere(Y==p)[:,0]
    for n in range(nb):
        df = df.append(subsample_one(X, list_index, parts[0], subsize), ignore_index=True)
        df = df.append(subsample_one(X, list_index, parts[2], subsize), ignore_index=True)
        df = df.append(subsample_one_mixed(X, list_index, parts, subsize), ignore_index=True)
    return df

In [14]:
df = subsample_3_status(X, Y, nb=10)
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.934705,-9.553718,-1.339416,-0.005396,0.027581,0.022602,13.860265,24.098892,-34.852351,0.979252,...,-0.956306,-1.351137,0.67332,-0.281466,0.426836,0.547684,-1.152358,-0.787722,-0.661338,STD
1,7.923596,-1.214669,5.419357,-0.000143,0.006088,0.009479,186.220047,6.828242,54.634771,7.916865,...,3.33038,-0.301911,-1.102407,0.468369,2.09854,0.270623,-3.571111,-0.34698,-3.690234,LYI
2,5.2433,-4.162903,2.593481,0.109915,0.100487,-0.250404,107.359726,16.254001,13.579076,5.384418,...,0.778998,0.951527,0.058184,1.739003,2.149368,-3.307598,-0.271569,0.98858,-0.261145,FOL
3,0.938918,-9.548176,-1.308829,0.007371,0.016962,0.014111,14.000217,24.175374,-34.791728,0.979416,...,-1.114188,-1.865045,-0.211197,-2.434813,0.379096,0.975992,-1.277285,-0.621439,-0.405837,STD
4,7.918872,-1.22902,5.407739,0.000794,0.001537,0.007687,185.680545,6.843501,54.338152,7.916158,...,1.697177,-0.167359,-1.588543,0.426616,-2.368239,-2.826797,-3.228258,-0.724259,-3.112659,LYI


### Learn a SVC without normalization

In [15]:
class_svc = SVC()
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [16]:
df = subsample_3_status(X, Y, subsize=9)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.76666666666666672

In [17]:
df = subsample_3_status(*get_sample(trial=2), subsize=30)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.33333333333333331

### Normalize

In [18]:
def get_normalized_XY(df):
    X, Y = df.iloc[:,:-1], df.iloc[:,-1] 
    X = norm.fit_transform(X)
    return X,Y

In [19]:
norm = preprocessing.MinMaxScaler()
class_svc = SVC()

df = subsample_3_status(*get_sample(), nb=10)
X1, Y1 = get_normalized_XY(df)
X1_scaled = norm.fit_transform(X1)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [20]:
df = subsample_3_status(*get_sample(trial=2), nb=10)
X1, Y1 = get_normalized_XY(df)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

## Sub-sampling for different fall types

In [21]:
def subsample_4_types(user=1, nb=20, subsize=30, trials=[1,2], dist=None):
    """ nb is the number of times we subsample in one trial; subsize is the size of the subsample we draw, 
    dist is the number of each class in each trial"""
    df = pd.DataFrame()
    if dist is None:
        dist = {f:nb for f in FALLS}
        
    for fall in FALLS:
        for trial in trials:
            X, Y = get_sample(user_nbr=user, fall=fall, trial=trial)
            
            list_index = dict()
            parts = ("STD", fall, "LYI")
            for p in parts:
                list_index[p] = np.argwhere(Y==p)[:,0]
            for n in range(dist[fall]):
                df = df.append(subsample_one_mixed(X, list_index, parts, subsize), ignore_index=True)
    return df    

### Example of Sub-sampling

In [22]:
subsample_4_types(dist={'BSC': 5, 'FKL': 3, 'FOL': 10, 'SDL': 2}, trials=[2])

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,2.471433,-6.656042,2.444953,0.017094,-0.270735,0.111676,150.793127,56.066634,20.34703,0.163383,...,0.610019,0.405442,0.431436,0.218938,-2.943277,2.072042,-0.977163,0.568446,-0.346399,FOL
1,2.950041,-6.183433,2.434565,0.322863,-0.106851,0.033404,152.939064,54.115308,22.22049,0.793215,...,1.785613,1.453608,0.574234,2.442147,-0.862557,3.046484,-0.905409,0.674425,-0.342293,FOL
2,2.706248,-6.980378,2.783843,0.141955,0.060007,-0.005019,148.946217,59.305651,19.357415,0.745904,...,1.352729,0.048458,0.902543,3.762141,4.966659,3.102231,-0.279848,0.482242,-0.560909,FOL
3,2.63349,-6.554262,2.868231,0.203825,0.008888,0.028181,149.588973,53.289515,21.563829,2.131003,...,0.005923,-0.148245,0.132265,2.077918,3.666627,0.768542,-0.431515,0.826525,-0.495407,FOL
4,2.358377,-6.581684,3.094183,-0.002535,0.129473,0.056617,150.440176,55.586882,20.120016,0.358925,...,0.499388,-0.007931,0.782356,0.018354,1.767467,2.355327,-0.779858,0.51105,-0.143262,FOL
5,1.904992,-6.838619,1.833276,0.218008,-0.116747,-0.186568,149.043765,64.310686,17.175944,0.214365,...,0.451261,0.515871,0.512991,2.220623,-4.34138,-2.613026,-0.494439,0.147,-0.38898,FOL
6,2.527667,-6.626004,2.604221,0.122316,0.001843,-0.095183,150.495657,61.991001,19.203352,0.453849,...,0.192687,1.75111,0.21361,3.085986,0.881842,-1.783965,-0.708427,0.227432,-0.173314,FOL
7,1.984748,-6.673555,2.813295,0.174433,0.213691,-0.142026,149.682444,54.68731,22.328781,0.940066,...,-0.577803,-0.097228,0.490529,1.554199,3.22904,-2.768395,-0.853584,0.627176,-0.473924,FOL
8,2.574693,-5.919696,2.7613,0.179747,-0.336607,-0.063866,149.256351,55.347661,23.027362,0.627132,...,0.897998,3.129803,0.809657,1.355028,-2.849402,1.177921,-0.616852,0.599552,-0.363262,FOL
9,2.947857,-6.591012,3.054997,-0.050091,-0.080838,0.10797,149.298881,52.263351,22.069769,3.810156,...,0.352952,-0.080099,-0.133504,-1.573654,-2.051937,3.031256,-0.964525,0.798659,-0.537547,FOL


### Example of training a SVC

In [23]:
df = subsample_4_types(user=1, trials=[3])
X1, Y1 = get_normalized_XY(df)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [24]:
index = df.columns

## Create a dataset

In [25]:
class Dataset():
    def __init__(self, user, df_train, df_test):
        self.user = user
        self.index = df_train.columns
        # Extract X, Y and normalize
        self.X_train, self.Y_train = get_normalized_XY(df_train)
        self.X_test, self.Y_test = get_normalized_XY(df_test)

    def test_to_dataframe(self):
        df = pd.DataFrame(self.X_test)
        df['label'] = self.Y_test
        df.columns = self.index
        return df
    
    def train_to_dataframe(self):
        df = pd.DataFrame(self.X_train)
        df['label'] = self.Y_train
        df.columns = self.index
        return df
    
    def save(self, dest_dir):
        self.train_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_train.csv".format(self.user)))
        self.test_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_test.csv".format(self.user)))
        
    def __repr__(self):
        return "user {}: train size is {}, test size is {}".format(self.user, len(self.X_train), len(self.X_test))

In [28]:
def create_dataset(low=2, high=10, nb_test=20):
    """Create a dataset with simple options, return 4 dicts users->train/test samples of descriptions and class"""
    datasets = list()

    for user in USERS:
        try: 
            nb_train = np.random.randint(low,high)
            # sample train and test
            df_train = subsample_4_types(user=user, nb=nb_train, trials=[1,2])
            df_test = subsample_4_types(user=user, nb=nb_test, trials=[3])
            # keep everything
            dataset=Dataset(user, df_train, df_test)
            datasets.append(dataset)
            print(dataset)
        except (FileNotFoundError, ValueError) as detail :
            print('Problem with user {}:  {}'.format(user, detail) )
    return datasets

In [29]:
datasets = create_dataset()

user 1: train size is 40, test size is 80
user 2: train size is 40, test size is 80
user 3: train size is 32, test size is 80
user 4: train size is 48, test size is 80
user 5: train size is 72, test size is 80
user 6: train size is 32, test size is 80
user 7: train size is 72, test size is 80
user 8: train size is 64, test size is 80
user 9: train size is 32, test size is 80
user 10: train size is 56, test size is 80
user 11: train size is 16, test size is 80
user 12: train size is 16, test size is 80
user 13: train size is 40, test size is 80
user 14: train size is 48, test size is 80
user 15: train size is 40, test size is 80
user 16: train size is 56, test size is 80
user 17: train size is 32, test size is 80
user 18: train size is 56, test size is 80
user 19: train size is 16, test size is 80
user 20: train size is 32, test size is 80
user 21: train size is 40, test size is 80
user 22: train size is 32, test size is 80
user 23: train size is 56, test size is 80


OSError: File b'../datasets/MobiAct_Dataset_v2.0/Annotated Data/FOL/FOL_24_1_annotated.csv' does not exist

## Perfs with a linear SVC

### Local models

In [28]:
class_svc = SVC(kernel="linear")
allX_train = np.empty((0, 63))
allY_train = pd.Series()

for dataset in datasets:
    try: 
        # train
        class_svc.fit(dataset.X_train, dataset.Y_train)
        train_acc = class_svc.score(dataset.X_train, dataset.Y_train)
        # test
        test_acc = class_svc.score(dataset.X_test, dataset.Y_test)
        print("user {}, train:{}; test: {}".format(dataset.user,  train_acc, test_acc))
        # gather the train datasets for a global model
        allX_train = np.concatenate((allX_train, dataset.X_train))
        allY_train = allY_train.append(dataset.Y_train)       
    except:
        print('user {}: problem'.format(dataset.user) )

user 1, train:1.0; test: 0.85
user 2, train:0.975; test: 0.8875
user 4, train:1.0; test: 0.9
user 5, train:1.0; test: 0.95
user 6, train:1.0; test: 0.7125
user 7, train:1.0; test: 0.725
user 8, train:1.0; test: 0.8375
user 9, train:1.0; test: 0.85
user 10, train:1.0; test: 0.7375
user 11, train:1.0; test: 0.825
user 12, train:1.0; test: 0.75
user 13, train:1.0; test: 0.8375
user 14, train:1.0; test: 1.0
user 15, train:1.0; test: 0.8125
user 16, train:1.0; test: 0.825
user 17, train:1.0; test: 0.8875
user 18, train:1.0; test: 0.75
user 19, train:1.0; test: 0.75
user 20, train:1.0; test: 0.7
user 21, train:1.0; test: 0.55
user 22, train:1.0; test: 0.6625
user 23, train:1.0; test: 0.5875
user 25, train:1.0; test: 1.0
user 26, train:1.0; test: 0.8
user 27, train:1.0; test: 0.7625
user 28, train:1.0; test: 0.675
user 29, train:1.0; test: 0.7625
user 30, train:1.0; test: 0.9125
user 31, train:1.0; test: 0.5125
user 32, train:1.0; test: 0.7875
user 33, train:1.0; test: 0.975
user 34, train:1.

### Global model

In [30]:
class_svc.fit(allX_train, allY_train)
global_train_score = class_svc.score(allX_train, allY_train)
print("train: ", global_train_score)
for dataset in datasets: 
    try:
        global_test_score = class_svc.score(dataset.X_test, dataset.Y_test)
        print("test for user {}: {}".format(dataset.user, global_test_score))
    except:
        print('user {}: problem'.format(dataset.user))

NameError: name 'allX_train' is not defined

# Save the dataset

In [30]:
import sys
import os
import shutil

In [31]:
dest_dir = "../datasets/Mobi_Generated"
if os.path.exists(dest_dir):
    shutil.rmtree(dest_dir)
os.makedirs(dest_dir)

for dataset in datasets:
    dataset.save(dest_dir)