In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.svm import SVC
import os.path

norm = preprocessing.MinMaxScaler()

# Users

There are missing values for the gender (represented as '-'). I normalize the columns and compute the distance matrix.


In [2]:
USER_FILE = "../datasets/Mobi_Users.csv"

users = pd.read_csv(USER_FILE,usecols=(3, 4, 5, 6))
users.replace(('M', 'F', '-'),(1,0,0.5), inplace=True)
users.head()

Unnamed: 0,Age,Height,Weight,Gender
0,32,180,85,1
1,26,169,64,1
2,26,164,55,0
3,32,186,93,1
4,36,160,50,0


In [35]:
min_max_scaler = preprocessing.MinMaxScaler()
scaled_users = min_max_scaler.fit_transform(users[['Age', 'Height', 'Weight', 'Gender']])
user_distances = pairwise_distances(scaled_users)
print(user_distances, user_distances.shape)

[[ 0.          0.48801458  1.20084794 ...,  0.46405012  0.57991928
   0.42099116]
 [ 0.48801458  0.          1.01830191 ...,  0.73086769  0.7702339
   0.33609214]
 [ 1.20084794  1.01830191  0.         ...,  1.30824864  1.37692169
   1.11835282]
 ..., 
 [ 0.46405012  0.73086769  1.30824864 ...,  0.          1.01956519
   0.83684588]
 [ 0.57991928  0.7702339   1.37692169 ...,  1.01956519  0.          0.45005529]
 [ 0.42099116  0.33609214  1.11835282 ...,  0.83684588  0.45005529  0.        ]] (67, 67)


# Data

Using the 'fall' data: 

    | 10 | FOL   | Forward-lying      | 3      | 10s      | Fall Forward from standing, use of hands to dampen fall |
    | 11 | FKL   | Front-knees-lying  | 3      | 10s      | Fall forward from standing, first impact on knees       |
    | 12 | BSC   | Back-sitting-chair | 3      | 10s      | Fall backward while trying to sit on a chair            |
    | 13 | SDL   | Sideward-lying     | 3      | 10s      | Fall sidewards from standing, bending legs              |

Seems that there are 3 files per user. Maybe for train/test/valid

In [4]:
MOBI_PATH = "../datasets/MobiAct_Dataset_v2.0/Annotated Data/"
FALLS = ("FOL", "FKL", "BSC", "SDL")
USERS = range(1,68) 

In [5]:
def get_sample(user_nbr=1, fall="FOL", trial=1):
    path = MOBI_PATH+fall+'/'+fall+'_'+str(user_nbr)+'_'+str(trial)+'_annotated.csv'
    data = pd.read_csv(path, usecols=range(2, 12)) # I skipp the timestamps (ie identifiers)
    return data.iloc[:,:-1], data.iloc[:,-1] 

## Example for user 1, type FOL

In [6]:
X, Y = get_sample()

In [7]:
Y.head()

0    STD
1    STD
2    STD
3    STD
4    STD
Name: label, dtype: object

## tentative de classif par timestamp, juste pour vérifier

(un example correspond à un relevé à un instant, c'est un item dans la série...)

In [8]:
class_svc = SVC()
class_svc.fit(X,Y)
class_svc.score(X,Y)

0.99949315762797775

In [9]:
class_svc.score(*get_sample(trial=2)), class_svc.score(*get_sample(trial=3)) 

(0.24632539280283833, 0.27456940222897669)

## Let's subsample 

On prend aléatoirement uniformément 30 items dans [STD], dans [LYI], et 30 items dans [STD,FOL,LYI].
On calcule, min, max, median, mean, kurtosis, skew

In [10]:
def subsample_one(data, list_index, part="STD", subsize=30): 
    r = np.random.choice(list_index[part], subsize, replace=False)
    sub = data.iloc[r]
    df = pd.DataFrame(pd.concat([sub.mean(), sub.median(), sub.std(), sub.min(), sub.max(), sub.kurtosis(), sub.skew()], 
                                keys=["mean", "median", "std", "min", "max", "kurtosis", "skew"])).transpose()
    df.insert(len(df.columns), "label", part)
    return df

In [11]:
list_index = dict()
parts = ("STD", "FOL", "LYI")
for p in parts:
    list_index[p] = np.argwhere(Y==p)[:,0]
subsample_one(X, list_index)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.919164,-9.569818,-1.321126,0.0011,0.01179,0.017349,13.760168,23.869664,-35.173564,0.972631,...,-0.563818,-1.574362,0.383674,-0.371411,0.709066,0.872427,-0.64966,-0.355123,-0.252366,STD


In [12]:
def subsample_one_mixed(data, list_index, parts=("STD", "FOL", "LYI"), subsize=30):
    sub = pd.DataFrame()
    for part in parts:
        r = np.random.choice(list_index[part], subsize//3, replace=False)
        sub = sub.append(data.iloc[r])
    df = pd.DataFrame(pd.concat([sub.mean(), sub.median(), sub.std(), sub.min(), sub.max(), sub.kurtosis(), sub.skew()], 
                                keys=["mean", "median", "std", "min", "max", "kurtosis", "skew"])).transpose()
    df.insert(len(df.columns), "label", parts[1])
    return df

In [13]:
def subsample_3_status(X, Y,nb=20, part="FOL", subsize=30):
    """ Subsample and compute stats in each part of the fall. Label each subsample with STD, LYI or the fall type. 
    For the fall type we take 1/3 of STD, 1/3 of LYI and 1/3 of the fall type. 
    """
    list_index = dict()
    df = pd.DataFrame()
    parts = ("STD", part, "LYI")
    for p in parts:
        list_index[p] = np.argwhere(Y==p)[:,0]
    for n in range(nb):
        df = df.append(subsample_one(X, list_index, parts[0], subsize), ignore_index=True)
        df = df.append(subsample_one(X, list_index, parts[2], subsize), ignore_index=True)
        df = df.append(subsample_one_mixed(X, list_index, parts, subsize), ignore_index=True)
    return df

In [14]:
df = subsample_3_status(X, Y, nb=10)
df.head()

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,0.94118,-9.546622,-1.328299,0.005498,0.024048,0.020769,13.856541,23.979287,-35.030134,0.97901,...,-0.887653,-2.040978,0.310637,-0.77087,0.216988,0.542839,-0.97453,-0.609662,-0.483729,STD
1,7.907255,-1.23684,5.418122,0.001975,0.005152,0.010527,186.918277,6.819188,54.713996,7.911761,...,0.391279,0.296913,-0.271146,0.451346,-0.266559,-0.483228,-1.839624,-0.473636,-3.153487,LYI
2,3.918613,-5.621086,1.624355,0.281253,0.100111,-0.112939,98.955584,21.80336,8.694206,1.679729,...,0.358517,1.383899,0.271981,2.562237,5.195564,-0.778506,-0.006894,1.256155,0.091516,FOL
3,0.934886,-9.548511,-1.328098,0.004378,0.01684,0.023478,13.982088,24.173447,-34.776579,0.981044,...,-0.852279,-2.067043,0.408989,-2.285107,0.436736,0.250193,-1.17002,-0.710373,-0.549359,STD
4,7.907037,-1.220354,5.418479,-0.001038,0.00057,0.008206,185.876647,6.845664,54.413449,7.904068,...,1.052608,-1.124738,-1.50577,0.085253,-2.035886,-1.38847,-3.469223,-0.324084,-3.385238,LYI


### Learn a SVC without normalization

In [15]:
class_svc = SVC()
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [16]:
df = subsample_3_status(X, Y, subsize=9)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.83333333333333337

In [17]:
df = subsample_3_status(*get_sample(trial=2), subsize=30)
X1, Y1 = df.iloc[:,:-1], df.iloc[:,-1] 
class_svc.score(X1,Y1)

0.33333333333333331

### Normalize

In [18]:
def get_normalized_XY(df):
    X, Y = df.iloc[:,:-1], df.iloc[:,-1] 
    X = norm.fit_transform(X)
    return X,Y

In [19]:
norm = preprocessing.MinMaxScaler()
class_svc = SVC()

df = subsample_3_status(*get_sample(), nb=10)
X1, Y1 = get_normalized_XY(df)
X1_scaled = norm.fit_transform(X1)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

In [20]:
df = subsample_3_status(*get_sample(trial=2), nb=10)
X1, Y1 = get_normalized_XY(df)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

1.0

## Sub-sampling for different fall types

In [21]:
def subsample_4_types(user=1, nb=20, subsize=30, trials=[1,2], dist=None):
    """ nb is the number of times we subsample in one trial; subsize is the size of the subsample we draw, 
    dist is the number of each class in each trial"""
    df = pd.DataFrame()
    if dist is None:
        dist = {f:nb for f in FALLS}
        
    for fall in FALLS:
        for trial in trials:
            X, Y = get_sample(user_nbr=user, fall=fall, trial=trial)
            
            list_index = dict()
            parts = ("STD", fall, "LYI")
            for p in parts:
                list_index[p] = np.argwhere(Y==p)[:,0]
            for n in range(dist[fall]):
                df = df.append(subsample_one_mixed(X, list_index, parts, subsize), ignore_index=True)
    return df    

### Example of Sub-sampling

In [22]:
subsample_4_types(dist={'BSC': 5, 'FKL': 3, 'FOL': 10, 'SDL': 2}, trials=[2])

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,median,...,skew,skew,skew,skew,skew,skew,skew,skew,skew,label
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,acc_x,...,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,Unnamed: 21_level_1
0,2.417769,-6.766409,2.411367,0.319136,-0.179778,-0.104122,149.778264,61.775766,20.512962,1.085269,...,0.811425,1.627184,0.465924,1.807543,-5.337519,2.198007,-0.792501,0.161094,-0.358403,FOL
1,2.841081,-6.733122,2.650262,0.085796,-0.425844,-0.001466,150.278212,60.846959,19.379401,1.013027,...,1.112698,2.677507,1.076879,-1.969963,-3.229971,3.25659,-0.549405,0.257676,-0.200091,FOL
2,1.911701,-6.914617,1.823448,0.240233,-0.016096,-0.173262,151.615597,67.117608,16.238051,0.14901,...,0.442433,1.840719,0.526481,2.072188,1.629123,-3.205227,-0.82829,-0.096573,0.15087,FOL
3,1.897248,-7.162989,2.353869,0.102208,0.147778,-0.080858,149.299356,63.258369,16.757289,-0.112374,...,0.512223,0.333992,0.889924,4.656484,3.603439,-2.79734,-0.634726,0.20208,-0.441303,FOL
4,2.350564,-6.505378,3.550471,0.150578,0.09163,-0.203235,151.074324,53.772739,21.509066,0.656926,...,0.675723,0.088804,0.879518,-0.139841,-0.316576,-3.977278,-0.34585,0.66641,-0.468806,FOL
5,3.160041,-6.160636,3.163346,0.180063,0.192443,0.209527,150.203621,51.9512,21.460311,5.15883,...,0.265926,0.692144,0.215038,3.270778,2.197504,1.68895,-0.331245,0.890455,-0.348644,FOL
6,3.295353,-5.505698,3.344009,0.124566,-0.36097,0.164832,152.245658,56.854619,20.781388,1.33129,...,1.084582,2.657309,0.846303,1.9609,-3.452615,1.659879,-0.98686,0.552457,-0.305758,FOL
7,2.195803,-6.863329,2.22165,0.307174,-0.064864,-0.221754,149.884201,56.627907,22.083405,0.267886,...,0.195628,0.019527,0.192137,1.587318,-1.799336,-1.760221,-0.771836,0.508195,-0.437058,FOL
8,2.582918,-6.994193,2.697426,0.133423,0.309698,-0.007208,150.685173,59.943063,19.28884,0.540423,...,1.327835,0.12992,0.848411,2.910142,3.585556,4.083563,-0.491218,0.372864,-0.35253,FOL
9,2.20252,-6.629602,2.552903,-0.022531,0.046752,-0.064558,149.737852,53.304947,23.392294,0.913135,...,-0.427315,0.132049,0.110487,-3.138004,3.940437,-4.671779,-0.696111,0.849191,-0.670695,FOL


### Example of training a SVC

In [23]:
df = subsample_4_types(user=1, trials=[3])
X1, Y1 = get_normalized_XY(df)
class_svc.fit(X1,Y1)
class_svc.score(X1,Y1)

0.98750000000000004

In [24]:
index = df.columns

## Create a dataset

In [25]:
class Dataset():
    def __init__(self, user, df_train, df_test):
        self.user = user
        self.index = df_train.columns
        # Extract X, Y and normalize
        self.X_train, self.Y_train = get_normalized_XY(df_train)
        self.X_test, self.Y_test = get_normalized_XY(df_test)

    def test_to_dataframe(self):
        df = pd.DataFrame(self.X_test)
        df['label'] = self.Y_test
        df.columns = self.index
        return df
    
    def train_to_dataframe(self):
        df = pd.DataFrame(self.X_train)
        df['label'] = self.Y_train
        df.columns = self.index
        return df
    
    def save(self, dest_dir):
        self.train_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_train.csv".format(self.user)))
        self.test_to_dataframe().to_csv(os.path.join(dest_dir,"user{}_test.csv".format(self.user)))
        
    def __repr__(self):
        return "user {}: train size is {}, test size is {}".format(self.user, len(self.X_train), len(self.X_test))

In [28]:
def create_dataset(low=2, high=10, nb_test=20):
    """Create a dataset with simple options, return 4 dicts users->train/test samples of descriptions and class"""
    datasets = list()

    for user in USERS:
        try: 
            nb_train = np.random.randint(low,high)
            # sample train and test
            df_train = subsample_4_types(user=user, nb=nb_train, trials=[1,2])
            df_test = subsample_4_types(user=user, nb=nb_test, trials=[3])
            # keep everything
            dataset=Dataset(user, df_train, df_test)
            datasets.append(dataset)
            print(dataset)
        except (FileNotFoundError, ValueError, OSError) as detail :
            print('Problem with user {}:  {}'.format(user, detail) )
    return datasets

In [29]:
datasets = create_dataset()

user 1: train size is 16, test size is 80
user 2: train size is 72, test size is 80
Problem with user 3:  File b'../datasets/MobiAct_Dataset_v2.0/Annotated Data/BSC/BSC_3_2_annotated.csv' does not exist
user 4: train size is 48, test size is 80
user 5: train size is 40, test size is 80
user 6: train size is 32, test size is 80
user 7: train size is 48, test size is 80
user 8: train size is 24, test size is 80
user 9: train size is 72, test size is 80
user 10: train size is 40, test size is 80
user 11: train size is 56, test size is 80
user 12: train size is 56, test size is 80
user 13: train size is 24, test size is 80
user 14: train size is 64, test size is 80
user 15: train size is 56, test size is 80
user 16: train size is 40, test size is 80
user 17: train size is 32, test size is 80
user 18: train size is 48, test size is 80
user 19: train size is 72, test size is 80
user 20: train size is 72, test size is 80
user 21: train size is 56, test size is 80
user 22: train size is 48, te

## Perfs with a linear SVC

### Local models

In [30]:
class_svc = SVC(kernel="linear")
allX_train = np.empty((0, 63))
allY_train = pd.Series()

for dataset in datasets:
    try: 
        # train
        class_svc.fit(dataset.X_train, dataset.Y_train)
        train_acc = class_svc.score(dataset.X_train, dataset.Y_train)
        # test
        test_acc = class_svc.score(dataset.X_test, dataset.Y_test)
        print("user {}, train:{}; test: {}".format(dataset.user,  train_acc, test_acc))
        # gather the train datasets for a global model
        allX_train = np.concatenate((allX_train, dataset.X_train))
        allY_train = allY_train.append(dataset.Y_train)       
    except:
        print('user {}: problem'.format(dataset.user) )

user 1, train:1.0; test: 0.9125
user 2, train:1.0; test: 0.875
user 4, train:1.0; test: 0.8625
user 5, train:1.0; test: 0.8375
user 6, train:1.0; test: 0.65
user 7, train:1.0; test: 0.7875
user 8, train:1.0; test: 0.9375
user 9, train:1.0; test: 0.7875
user 10, train:1.0; test: 0.6625
user 11, train:1.0; test: 0.8875
user 12, train:1.0; test: 0.75
user 13, train:1.0; test: 0.975
user 14, train:1.0; test: 0.9875
user 15, train:1.0; test: 0.8
user 16, train:1.0; test: 0.8375
user 17, train:1.0; test: 0.7625
user 18, train:1.0; test: 0.7625
user 19, train:1.0; test: 0.75
user 20, train:1.0; test: 0.55
user 21, train:1.0; test: 0.7
user 22, train:1.0; test: 0.6375
user 23, train:1.0; test: 0.6125
user 25, train:1.0; test: 1.0
user 26, train:1.0; test: 0.85
user 27, train:1.0; test: 0.75
user 28, train:1.0; test: 0.6125
user 29, train:1.0; test: 0.9875
user 30, train:0.9861111111111112; test: 0.975
user 31, train:1.0; test: 0.5
user 32, train:1.0; test: 0.7875
user 33, train:1.0; test: 0.93

### Global model

In [31]:
class_svc.fit(allX_train, allY_train)
global_train_score = class_svc.score(allX_train, allY_train)
print("train: ", global_train_score)
for dataset in datasets: 
    try:
        global_test_score = class_svc.score(dataset.X_test, dataset.Y_test)
        print("test for user {}: {}".format(dataset.user, global_test_score))
    except:
        print('user {}: problem'.format(dataset.user))

train:  0.771260997067
test for user 1: 0.525
test for user 2: 0.6
test for user 4: 0.5
test for user 5: 0.625
test for user 6: 0.7875
test for user 7: 0.8375
test for user 8: 0.65
test for user 9: 0.7125
test for user 10: 0.3875
test for user 11: 0.7375
test for user 12: 0.7
test for user 13: 0.7625
test for user 14: 0.9375
test for user 15: 0.325
test for user 16: 0.8375
test for user 17: 0.8625
test for user 18: 0.8625
test for user 19: 0.8
test for user 20: 0.75
test for user 21: 0.5625
test for user 22: 0.95
test for user 23: 0.3875
test for user 25: 0.9625
test for user 26: 0.1375
test for user 27: 0.475
test for user 28: 0.375
test for user 29: 0.7
test for user 30: 0.775
test for user 31: 0.8375
test for user 32: 0.8625
test for user 33: 0.85
test for user 34: 0.7375
test for user 35: 0.4
test for user 36: 0.925
test for user 37: 0.925
test for user 38: 0.775
test for user 40: 0.8625
test for user 42: 0.7875
test for user 43: 0.575
test for user 44: 0.975
test for user 45: 0.91

# Save the dataset

In [32]:
import sys
import os
import shutil

In [33]:
dest_dir = "../datasets/Mobi_Generated"
if os.path.exists(dest_dir):
    shutil.rmtree(dest_dir)
os.makedirs(dest_dir)

for dataset in datasets:
    dataset.save(dest_dir)