# Load dataset

In [1]:
# Download & Unzip the dataset, unless it's already present
from pathlib import Path

url = 'https://www.vs.inf.ethz.ch/edu/HS2019/SE/xXjxkLul5TDrlSbniZFaIUu1gKUjZ2qj/example_data.zip'

if not Path("./example_data").is_dir():
  if not Path("./example_data.zip").is_file():
    !pip install wget
    import wget
    wget.download(url,'./example_data.zip')
  from zipfile import ZipFile
  ZipFile('example_data.zip','r').extractall()
print('Data ready')

Data ready


In [3]:
import pandas as pd

directory = 'example_data'

legs_df = pd.read_csv(directory + '/legs.csv', index_col=0)
acc_readings = pd.read_csv(directory + '/acc_readings.csv', index_col=0)
locations_scans = pd.read_csv(directory + '/locations_scans.csv', index_col=0)

bluetooth_scans = pd.read_csv(directory + '/bluetooth_scans.csv', index_col=0)
gyro_readings = pd.read_csv(directory + '/gyro_readings.csv', index_col=0)
magn_readings = pd.read_csv(directory + '/magn_readings.csv', index_col=0)
wifi_scans = pd.read_csv(directory + '/wifi_scans.csv', index_col=0)

print('LEGS:')
print(legs_df.head(1))

print('ACC:')
print(acc_readings.head(1))
      
print('LOC:')
print(locations_scans.head(1))
      
print('BT:')
print(bluetooth_scans.head(1))
      
print('GYRO:')
print(gyro_readings.head(1))
      
print('MAGN:')
print(magn_readings.head(1))
      
print('WIFI:')
print(wifi_scans.head(1))


  mask |= (ar1 == a)


LEGS:
            end    id   mode         start  \
0  1.558109e+12  88.0  605.0  1.558108e+12   

                                   user  
0  408483c0-4287-456d-a35c-b14e28ce10ec  
ACC:
   acc  leg       reading                                  user        x  \
0    3   88  1.558108e+12  408483c0-4287-456d-a35c-b14e28ce10ec  0.00639   

          y         z  
0  0.030259  0.008629  
LOC:
   acc         alt  bearing        lat  leg       lng       reading  speed  \
0  3.0  492.416046        0  47.324327   88  8.530802  1.558108e+12    0.0   

                                   user  
0  408483c0-4287-456d-a35c-b14e28ce10ec  
BT:
   bond  leg                                           mac  major  minor  \
0    10   88  i9+Jk4aeNDTI5bh92RJ5qNJ3TpTWQfGkcNmElIRuBb0=   7936   7936   

   reading_time  scan  signal  type                                  user  
0  1.558108e+12  1987     -92     2  408483c0-4287-456d-a35c-b14e28ce10ec  
GYRO:
   acc  leg       reading                         

# Calculate windows and features

In [8]:
#Run this cell to window the dataset. Otherwise jump to the following cell to load windowed data.

import numpy as np

#window size in ms
window_size = 5000

# dataframe containing the features.
# Columns: mean acceleration, max speed, transportation mode (target variable)
features_df = pd.DataFrame()

# iterate over all legs
for index, row in legs_df.iterrows():
    print('windowing', row['user'], row['id'])
    boundary_left = row['start']
    boundary_right = boundary_left + window_size
    
    while boundary_right < row['end']:
        features = {}
        
        # get accelerometer readings within window
        acc_readings_in_window = acc_readings.loc[
                (acc_readings['leg'] == row['id']) &
                (acc_readings['user'] == row['user']) &
                (acc_readings['reading'] > boundary_left) &
                (acc_readings['reading'] <= boundary_right)
            ].copy()
        
        # calculate magnitude
        acc_readings_in_window['magnitude'] = np.linalg.norm(acc_readings_in_window[['x','y','z']].values,axis=1)
        # save mean magnitude as feature
        features['acc_mean'] = acc_readings_in_window['magnitude'].mean()
        features['acc_max'] = acc_readings_in_window['magnitude'].max()
        features['acc_min'] = acc_readings_in_window['magnitude'].min()
        
        
        # get gyrometer readings within window
        gyro_readings_in_window = gyro_readings.loc[
                (gyro_readings['leg'] == row['id']) &
                (gyro_readings['user'] == row['user']) &
                (gyro_readings['reading'] > boundary_left) &
                (gyro_readings['reading'] <= boundary_right)
            ].copy()
        
        # calculate magnitude gyro 
        gyro_readings_in_window['magnitude'] = np.linalg.norm(gyro_readings_in_window[['x','y','z']].values,axis=1)
        # save mean magnitude as feature
        features['gyro_mean'] = gyro_readings_in_window['magnitude'].mean()
        features['gyro_max'] = gyro_readings_in_window['magnitude'].max()
        features['gyro_min'] = gyro_readings_in_window['magnitude'].min()
        
        
        # get location scans within window
        locations_scans_in_window = locations_scans.loc[
                (acc_readings['leg'] == row['id']) &
                (acc_readings['user'] == row['user']) &
                (acc_readings['reading'] > boundary_left) &
                (acc_readings['reading'] <= boundary_right)
            ].copy()
        
        # save max speed as feature
        features['max_speed'] = locations_scans_in_window['speed'].max()
        features['mean_speed'] = locations_scans_in_window['speed'].mean()
        features['min_speed'] = locations_scans_in_window['speed'].min()
        
        
        # save user, leg and transportation mode
        features['user'] = row['user']
        features['leg'] = row['id']
        features['mode'] = row['mode']
        
        # save features in features data frame
        features_df = features_df.append(features, ignore_index=True)
        
        # set new boundaries
        boundary_left = boundary_right
        boundary_right = boundary_left + window_size
        
print('Feature calculation done')
print(features_df.head())

features_df.to_pickle("./feature_df.pkl")

windowing 408483c0-4287-456d-a35c-b14e28ce10ec 88.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 89.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 90.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 91.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 92.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 93.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 94.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 95.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 96.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 97.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 98.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 99.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 100.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 101.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 102.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 103.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 104.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 105.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 106.0
windo

windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 35.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 36.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 37.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 47.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 51.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 52.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 53.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 54.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 55.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 56.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 60.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 61.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 62.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 64.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 65.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 66.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 67.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 68.0
windowing fa7ab975-bd41-4c5a-82bb-1c5dd89c5a9f 69.0
windowing fa

In [6]:
#Run this of you want to load windowed dataset

import pandas as pd
import numpy as np

features_df = pd.read_pickle("./feature_df.pkl")
print(features_df.head())

     acc_max  acc_mean   acc_min  gyro_max  gyro_mean  gyro_min   leg  \
0   1.174978  0.198769  0.019773  0.105875   0.024877  0.000749  88.0   
1   2.255905  0.401202  0.045002  0.208818   0.056765  0.006705  88.0   
2  13.606654  2.993433  0.030300  1.038953   0.287379  0.007058  88.0   
3  18.359182  6.740988  0.573055  1.888106   0.696364  0.080618  88.0   
4  12.115353  2.910017  0.227392  0.919032   0.297306  0.021516  88.0   

   max_speed  mean_speed  min_speed   mode  \
0  13.224129    2.605083        0.0  605.0   
1  13.070034    4.980699        0.0  605.0   
2  12.591205    7.787395        0.0  605.0   
3  12.261044    4.702163        0.0  605.0   
4   7.451177    1.834214        0.0  605.0   

                                   user  
0  408483c0-4287-456d-a35c-b14e28ce10ec  
1  408483c0-4287-456d-a35c-b14e28ce10ec  
2  408483c0-4287-456d-a35c-b14e28ce10ec  
3  408483c0-4287-456d-a35c-b14e28ce10ec  
4  408483c0-4287-456d-a35c-b14e28ce10ec  


# Shuffle split

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

y = np.array(features_df['mode'])
X = np.array(features_df.drop(['user', 'leg', 'mode'], axis=1))
X = np.nan_to_num(X)

#groups = np.array(features_df['group'])

shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

train_indices, test_indices = next(shuffle_split.split(X=X, y=y))

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

print('Shape of X_train', X_train.shape)
print('Shape of y_train', y_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_test', y_test.shape)

Shape of X_train (18330, 9)
Shape of y_train (18330,)
Shape of X_test (7856, 9)
Shape of y_test (7856,)


In [8]:
import numpy as np
np.savetxt("X_datawindows.csv", X, delimiter=",")
np.savetxt("Y_datawindows.csv", y, delimiter=",")
print('Saved data windows')

Saved data windows


# Train XGBoost on training set

In [9]:
import xgboost as xgb

xgb_classifier = xgb.XGBClassifier(n_jobs=-1, objective='multi:softprob', random_state=42)

xgb_classifier.fit(X_train, y_train)

print('Training done')

Training done


# Make classifications on testing set

In [10]:
y_pred = xgb_classifier.predict(X_test)

print('Shape of y_pred', y_pred.shape)

Shape of y_pred (7856,)


# Print classification reports

In [11]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, digits=4))

# confusion matrix
confMatrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confMatrix)

              precision    recall  f1-score   support

         2.0     0.7819    0.7839    0.7829      2698
         3.0     0.6939    0.0895    0.1585       380
         4.0     0.2857    0.0085    0.0166       234
         5.0     0.6515    0.4406    0.5257       488
       304.0     0.5450    0.8762    0.6720      1527
       601.0     0.6471    0.5218    0.5777      1286
       602.0     0.8077    0.1628    0.2710       129
       605.0     0.8597    0.9623    0.9081      1114

    accuracy                         0.6960      7856
   macro avg     0.6591    0.4807    0.4891      7856
weighted avg     0.6981    0.6960    0.6681      7856

Predicted  2.0    3.0    4.0    5.0    304.0  601.0  602.0  605.0
Actual                                                           
2.0         2115      1      1      1    392    161      2     25
3.0           35     34      0      7    284     20      0      0
4.0           34      0      2     37    115     41      0      5
5.0           21   

# Train model on whole set and export it

In [12]:
xgb_classifier.fit(X, y)
print('xgboost.model')
xgb_classifier.save_model('xgboost_model')
print('model exported')

xgboost.model
model exported


In [None]:
# Download file for use in Project
from google.colab import files
files.download('xgboost_model')