# Load dataset

In [1]:
# Download & unzip the dataset, unless it's already present.
# Only the standard library is used, so nothing has to be installed at run time.
import shutil
import urllib.request
from pathlib import Path
from zipfile import ZipFile

url = 'https://www.vs.inf.ethz.ch/edu/HS2019/SE/xXjxkLul5TDrlSbniZFaIUu1gKUjZ2qj/example_data.zip'

data_dir = Path("./example_data")
archive_path = Path("./example_data.zip")

if not data_dir.is_dir():
    if not archive_path.is_file():
        # Download under a temporary name and rename only after the transfer
        # finished, so an interrupted download is never mistaken for a
        # complete archive on the next run.
        partial_path = archive_path.with_name(archive_path.name + '.part')
        with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as target:
            shutil.copyfileobj(response, target)
        partial_path.replace(archive_path)

    # The context manager closes the archive's file handle after extraction.
    # The archive is extracted into the current working directory.
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall()
print('Data ready')

Data ready


In [2]:
# Read data into Python

# Pandas is a popular library for managing and analysing data in Python.
# Google's course on Machine Learning has a short intro:
# See https://colab.research.google.com/notebooks/mlcc/intro_to_pandas.ipynb
import pandas as pd

directory = 'example_data'


def _read_table(file_suffix):
    """Load one CSV file from the data directory, using its first column as the index."""
    return pd.read_csv(directory + file_suffix, index_col=0)


# One data frame per CSV file of the data set.
legs_df = _read_table('/legs.csv')
acc_readings = _read_table('/acc_readings.csv')
locations_scans = _read_table('/locations_scans.csv')
bluetooth_scans = _read_table('/bluetooth_scans.csv')
wifi_scans = _read_table('/wifi_scans.csv')
gyro_readings = _read_table('/gyro_readings.csv')


# Quick sanity check: show the first row of the first three tables.
for _table in (legs_df, acc_readings, locations_scans):
    print(_table.head(1))

  mask |= (ar1 == a)


            end    id   mode         start  \
0  1.558109e+12  88.0  605.0  1.558108e+12   

                                   user  
0  408483c0-4287-456d-a35c-b14e28ce10ec  
   acc  leg       reading                                  user        x  \
0    3   88  1.558108e+12  408483c0-4287-456d-a35c-b14e28ce10ec  0.00639   

          y         z  
0  0.030259  0.008629  
   acc         alt  bearing        lat  leg       lng       reading  speed  \
0  3.0  492.416046        0  47.324327   88  8.530802  1.558108e+12    0.0   

                                   user  
0  408483c0-4287-456d-a35c-b14e28ce10ec  





# Calculate windows and features

In [None]:
# Split data into windows for further processing

# NumPy is a popular library for fast numerical computations with large arrays and matrices.
# At https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.00-Introduction-to-NumPy.ipynb
# you can find an introduction to NumPy (Click on "Understanding Data Types in Python >" for the next page)
import numpy as np

# window size in ms
window_size = 10000


def _select_leg(readings, leg):
    """Return the rows of `readings` recorded for the given leg (same leg id and same user)."""
    return readings.loc[
        (readings['leg'] == leg['id']) &
        (readings['user'] == leg['user'])
    ]


def _select_window(readings, time_column, left, right):
    """Return the rows whose `time_column` value lies in the window (left, right]."""
    timestamps = readings[time_column]
    return readings.loc[(timestamps > left) & (timestamps <= right)]


def _axis_features(readings, prefix):
    """Return the mean vector magnitude and the per-axis means of the x/y/z columns.

    The keys are `<prefix>_mean`, `<prefix>_x`, `<prefix>_y` and `<prefix>_z`.
    For a window without readings every value is NaN.
    """
    magnitude = pd.Series(np.linalg.norm(readings[['x', 'y', 'z']].values, axis=1))
    return {
        prefix + '_mean': magnitude.mean(),
        prefix + '_x': readings['x'].mean(),
        prefix + '_y': readings['y'].mean(),
        prefix + '_z': readings['z'].mean(),
    }


# One dict per window. The data frame is built once after the loop, because
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
feature_rows = []

# iterate over all legs
for _, row in legs_df.iterrows():
    print('windowing', row['user'], row['id'])

    # The leg/user filter does not depend on the window, so apply it once per
    # leg instead of scanning the full tables again for every window.
    leg_acc = _select_leg(acc_readings, row)
    leg_locations = _select_leg(locations_scans, row)
    leg_bluetooth = _select_leg(bluetooth_scans, row)
    leg_gyro = _select_leg(gyro_readings, row)

    boundary_left = row['start']
    boundary_right = boundary_left + window_size
    # Device count of the most recent window that contained Bluetooth scans.
    previous_bluetooth = 0

    # Only complete windows that end strictly before the end of the leg are used.
    while boundary_right < row['end']:
        features = {}

        # Accelerometer: mean magnitude and per-axis means
        features.update(_axis_features(
            _select_window(leg_acc, 'reading', boundary_left, boundary_right), 'acc'))

        # Location scans: speed statistics
        speeds = _select_window(leg_locations, 'reading', boundary_left, boundary_right)['speed']
        features['max_speed'] = speeds.max()
        features['min_speed'] = speeds.min()
        features['mean_speed'] = speeds.mean()

        # Bluetooth: number of unique devices seen in the window. A window
        # without any scan re-uses the count of the previous window of this leg.
        macs = _select_window(leg_bluetooth, 'reading_time', boundary_left, boundary_right)['mac']
        if len(macs) > 0:
            previous_bluetooth = macs.nunique()
        features['bluetooth_numbers'] = previous_bluetooth

        # WiFi: not used as a feature. An earlier, disabled experiment counted
        # the unique wifi_scans['bssid'] values per window (on 'reading_time')
        # with the same carry-forward rule as for Bluetooth.

        # Gyroscope: mean magnitude and per-axis means
        features.update(_axis_features(
            _select_window(leg_gyro, 'reading', boundary_left, boundary_right), 'gyro'))

        # save user, leg and transportation mode (target variable)
        features['user'] = row['user']
        features['leg'] = row['id']
        features['mode'] = row['mode']

        feature_rows.append(features)

        # set new boundaries
        boundary_left = boundary_right
        boundary_right = boundary_left + window_size

# dataframe containing the features, one row per window.
# Columns: acc_* and gyro_* (mean magnitude, per-axis means), max/min/mean speed,
# bluetooth_numbers, user, leg, mode (target variable)
features_df = pd.DataFrame(feature_rows)

print('Feature calculation done')
print(features_df.head())

features_df.to_csv("window_test.csv")

windowing 408483c0-4287-456d-a35c-b14e28ce10ec 88.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 89.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 90.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 91.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 92.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 93.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 94.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 95.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 96.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 97.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 98.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 99.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 100.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 101.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 102.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 103.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 104.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 105.0
windowing 408483c0-4287-456d-a35c-b14e28ce10ec 106.0
windo

# Shuffle split data into training and test set

In [None]:
# Randomly split data into training and test set

# Scikit-Learn is a popular library for common machine learning algorithms.
# Here, it is used only for its support for efficiently splitting a dataset.
# For more information, see https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.02-Introducing-Scikit-Learn.ipynb
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit

# Target variable: the transportation mode of each window.
y = np.array(features_df['mode'])

# Feature matrix: every column except the identifiers and the target.
features_df2 = features_df.drop(['user', 'leg', 'mode'], axis=1)
# Windows without readings produce NaN features; nan_to_num replaces NaN with 0.
X = np.nan_to_num(np.array(features_df2))

# to_excel does not create missing folders, so make sure the output folder exists.
Path("Results").mkdir(parents=True, exist_ok=True)
features_df2.to_excel("Results/testn.xlsx")

# Single stratified split: 70 % training, 30 % test, fixed seed.
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

train_indices, test_indices = next(shuffle_split.split(X=X, y=y))

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

print('Shape of X_train', X_train.shape)
print('Shape of y_train', y_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_test', y_test.shape)

# Train XGBoost on training set

In [None]:
# Train gradient-boosted decision trees

# XGBoost implements the "Extreme Gradient Boosting" algorithm.
import xgboost as xgb

# Classifier settings, collected in one place; random_state fixes the seed.
xgb_params = dict(n_jobs=-1, objective='multi:softprob', random_state=42)
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# Fit on the training part of the split only.
xgb_classifier.fit(X=X_train, y=y_train)

print('Training done')

# Make predictions on the test set

In [None]:
# Predict the transportation mode of every window in the test set.
y_pred = xgb_classifier.predict(X=X_test)

# Sanity check: there should be one prediction per test window.
print('Shape of y_pred', y_pred.shape)

# Print classification report and confusion matrix

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the predictions with the true labels, 4 decimal places.
report_text = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(report_text)

# Confusion matrix: rows are the actual modes, columns the predicted modes.
confMatrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confMatrix)

# Train model on whole set and export it

In [None]:
# Refit the classifier on the complete data set (training and test windows).
# NOTE: this replaces the model that was evaluated above, so re-running the
# evaluation cells afterwards would score a model that has seen the test set.
xgb_classifier.fit(X=X, y=y)
print('xgboost.model')

# Write the fitted model to disk.
xgb_classifier.save_model('xgboost_model_test')
print('model exported')

In [None]:
from pathlib import Path

import pandas as pd
import openpyxl  # Excel writer engine used by DataFrame.to_excel

# Transportation mode whose mean acceleration magnitude is reported.
target_mode = 2.0

# NOTE(review): the windowing cell writes "window_test.csv" while this cell
# reads "window.csv" — confirm that this is the intended input file.
df = pd.read_csv('window.csv', index_col=False)

# to_excel does not create missing folders, so make sure the output folder exists.
Path("Results").mkdir(parents=True, exist_ok=True)
df.to_excel("Results/acc_mean.xlsx")

print(df.head())

# Keep only the windows of the target mode. The selections have to be assigned:
# .loc[...] and .dropna() return new frames and leave the original untouched.
# 'mode' is compared as a number; comparing it with the string '2.0' never matches
# a numeric column.
is_target_mode = df['mode'] == target_mode
df_acc_mean = df.loc[is_target_mode, ['acc_mean', 'mode']].dropna(axis=0)

# Mean acceleration magnitude over all windows of the target mode
# (NaN if the file contains no window of that mode).
print(df_acc_mean['acc_mean'].mean())
