In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import bootstrap
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats

# (a) Download the AReM data from: https://archive.ics.uci.edu/ml/datasets/ Activity+Recognition+system+based+on+Multisensor+data+fusion+\%28AReM\ %29 . The dataset contains 7 folders that represent seven types of activities. In each folder, there are multiple files each of which represents an instant of a human performing an activity.1 Each file containis 6 time series collected from activitiesof the same person, which are called avg rss12, var rss12, avg rss13, var rss13, vg rss23, and ar rss23. There are 88 instances in the dataset, each of which con- tains 6 time series and each time series has 480 consecutive values.


> The corresponding data is downloaded and placed in the '../data' Directory 

# (b) Keep datasets 1 and 2 in folders bending1 and bending 2, as well as datasets 1, 2, and 3 in other folders as test data and other datasets as train data.

In [2]:
# folders = ['bending1', 'bending2', 'cycling', 'lying', 'sitting', 'standing', 'walking']
folders = {
    'bending1': 7,
    'bending2': 6,
    'cycling': 15,
    'lying' : 15,
    'sitting': 15,
    'standing': 15,
    'walking': 15  
}
testing_paths = []
training_paths = []
ordered_paths = []
for activity, _max in folders.items():
    for i in range(1, _max + 1):
        ordered_paths.append((f'../data/{activity}/dataset{i}.csv', activity))
        if i <= 2:
            testing_paths.append(f'../data/{activity}/dataset{i}.csv')    
        elif i == 3 and activity not in ['bending1', 'bending2']:
            testing_paths.append(f'../data/{activity}/dataset{i}.csv')
        else:
            training_paths.append(f'../data/{activity}/dataset{i}.csv')
            
columns = ['avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
testing_dfs = [pd.read_csv(path, skiprows = 4).iloc[:, 1:] for path in testing_paths]
training_dfs = [pd.read_csv(path, skiprows = 4).iloc[:, 1:] for path in training_paths]
ordered_dfs = [pd.read_csv(path, skiprows = 4).iloc[:, 1:] for path, activity in ordered_paths]
for i in range(len(ordered_dfs)):
    activity = ordered_paths[i][1]
    ordered_dfs[i]['Label'] = activity
print(f"Number of instances {len(testing_dfs) + len(training_dfs)}")
print(f"Number of testing instances {len(testing_dfs)}")
print(f"Number of training instances {len(training_dfs)}")

Number of instances 88
Number of testing instances 19
Number of training instances 69


# (c) Feature Extraction Classification of time series usually needs extracting features from them. In this problem, we focus on time-domain features.

## i. Research what types of time-domain features are usually used in time series classification and list them (examples are minimum, maximum, mean, etc).

> Mean,
> Median, 
> Mode, 
> Standard Deviation,
> Variance,
> Covariance,
> Zero Cross Rate,
> Maximum value,
> Minimum value,
> Root Mean Square, 
> Distance feature = sqrt(sum of all the aforementioned metrics squared)

## ii. Extract the time-domain features minimum, maximum, mean, median, stan- dard deviation, first quartile, and third quartile for all of the 6 time series in each instance. You are free to normalize/standardize features or use them directly.2
    

In [3]:
cols = ["Instance", "Min(1)", "Max(1)", "Mean(1)", "Median(1)", "STD(1)", "Q1(1)", "Q3(1)",
           "Min(2)", "Max(2)", "Mean(2)", "Median(2)", "STD(2)", "Q1(2)", "Q3(2)",
           "Min(3)", "Max(3)", "Mean(3)", "Median(3)", "STD(3)", "Q1(3)", "Q3(3)",
           "Min(4)", "Max(4)", "Mean(4)", "Median(4)", "STD(4)", "Q1(4)", "Q3(4)",
           "Min(5)", "Max(5)", "Mean(5)", "Median(5)", "STD(5)", "Q1(5)", "Q3(5)",
           "Min(6)", "Max(6)", "Mean(6)", "Median(6)", "STD(6)", "Q1(6)", "Q3(6)",
           "Label"]
rows = []
for i, df in enumerate(ordered_dfs):
    temp = [i + 1]
    for col in df.columns[:-1]:
        temp = temp + [df[col].astype(float).min(), df[col].astype(float).max(), df[col].astype(float).mean(), df[col].astype(float).median(), df[col].astype(float).std(), df[col].astype(float).quantile(0.25), df[col].astype(float).quantile(0.75)]
    temp.append(df.iloc[2,-1])
    rows.append(temp)
time_domain_df = pd.DataFrame(rows,columns = cols)
time_domain_df.dropna(inplace=True)

In [4]:
time_domain_df.head(10)

Unnamed: 0,Instance,Min(1),Max(1),Mean(1),Median(1),STD(1),Q1(1),Q3(1),Min(2),Max(2),...,Q1(5),Q3(5),Min(6),Max(6),Mean(6),Median(6),STD(6),Q1(6),Q3(6),Label
0,1,37.25,45.0,40.624792,40.5,1.476967,39.25,42.0,0.0,1.3,...,33.0,36.0,0.0,1.92,0.570583,0.43,0.582915,0.0,1.3,bending1
1,2,38.0,45.67,42.812812,42.5,1.43555,42.0,43.67,0.0,1.22,...,32.0,34.5,0.0,3.11,0.571083,0.43,0.60101,0.0,1.3,bending1
2,3,35.0,47.4,43.9545,44.33,1.558835,43.0,45.0,0.0,1.7,...,35.3625,36.5,0.0,1.79,0.493292,0.43,0.513506,0.0,0.94,bending1
3,4,33.0,47.75,42.179813,43.5,3.670666,39.15,45.0,0.0,3.0,...,30.4575,36.33,0.0,2.18,0.613521,0.5,0.524317,0.0,1.0,bending1
4,5,33.0,45.75,41.678063,41.75,2.24349,41.33,42.75,0.0,2.83,...,28.4575,31.25,0.0,1.79,0.383292,0.43,0.389164,0.0,0.5,bending1
5,6,37.0,48.0,43.454958,43.25,1.386098,42.5,45.0,0.0,1.58,...,22.25,24.0,0.0,5.26,0.679646,0.5,0.622534,0.43,0.87,bending1
6,7,36.25,48.0,43.969125,44.5,1.618364,43.31,44.67,0.0,1.5,...,20.5,23.75,0.0,2.96,0.555313,0.49,0.487826,0.0,0.83,bending1
7,8,12.75,51.0,24.562958,24.25,3.737514,23.1875,26.5,0.0,6.87,...,20.5,27.0,0.0,4.97,0.700188,0.5,0.69372,0.43,0.87,bending2
8,9,0.0,42.75,27.464604,28.0,3.583582,25.5,30.0,0.0,7.76,...,15.0,20.75,0.0,6.76,1.122125,0.83,1.012342,0.47,1.3,bending2
9,10,21.0,50.0,32.586208,33.0,6.238143,26.1875,34.5,0.0,9.9,...,17.67,23.5,0.0,13.61,1.162042,0.83,1.33298,0.47,1.3,bending2


In [5]:
time_domain_df.tail(10)

Unnamed: 0,Instance,Min(1),Max(1),Mean(1),Median(1),STD(1),Q1(1),Q3(1),Min(2),Max(2),...,Q1(5),Q3(5),Min(6),Max(6),Mean(6),Median(6),STD(6),Q1(6),Q3(6),Label
78,79,21.5,51.0,34.935813,35.5,4.645944,32.0,38.0625,0.0,12.21,...,14.2375,18.25,0.0,10.21,3.280021,3.015,1.700918,2.12,4.5,walking
79,80,18.33,47.67,34.333042,34.75,4.94877,31.25,38.0,0.0,12.48,...,13.75,18.0,0.0,8.01,3.261583,2.98,1.61729,2.05,4.32,walking
80,81,18.33,45.75,34.599875,35.125,4.73179,31.5,38.0,0.0,15.37,...,14.0,18.25,0.0,8.86,3.289542,3.015,1.68017,2.12,4.26,walking
81,82,15.5,43.67,34.225875,34.75,4.441798,31.25,37.25,0.0,17.24,...,14.33,18.25,0.0,9.42,3.479542,3.27,1.761146,2.24,4.5375,walking
82,83,21.5,51.25,34.253521,35.0,4.940741,30.9375,37.75,0.0,13.55,...,13.75,18.0,0.0,8.32,3.50075,3.285,1.692378,2.18,4.5575,walking
83,84,19.5,45.33,33.586875,34.25,4.650935,30.25,37.0,0.0,14.67,...,13.73,18.25,0.0,8.32,3.259729,3.11,1.640243,2.05,4.3225,walking
84,85,19.75,45.5,34.32275,35.25,4.752477,31.0,38.0,0.0,13.47,...,13.5,17.75,0.0,9.67,3.432562,3.2,1.732727,2.1575,4.565,walking
85,86,19.5,46.0,34.546229,35.25,4.842294,31.25,37.8125,0.0,12.47,...,14.0,17.75,0.0,10.0,3.338125,3.08,1.656742,2.16,4.335,walking
86,87,23.5,46.25,34.873229,35.25,4.53172,31.75,38.25,0.0,14.82,...,13.75,18.0,0.0,9.51,3.424646,3.27,1.69096,2.17,4.5,walking
87,88,19.25,44.0,34.473188,35.0,4.796705,31.25,38.0,0.0,13.86,...,13.73,17.75,0.43,9.0,3.340458,3.09,1.699114,2.12,4.375,walking


In [6]:
time_domain_df.iloc[10]

Instance            12
Min(1)            19.0
Max(1)            45.5
Mean(1)      30.938104
Median(1)         29.0
STD(1)        7.684146
Q1(1)            26.75
Q3(1)             38.0
Min(2)             0.0
Max(2)             6.4
Mean(2)       0.467167
Median(2)         0.43
STD(2)        0.734444
Q1(2)              0.0
Q3(2)              0.5
Min(3)             0.0
Max(3)           32.75
Mean(3)      14.589833
Median(3)        15.75
STD(3)        7.638935
Q1(3)           9.6875
Q3(3)            20.69
Min(4)             0.0
Max(4)           11.42
Mean(4)       0.777542
Median(4)         0.47
STD(4)        1.014102
Q1(4)              0.0
Q3(4)             1.12
Min(5)            1.67
Max(5)            36.0
Mean(5)      18.389083
Median(5)         17.5
STD(5)        5.845911
Q1(5)             15.0
Q3(5)          20.8125
Min(6)             0.0
Max(6)            6.73
Mean(6)       1.107354
Median(6)         0.83
STD(6)        1.080842
Q1(6)             0.47
Q3(6)              1.3
Label      

## iii. Estimate the standard deviation of each of the time-domain features you extracted from the data. Then, use Python’s bootstrapped or any other method to build a 90% bootsrap confidence interval for the standard deviation of each feature.

In [7]:
boot_cols = ['Column', 'Standard Deviation', '90% Bootstrapped Confidence Intervals [Low, High]']
rows = []
for col in time_domain_df.columns[1:-1]:
    data = (time_domain_df[col].values,)
#     bootstrap_ci = bs.bootstrap(time_domain_df.loc[:,col].values, stat_func=bs_stats.std, alpha= 0.10)
#     print(bootstrap_ci)
    bootstrap_ci = bootstrap(data, np.std, confidence_level=0.9, random_state=1, method='percentile')
    rows.append((col, time_domain_df[col].std(), bootstrap_ci.confidence_interval))

ci_df = pd.DataFrame(rows, columns=boot_cols)
ci_df

Unnamed: 0,Column,Standard Deviation,"90% Bootstrapped Confidence Intervals [Low, High]"
0,Min(1),9.624011,"(8.266658208772547, 10.770272712649609)"
1,Max(1),4.207745,"(3.1044425239434412, 5.092886632759041)"
2,Mean(1),5.276431,"(4.600736312982632, 5.797527564421353)"
3,Median(1),5.386624,"(4.6864253850924635, 5.923912148591641)"
4,STD(1),1.771282,"(1.5559942097631383, 1.9371536424144775)"
5,Q1(1),6.127846,"(5.490872239115681, 6.585646917498014)"
6,Q3(1),5.031028,"(4.173957386580788, 5.723833417473686)"
7,Min(2),0.0,"(0.0, 0.0)"
8,Max(2),5.059656,"(4.600737679163739, 5.36638384937238)"
9,Mean(2),1.577908,"(1.3922665722056116, 1.7005467985081049)"


## iv. Use your judgement to select the three most important time-domain features (one option may be min, mean, and max).

> Mean, First Quartile and Third Quartile are probably the most important time-domain features, however, there can be more. The mean is important because in Time Series Data we usually compute the rolling-average or rolling-mean to study the data. The first and the third quartile can be helpful in determining the outliers that can skew the forecasting process.

# 2. ISLR 3.7.4
# I collect a set of data (n = 100 observations) containing a single predictor and a quantitative response. I then fit a linear regression model to the data, as well as a separate cubic regression, i.e. Y = β0 +β1X +β2X2 +β3X3 +ε.

## (a) Suppose that the true relationship between X and Y is linear, i.e. Y = β0 + β1X + ε. Consider the training residual sum of squares (RSS) for the linear regression, and also the training RSS for the cubic regression. Would we expect one to be lower than the other, would we expect them to be the same, or is there not enough information to tell? Justify your answer.

Cubic Polynomial Regression would have a lower training RSS compared to Linear Regression because the cubic polynomial regression probably makes a tighter fit onto the data (i.e, more flexible model overfits the training data)

## (b) Answer (a) using test rather than training RSS.

Cubic Polynomial Regression would have a higher Training RSS than Linear Regression, since the cubic polynomial would overfit the training data and will not generalise well.

## (c) Suppose that the true relationship between X and Y is not linear, but we don’t know how far it is from linear. Consider the training RSS for the linear regression, and also the training RSS for the cubic regression. Would we expect one to be lower than the other, would we expect them to be the same, or is there not enough information to tell? Justify your answer.

Irrespective of the underlying true behavior, a more flexible cublic polynomial regression model would fit the training data much tighter than a linear regression model and hence will have a lower Train RSS.

## (d) Answer (c) using test rather than training RSS.

The information provided is not sufficient to draw conclusions as to which model (Cubic or Linear) will give a lower Test RSS. This is because the degree of difference in the true relationship is not defined clearly (i.e, "we don't know how far it is from linear"). Linear model could have a lower test RSS if the true relationship is closer to linear than to cubic and vice versa. Hence, we need more information as Bias-Variance trade-off adds another layer of uncertainty as to which model could have a lower test RSS.