# Name : Prince Verma
# GitHub UserName: vermaprince17
# USC-ID : 7468967920
# USC Net ID: princeve

In [196]:
import numpy as np
import pandas as pd
import seaborn as sea
from matplotlib import pyplot as plt
import scipy.stats as stats
import operator
from beautifultable import BeautifulTable

import glob

In [197]:
COMMON_PATH = "../data/AReM/"

#number of directories is the number of classes for classification
directories = glob.glob(COMMON_PATH + "/*")
dir_names = [directory.split("/")[-1] for directory in directories]

columns = ["avg_rss12", "var_rss12", "avg_rss13", "var_rss13", "avg_rss23", "var_rss23"]

In [198]:
def read_data(PATH):
    '''
    Given the root folder path containg the dataset (AeRM),
    stores the data in DF's and returns a dict.
    ret: dict(filename: str, data: pd.DataFrame)
    '''
    time_series_data= {}
    TOTAL_PATH = PATH + "/*/*.csv"
    
    for path in glob.glob(TOTAL_PATH):
        
        df = pd.read_csv(path, skiprows=4)
        if df.isna().values.any():
            temp_datas = []
            
            for key,row in df.iterrows():
                temp_data = row.iloc[0]
                temp_data = temp_data.strip().split(" ")
                temp_data = [float(ele) for ele in temp_data]
                temp_datas.append(temp_data)

            new_df = pd.DataFrame(temp_datas, columns= df.columns, dtype= float)
            new_df.drop(new_df.columns[[0]],axis = 1, inplace= True)

        else:
            df.drop(df.columns[[0]],axis = 1, inplace= True)
            new_df = df
        
        temp = path.split("/")[-1]
        dataset_num = ""
        for char in temp:
            if char.isdigit():
                dataset_num += char
                
        name = path.split("/")[-2] + "/" + "dataset_" + dataset_num+".csv"
        time_series_data[name] = new_df
    
    return time_series_data

# 1. Time Series Classification Part 1: Feature Creation/Extraction

An interesting task in machine learning is classification of time series. In this problem, we will classify the activities of humans based on time series obtained by a Wireless Sensor Network.

**(a) Download the AReM data from:** https://archive.ics.uci.edu/ml/datasets/Activity+Recognition+system+based+on+Multisensor+data+fusion+\%28AReM\%29 .The dataset contains 7 folders that represent seven types of activities. In each folder, there are multiple files each of which represents an instant of a human performing an activity. Each file containis 6 time series collected from activities of the same person, which are called avgrss12, varrss12, avgrss13, varrss13,vgrss23, and arrss23. There are 88 instances in the dataset, each of which contains 6 time series and each time series has 480 consecutive values.

# Data Processing

In [199]:
time_series_data = read_data(COMMON_PATH)

for name, df in time_series_data.items():
    print('******************************')
    print("file_name: ", name)
    print("data_shape:", df.shape)
    print("sample_data: ")
    print(df.head())
    print('******************************')
    print()

print("Total no. of dataset instances: ", len(time_series_data))

******************************
file_name:  bending1/dataset_7.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      42.00       0.00      18.50       0.50      12.00       0.00
1      42.00       0.00      18.00       0.00      11.33       0.94
2      42.75       0.43      16.75       1.79      18.25       0.43
3      42.50       0.50      16.75       0.83      19.00       1.22
4      43.00       0.82      16.25       0.83      18.00       0.00
******************************

******************************
file_name:  bending1/dataset_6.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      41.25       1.30      24.00       0.00      32.33       0.94
1      41.00       1.00      22.50       1.12      33.50       0.87
2      44.25       0.83      21.75       0.83      31.50       1.50
3      45.00       0.00      20.00       1.22      30.00       0.00
4      45.25      

**1. (b) Keep datasets 1 and 2 in folders bending1 and bending 2, as well as datasets 1, 2, and 3 in other folders as test data and other datasets as train data.**

In [200]:
def train_test_data(time_series_data, data_type):
    '''
    creates the data according to above question
    returns a DataFrame
    '''
    final_data = {}
    if data_type == 'test':
        for name, df in time_series_data.items():
            dataset = name.split("/")[-1]
            dataset_folder = name.split("/")[0]
            
            dataset = dataset.split("_")[-1]
            dataset_num = int(dataset.split(".")[0])
            
            if dataset_folder == 'bending1' or dataset_folder == 'bending2':
                if dataset_num <= 2:
                    final_data[name] = df                
                else:
                    continue               
            else:
                if dataset_num <= 3:
                    final_data[name] = df
                else:
                    continue
    
    else:
        for name, df in time_series_data.items():
            dataset = name.split("/")[-1]
            dataset_folder = name.split("/")[0]
            
            dataset = dataset.split("_")[-1]
            dataset_num = int(dataset.split(".")[0])
            
            if dataset_folder == 'bending1' or dataset_folder == 'bending2':
                if dataset_num >= 3:
                    final_data[name] = df                
                else:
                    continue               
            else:
                if dataset_num >= 4:
                    final_data[name] = df
                else:
                    continue
        
    return final_data 

In [201]:
test_data = train_test_data(time_series_data, 'test')

for name, df in test_data.items():
    print('******************************')
    print("file_name: ", name)
    print("data_shape:", df.shape)
    print("sample_data: ")
    print(df.head())
    print('******************************')
    print()

print("Total no. of dataset instances: ", len(test_data))

******************************
file_name:  bending1/dataset_1.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      39.25       0.43      22.75       0.43      33.75        1.3
1      39.25       0.43      23.00       0.00      33.00        0.0
2      39.25       0.43      23.25       0.43      33.00        0.0
3      39.50       0.50      23.00       0.71      33.00        0.0
4      39.50       0.50      24.00       0.00      33.00        0.0
******************************

******************************
file_name:  bending1/dataset_2.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      43.67       0.47      24.75       0.43      30.00       0.00
1      43.33       0.47      25.33       0.47      30.00       0.00
2      42.75       0.83      25.25       0.83      30.50       0.50
3      41.75       0.43      20.50       3.35      30.75       0.83
4      42.50      

In [202]:
train_data = train_test_data(time_series_data, 'train')

for name, df in train_data.items():
    print('******************************')
    print("file_name: ", name)
    print("data_shape:", df.shape)
    print("sample_data: ")
    print(df.head())
    print('******************************')
    print()

print("Total no. of dataset instances: ", len(train_data))

******************************
file_name:  bending1/dataset_7.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      42.00       0.00      18.50       0.50      12.00       0.00
1      42.00       0.00      18.00       0.00      11.33       0.94
2      42.75       0.43      16.75       1.79      18.25       0.43
3      42.50       0.50      16.75       0.83      19.00       1.22
4      43.00       0.82      16.25       0.83      18.00       0.00
******************************

******************************
file_name:  bending1/dataset_6.csv
data_shape: (480, 6)
sample_data: 
   avg_rss12  var_rss12  avg_rss13  var_rss13  avg_rss23  var_rss23
0      41.25       1.30      24.00       0.00      32.33       0.94
1      41.00       1.00      22.50       1.12      33.50       0.87
2      44.25       0.83      21.75       0.83      31.50       1.50
3      45.00       0.00      20.00       1.22      30.00       0.00
4      45.25      

**1. (c) Feature Extraction**

Classification of time series usually needs extracting features from them. In this
problem, we focus on time-domain features.

**(i)Research what types of time-domain features are usually used in time series
classification and list them (examples are minimum, maximum, mean, etc).**



# Some of the Time Domain features used in time-series classification are:

1. Mean value of the time series
2. Median value of the time series
3. Standard deviation of the time series
4. Minimum and maximum value of the time series
5. Kurtosis or skewness of the time series
6. Auto correlation
7. Cross correlation
8. Moving average component of the time series
9. Auto regressive component of the time series
10. Number of positive and negative peaks in the time series.

**1. (c) (ii) Extract the time-domain features minimum, maximum, mean, median, standard deviation, first quartile, and third quartile for all of the 6 time series
in each instance. You are free to normalize/standardize features or use them directly.**

In [207]:
dir_names = ['bending1','bending2','cycling','lying','sitting','standing','walking']
dataset = [7,6,15,15,15,15,15] # no of datasets in each folder

parameters = ['Min', 'Max', 'Mean', 'Median', 'Std', 'First_quartile', 'Third_quartile']

stats = {}
stats["file_name"] = []
for col in columns:
    for param in parameters:
        stats[param + "_" + col] = []
        
for i, activity in enumerate(dir_names):
    for dataset_num in range(1, dataset[i]+1):
        
        file_name = activity + "/dataset_" + str(dataset_num) + ".csv"
        df = time_series_data[file_name]
        
        stats["file_name"].append(file_name) 
        for col in df.columns:
            feature = pd.DataFrame(df[col])
                
            stats["Min_"+col].append(np.min(feature)[col])
            stats["Max_"+col].append(np.max(feature)[col])
            stats["Mean_"+col].append(np.mean(feature)[col])
            stats["Median_"+col].append(np.median(feature))
            stats["Std_"+col].append(np.std(feature)[col])
            stats["First_quartile_"+col].append(np.quantile(feature, 0.25))
            stats["Third_quartile_"+col].append(np.quantile(feature, 0.75))        
    
time_domain_stats = pd.DataFrame.from_dict(stats)
time_domain_stats

# Please Scroll left to right since there are 42 + 1 features present.

# to match all the 88 rows outputs 
# run -> 
# pd.set_option('display.max_rows', None); # displays all the 88 rows
# time_domain_stats

Unnamed: 0,file_name,Min_avg_rss12,Max_avg_rss12,Mean_avg_rss12,Median_avg_rss12,Std_avg_rss12,First_quartile_avg_rss12,Third_quartile_avg_rss12,Min_var_rss12,Max_var_rss12,...,Std_avg_rss23,First_quartile_avg_rss23,Third_quartile_avg_rss23,Min_var_rss23,Max_var_rss23,Mean_var_rss23,Median_var_rss23,Std_var_rss23,First_quartile_var_rss23,Third_quartile_var_rss23
0,bending1/dataset_1.csv,37.25,45.00,40.624792,40.50,1.475428,39.25,42.0000,0.0,1.30,...,2.186168,33.0000,36.00,0.00,1.92,0.570583,0.43,0.582308,0.0000,1.3000
1,bending1/dataset_2.csv,38.00,45.67,42.812812,42.50,1.434054,42.00,43.6700,0.0,1.22,...,1.993175,32.0000,34.50,0.00,3.11,0.571083,0.43,0.600383,0.0000,1.3000
2,bending1/dataset_3.csv,35.00,47.40,43.954500,44.33,1.557210,43.00,45.0000,0.0,1.70,...,1.997520,35.3625,36.50,0.00,1.79,0.493292,0.43,0.512971,0.0000,0.9400
3,bending1/dataset_4.csv,33.00,47.75,42.179813,43.50,3.666840,39.15,45.0000,0.0,3.00,...,3.845436,30.4575,36.33,0.00,2.18,0.613521,0.50,0.523771,0.0000,1.0000
4,bending1/dataset_5.csv,33.00,45.75,41.678063,41.75,2.241152,41.33,42.7500,0.0,2.83,...,2.408514,28.4575,31.25,0.00,1.79,0.383292,0.43,0.388759,0.0000,0.5000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,walking/dataset_11.csv,19.50,45.33,33.586875,34.25,4.646088,30.25,37.0000,0.0,14.67,...,3.280561,13.7300,18.25,0.00,8.32,3.259729,3.11,1.638534,2.0500,4.3225
84,walking/dataset_12.csv,19.75,45.50,34.322750,35.25,4.747524,31.00,38.0000,0.0,13.47,...,3.116605,13.5000,17.75,0.00,9.67,3.432562,3.20,1.730921,2.1575,4.5650
85,walking/dataset_13.csv,19.50,46.00,34.546229,35.25,4.837247,31.25,37.8125,0.0,12.47,...,2.820182,14.0000,17.75,0.00,10.00,3.338125,3.08,1.655016,2.1600,4.3350
86,walking/dataset_14.csv,23.50,46.25,34.873229,35.25,4.526997,31.75,38.2500,0.0,14.82,...,3.127813,13.7500,18.00,0.00,9.51,3.424646,3.27,1.689198,2.1700,4.5000


**1. (c) (iii) Estimate the standard deviation of each of the time-domain features you extracted from the data. Then, use Python’s bootstrapped or any other
method to build a 90% bootsrap confidence interval for the standard deviation of each feature**

In [212]:
# Standard deivation of each of the above feature

std = time_domain_stats.std()
output_std_42_feature = pd.DataFrame(std, columns= ["Standard Deviation"])
output_std_42_feature

Unnamed: 0,Standard Deviation
Min_avg_rss12,9.569975
Max_avg_rss12,4.394362
Mean_avg_rss12,5.335718
Median_avg_rss12,5.440054
Std_avg_rss12,1.770306
First_quartile_avg_rss12,6.15359
Third_quartile_avg_rss12,5.138925
Min_var_rss12,0.0
Max_var_rss12,5.062729
Mean_var_rss12,1.574164


**90% bootsrap confidence interval for the standard deviation of each feature**

In [213]:
import bootstrapped.bootstrap as boot
import bootstrapped.stats_functions as boot_stats

In [221]:
temp = []
for feature in time_domain_stats.columns.to_list():
    if feature == 'file_name':
        continue
    
    samples = np.array(time_domain_stats[feature])
    res = boot.bootstrap(samples, stat_func= boot_stats.std, alpha= 0.1)
    temp.append([feature, np.std(samples), res.lower_bound, res.upper_bound])

ans = pd.DataFrame(temp, columns= ['feature_name', 'standard_deviation',
                                   'lower_confidence_interval', 'upper_confidence_interval'])


ans

Unnamed: 0,feature_name,standard_deviation,lower_confidence_interval,upper_confidence_interval
0,Min_avg_rss12,9.515445,8.303564,10.810914
1,Max_avg_rss12,4.369322,3.482179,5.434671
2,Mean_avg_rss12,5.305314,4.760074,5.934425
3,Median_avg_rss12,5.409056,4.853968,6.045031
4,Std_avg_rss12,1.760219,1.587432,1.956099
5,First_quartile_avg_rss12,6.118526,5.633919,6.703416
6,Third_quartile_avg_rss12,5.109643,4.409009,5.926676
7,Min_var_rss12,0.0,0.0,0.0
8,Max_var_rss12,5.033882,4.698205,5.459989
9,Mean_var_rss12,1.565194,1.431396,1.746473


**1. (c) iv. Use your judgement to select the three most important time-domain features**

1. Median - Its a good measure of central tendancy that is not affected by outliers.
2. Mean - Its a better measure of central tendancy when the data is continous and to capture the aggregate value.
3. Standard deviation - Its a measure of dispersion of the dataset.

# 2. ISLR 3.7.4

(a) The results would be similar. As we are increasing the flexibility of the model i.e going from linear to cubic regression and given that we have small no. of samples, the training error for cubic regression would be lesser compared to training error of linear regression (over fitting).

(b) Since the true relationship is linear, linear regression would slightly perform better than cubic regression on test RSS. Since the data points are closely linear, the cubic regression may fit close to linear relation (not wildly cubic).

(c) Clearly cubic regression would outperform linear regression since increasing model flexibility would decrease training error. Since the data is not linearly seperated the results in (a) were almost same, but in this case they would be different.

(d) The answer to this question would depend on how well the data is linearly or non linearly separated. If the data is wildly away from linear relationship, then a cubic regression would perform well on test RSS and vice versa. So there is not enough information to decide. 

# 4. Time Series Classification Part 2: Binary and Multiclass Classification