## Prep

In [1]:
import numpy as np
import pandas as pd
import os
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
processed_folder = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/1_Processed/CSI_Filtered_Amp'
# raw_folder_ss1 = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/0_Raw/2024-09-28/CSI_Packets/session_1'
# raw_folder_ss2 = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/0_Raw/2024-09-28/CSI_Packets/session_2'

# List all files in the folder
processed_list = [f for f in os.listdir(processed_folder) if f.endswith('.csv')]

dfs = []
for file_name in processed_list:
    match = re.search(r"csi-filtered-d2-sess0(\d+)-(\d+)", file_name)
    if match:
      print(file_name)
      session = match.group(1)
      label = match.group(2)
      print("label:", label)
      file_path = os.path.join(processed_folder, file_name)
      df = pd.read_csv(file_path)

      # raw_folder = raw_folder_ss1 if session == '1' else raw_folder_ss2
      # raw_file_name = f'{label}_persons{"_cont" if session == "2" else ""}.csv'
      # raw_file_path = os.path.join(raw_folder, raw_file_name)

      # if os.path.exists(raw_file_path):
      #   rssi_df = pd.read_csv(raw_file_path, usecols=['rssi'])
      #   if not rssi_df.empty:  # Check if rssi_df is not empty
      #     df["rssi"] = rssi_df["rssi"]
      df["label"] = int(label)
      dfs.append(df)
print("Number of dataframes loaded: ", len(dfs))

csi-filtered-d2-sess02-0.csv
label: 0
csi-filtered-d2-sess02-1.csv
label: 1
csi-filtered-d2-sess02-2.csv
label: 2
csi-filtered-d2-sess02-3.csv
label: 3
csi-filtered-d2-sess02-4.csv
label: 4
csi-filtered-d2-sess02-5.csv
label: 5
csi-filtered-d2-sess01-0.csv
label: 0
csi-filtered-d2-sess01-1.csv
label: 1
csi-filtered-d2-sess01-2.csv
label: 2
csi-filtered-d2-sess01-3.csv
label: 3
csi-filtered-d2-sess01-4.csv
label: 4
csi-filtered-d2-sess01-5.csv
label: 5
Number of dataframes loaded:  12


In [13]:
print(dfs[5].shape)

(16396, 53)


In [14]:
print(dfs[5].head(5))

          0         1         2         3         4         5         6  \
0  7.538905  5.629980  8.003213  5.117724  5.662561  4.506746  6.996588   
1  6.322714  5.622496  6.973410  5.937543  4.286260  5.939179  5.991496   
2  5.932204  5.643473  6.440593  6.354934  4.107257  6.781698  5.914232   
3  6.149933  5.783365  6.363875  6.593334  4.782093  7.219822  6.511403   
4  6.758459  6.132624  6.702374  6.876180  5.967302  7.439070  7.529615   

          7          8         9  ...         43         44         45  \
0  4.660907   4.587330  8.202784  ...  11.526423  14.924208  10.744154   
1  4.590781   6.797308  7.537395  ...  11.408004  13.287742  10.897880   
2  5.146325   8.569868  7.699896  ...  12.204009  13.329391  12.069494   
3  6.162476   9.944707  8.515245  ...  13.595951  14.544342  13.947154   
4  7.474170  10.961520  9.808398  ...  15.265338  16.427778  16.219013   

          46         47         48         49         50         51  label  
0  17.383009  17.938799  15

## Feature Extraction

### Segmentation and extraction

In [6]:
def segment_data(data, window_size=200, overlap=50):
    """
    Segments data into overlapping windows.

    Args:
        data (numpy.ndarray): The data to segment (CSI or RSS).
        window_size (int): Size of each window in samples.
        overlap (int): Overlap between consecutive windows in samples.

    Returns:
        list: A list of segmented data windows.
    """

    segmented_data = []
    for i in range(0, data.shape[0] - window_size + 1, window_size - overlap):
        window = data[i:i+window_size] if data.ndim == 1 else data[i:i+window_size, :]
        segmented_data.append(window)
    return segmented_data

In [7]:
def extract_features(csi_bundle):
    """
    Args:
        csi_bundle: numpy array containing CSI amplitude data (shape: [packets, subcarriers])

    Returns:
        A dictionary containing the calculated statistics.
    """
    features = {}
    for i in range(csi_bundle.shape[1] - 1): # Loop through each subcarrier
      features[f'std_subcarrier_{i}'] = np.std(csi_bundle[:, i]) # Standard deviation
      features[f'mean_subcarrier_{i}'] = np.mean(csi_bundle[:, i]) # The average amplitude value
      features[f'max_subcarrier_{i}'] = np.max(csi_bundle[:, i])
      features[f'min_subcarrier_{i}'] = np.min(csi_bundle[:, i])
      features[f'qtu_subcarrier_{i}'] = np.percentile(csi_bundle[:, i], 75) # Upper quartile
      features[f'qtl_subcarrier_{i}'] = np.percentile(csi_bundle[:, i], 25) # Lower quartile
      features[f'iqr_subcarrier_{i}'] = features[f'qtu_subcarrier_{i}'] - features[f'qtl_subcarrier_{i}']

    for i in range(2, csi_bundle.shape[1] - 3):  # Skip the first and last 2 subcarriers
        num_cols = csi_bundle.shape[1]
        start_idx = max(0, i - 2)
        end_idx = min(num_cols, i + 2 + 1)
        adjacent_data = np.delete(csi_bundle[:, start_idx:end_idx], i - start_idx, axis=1)
        # Calculate the amplitude difference for the current subcarrier
        amplitude_difference = np.sum(np.abs(adjacent_data - csi_bundle[:, [i]]), axis=1)
        features[f'adj_subcarrier_{i}'] = np.mean(amplitude_difference)

    euclidean_distances = []
    for i in range(1, csi_bundle.shape[0]):  # Loop through packets starting from the second
      distances = np.linalg.norm(csi_bundle[i, :] - csi_bundle[i-1, :], axis=0)
      euclidean_distances.append(distances)

    features['euc'] = np.median(euclidean_distances)
    # features['rss_std']= np.std(csi_bundle[:, -1])
    features = pd.DataFrame([features])
    return features

In [15]:
# Segementation and Extraction begin here
features_list = []
features_labels = []
for df in dfs:
  print(f"Extracting features for CSI data of {df['label'][0]} people")
  temp_df = df.copy()
  temp_df = temp_df.drop("label", axis=1)
  temp_np_array = np.array(temp_df) # Turn into numpy array for easier matrix calculation
  segments = segment_data(temp_np_array)
  print(f"There are {len(segments)} segments to extract")
  for segment in segments:
    features = extract_features(segment)
    print(features['adj_subcarrier_2'][0])
    features_list.append(features)
    features_labels.append(df['label'][0])

Extracting features for CSI data of 0 people
There are 116 segments to extract
5.810733520000001
5.8904377724999994
6.071184969999999
5.90533068
6.6859391925
6.8063892975
5.8001506874999995
6.505625677499999
6.64750819
6.188444665
5.927621232500001
7.01981809
6.287562482500001
5.2444663695000004
6.211160459499999
5.499144462499999
5.0374310875
6.5323574725
5.997774645000001
4.637327157500001
5.96956724
5.2541069225
5.785918785
5.41841472
6.548309710000001
5.8270043225
7.045416697499999
7.51592623
5.868491879999999
5.513634821999999
5.652116487000001
5.15356476
7.270665222500001
7.2863017925
8.0653102325
10.88536474
11.613331429999999
10.008881965
6.55442998
9.11862284
12.672119695
11.388390545
7.996590779999999
9.507355445
11.680828225
12.392404594999999
9.162217705
7.021834144999999
5.826685345
6.018153099999999
6.41301092
6.756283155
6.6156177750000005
5.359506795
5.3322274274999995
7.136278842500001
7.5934312675
7.4870582825
7.032200409999999
10.103194905
10.39348977
6.2750423799999

In [16]:
print(len(features_list))

1354


In [17]:
test_df = pd.DataFrame(features_list[0])
test_df.head(5)

Unnamed: 0,std_subcarrier_0,mean_subcarrier_0,max_subcarrier_0,min_subcarrier_0,qtu_subcarrier_0,qtl_subcarrier_0,iqr_subcarrier_0,std_subcarrier_1,mean_subcarrier_1,max_subcarrier_1,...,adj_subcarrier_40,adj_subcarrier_41,adj_subcarrier_42,adj_subcarrier_43,adj_subcarrier_44,adj_subcarrier_45,adj_subcarrier_46,adj_subcarrier_47,adj_subcarrier_48,euc
0,2.103277,23.369059,28.606596,17.618135,24.874141,21.991571,2.88257,3.157244,23.125368,37.042778,...,5.732918,5.23889,6.658996,4.438769,4.426577,5.092532,5.697827,5.102446,4.151651,4.743274


### Export features

In [18]:
label_groups = {}

# Group features by label
for features, label in zip(features_list, features_labels):
    if label not in label_groups:
        label_groups[label] = []
    label_groups[label].append(features)

# Concatenate and export DataFrames for each label
for label, dfs in label_groups.items():
    concatenated_df = pd.concat(dfs, ignore_index=True)
    name = f"csi-features-final-d2-{label}.csv"
    concatenated_df.to_csv(f'/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/1_Processed/CSI_Features/2-seconds-updated/{name}', index=False)