## Prep

In [2]:
import numpy as np
import pandas as pd
import os
import re

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
processed_folder = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/1_Processed/CSI_Filtered_Amp'
raw_folder_ss1 = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/0_Raw/2024-09-28/CSI_Packets/session_1'
raw_folder_ss2 = '/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/0_Raw/2024-09-28/CSI_Packets/session_2'

# List all files in the folder
processed_list = [f for f in os.listdir(processed_folder) if f.endswith('.csv')]

dfs = []
for file_name in processed_list:
    match = re.search(r"csi-filtered-d1-sess0(\d+)-(\d+)", file_name)
    if match:
      print(file_name)
      session = match.group(1)
      label = match.group(2)
      print("label:", label)
      file_path = os.path.join(processed_folder, file_name)
      df = pd.read_csv(file_path)

      raw_folder = raw_folder_ss1 if session == '1' else raw_folder_ss2
      raw_file_name = f'{label}_persons{"_cont" if session == "2" else ""}.csv'
      raw_file_path = os.path.join(raw_folder, raw_file_name)

      if os.path.exists(raw_file_path):
        rssi_df = pd.read_csv(raw_file_path, usecols=['rssi'])
        if not rssi_df.empty:  # Check if rssi_df is not empty
          df["rssi"] = rssi_df["rssi"]
      df["label"] = int(label)
      dfs.append(df)
print("Number of dataframes loaded: ", len(dfs))

csi-filtered-d1-sess02-0.csv
label: 0
csi-filtered-d1-sess02-1.csv
label: 1
csi-filtered-d1-sess02-2.csv
label: 2
csi-filtered-d1-sess02-3.csv
label: 3
csi-filtered-d1-sess02-4.csv
label: 4
csi-filtered-d1-sess02-5.csv
label: 5
csi-filtered-d1-sess01-0.csv
label: 0
csi-filtered-d1-sess01-1.csv
label: 1
csi-filtered-d1-sess01-2.csv
label: 2
csi-filtered-d1-sess01-3.csv
label: 3
csi-filtered-d1-sess01-4.csv
label: 4
csi-filtered-d1-sess01-5.csv
label: 5
Number of dataframes loaded:  12


In [4]:
print(dfs[5].shape)

(16413, 54)


In [None]:
print(dfs[5].head(5))

          0         1          2          3          4          5          6  \
0  6.856949  5.269875   7.714059   8.905305   8.597433  10.339726  12.420630   
1  7.911175  7.323215   8.941853  11.844399  10.477811  11.429932  14.011089   
2  8.707656  8.641500  10.204781  13.145867  11.801256  12.291970  14.955239   
3  9.233672  9.336433  11.365571  13.224723  12.621846  12.922443  15.350582   
4  9.476501  9.519715  12.286951  12.495981  12.993661  13.317955  15.294616   

           7          8          9  ...         44         45         46  \
0  11.518278  11.942980  13.156781  ...  21.828870  21.556131  20.271051   
1  13.766368  14.628056  17.159597  ...  25.920977  26.293710  25.953205   
2  14.982780  16.366386  19.455177  ...  28.387842  29.419357  29.412523   
3  15.383103  17.269861  20.353092  ...  29.484460  31.131224  31.017698   
4  15.182922  17.450370  20.162910  ...  29.465820  31.627470  31.137420   

          47         48         49         50         51  rssi

## Feature Extraction

### Segmentation and extraction

In [5]:
def segment_data(data, window_size=200, overlap=50):
    """
    Segments data into overlapping windows.

    Args:
        data (numpy.ndarray): The data to segment (CSI or RSS).
        window_size (int): Size of each window in samples.
        overlap (int): Overlap between consecutive windows in samples.

    Returns:
        list: A list of segmented data windows.
    """

    segmented_data = []
    for i in range(0, data.shape[0] - window_size + 1, window_size - overlap):
        window = data[i:i+window_size] if data.ndim == 1 else data[i:i+window_size, :]
        segmented_data.append(window)
    return segmented_data

In [30]:
def extract_features(csi_bundle):
    """
    Args:
        csi_bundle: numpy array containing CSI amplitude data (shape: [packets, subcarriers])

    Returns:
        A dictionary containing the calculated statistics.
    """
    features = {}
    for i in range(csi_bundle.shape[1] - 1): # Loop through each subcarrier
      features[f'std_subcarrier_{i}'] = np.std(csi_bundle[:, i]) # Standard deviation
      features[f'mean_subcarrier_{i}'] = np.mean(csi_bundle[:, i]) # The average amplitude value
      features[f'max_subcarrier_{i}'] = np.max(csi_bundle[:, i])
      features[f'min_subcarrier_{i}'] = np.min(csi_bundle[:, i])
      features[f'qtu_subcarrier_{i}'] = np.percentile(csi_bundle[:, i], 75) # Upper quartile
      features[f'qtl_subcarrier_{i}'] = np.percentile(csi_bundle[:, i], 25) # Lower quartile
      features[f'iqr_subcarrier_{i}'] = features[f'qtu_subcarrier_{i}'] - features[f'qtl_subcarrier_{i}']

    for i in range(2, csi_bundle.shape[1] - 3):  # Skip the first and last 2 subcarriers
        num_cols = csi_bundle.shape[1]
        start_idx = max(0, i - 2)
        end_idx = min(num_cols, i + 2 + 1)
        adjacent_data = np.delete(csi_bundle[:, start_idx:end_idx], i - start_idx, axis=1)
        # Calculate the amplitude difference for the current subcarrier
        amplitude_difference = np.sum(np.abs(adjacent_data - csi_bundle[:, [i]]), axis=1)
        features[f'adj_subcarrier_{i}'] = np.mean(amplitude_difference)

    euclidean_distances = []
    for i in range(1, csi_bundle.shape[0]):  # Loop through packets starting from the second
      distances = np.linalg.norm(csi_bundle[i, :] - csi_bundle[i-1, :], axis=0)
      euclidean_distances.append(distances)

    features['euc'] = np.median(euclidean_distances)
    features['rss_std']= np.std(csi_bundle[:, -1])
    features = pd.DataFrame([features])
    return features

In [31]:
# Segementation and Extraction begin here
features_list = []
features_labels = []
for df in dfs:
  print(f"Extracting features for CSI data of {df['label'][0]} people")
  temp_df = df.copy()
  temp_df = temp_df.drop("label", axis=1)
  temp_np_array = np.array(temp_df) # Turn into numpy array for easier matrix calculation
  segments = segment_data(temp_np_array)
  print(f"There are {len(segments)} segments to extract")
  for segment in segments:
    features = extract_features(segment)
    print(features['adj_subcarrier_2'][0])
    features_list.append(features)
    features_labels.append(df['label'][0])

Extracting features for CSI data of 0 people
There are 104 segments to extract
5.787664167500001
6.20081515
7.007214715
7.141798575
7.08215691
6.719919115
6.120445087499999
6.851472082499999
7.3111575675
7.4295189725
7.395410660000001
7.3623494125
6.2114283225
6.527422535
7.5055822675
7.671882355
7.497650374999999
7.5170084175
7.347980864999999
7.122320735000001
6.2199517875
6.8317822024999995
8.149262215
7.856209464999999
6.871529115
7.782372515
7.4729342125
6.409748509999999
8.032716664999999
7.019690740000001
6.611447567500001
6.793326865000001
6.5043059625
6.941851657499999
6.8404754075
6.349759527500001
6.8401710475
7.430814624999999
7.891097562499999
7.058008225
7.084047815
6.797757547500001
6.310544472500001
6.6667940475
6.933672537500001
6.243341575
7.390970117499998
8.178992565000001
6.589697045
6.938260377500001
7.3844570075
6.70624396
7.2743758650000006
6.9192086999999995
6.857613885000001
7.5993942825
7.9362631775
7.5488411975
8.029184815
7.689830509999999
7.8945354425
8.85

In [26]:
print(len(features_list))

1260


In [32]:
test_df = pd.DataFrame(features_list[0])
test_df.head(5)

Unnamed: 0,std_subcarrier_0,mean_subcarrier_0,max_subcarrier_0,min_subcarrier_0,qtu_subcarrier_0,qtl_subcarrier_0,iqr_subcarrier_0,std_subcarrier_1,mean_subcarrier_1,max_subcarrier_1,...,adj_subcarrier_42,adj_subcarrier_43,adj_subcarrier_44,adj_subcarrier_45,adj_subcarrier_46,adj_subcarrier_47,adj_subcarrier_48,adj_subcarrier_49,euc,rss_std
0,2.458781,18.741377,23.436087,12.873449,20.788364,16.796083,3.992281,2.175019,18.433488,22.899534,...,7.725777,4.387893,4.157481,2.961773,3.508334,3.186696,3.576259,3.650341,4.070442,1.086784


### Export features

In [33]:
label_groups = {}

# Group features by label
for features, label in zip(features_list, features_labels):
    if label not in label_groups:
        label_groups[label] = []
    label_groups[label].append(features)

# Concatenate and export DataFrames for each label
for label, dfs in label_groups.items():
    concatenated_df = pd.concat(dfs, ignore_index=True)
    name = f"csi-features-full-d1-{label}.csv"
    concatenated_df.to_csv(f'/content/drive/MyDrive/UIT/UIT_Graduation_Thesis/Dataset/1_Processed/CSI_Features/2-seconds-updated/{name}', index=False)