# Prep

In [1]:
!pip install hampel

Collecting hampel
  Downloading hampel-1.0.2.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m868.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hampel
  Building wheel for hampel (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hampel: filename=hampel-1.0.2-cp310-cp310-linux_x86_64.whl size=209224 sha256=8b364b39ac80b1b4f6989ff0ae51d84cd3d4ee979877a7eb8b199e13f9b8e065
  Stored in directory: /root/.cache/pip/wheels/0e/c3/3c/8a9f55c3de0b09faf919393d4c6f09b11b7421dcaa7243b820
Successfully built hampel
Installing collected packages: hampel
Successfully installed hampel-1.0.2


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from scipy.signal import savgol_filter
from hampel import hampel

# Load and filter data

In [5]:
folder_path = '/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/segments'

# List all files in the folder
file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
dfs = []
labels = []
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)
    if(len(df) < 600): # Skip all df with less than 600 rows
      continue
    match = re.search(r"segment-(\d+)-(\d+)", file_name)
    label = match.group(1) # label them based on the first number in the file name
    #df["label"] = int(label)
    labels.append(label)
    dfs.append(df)
print(len(dfs))
print(len(labels))

159
159


In [6]:
filtered_dfs = []
for df in dfs:
  filtered_data = pd.DataFrame()
  for col in df.columns:
    col_series = df[col]
    # Hampel filter
    hampel_filtered = hampel(col_series, window_size=10)
    # Savitzky-Golay filter
    sg_filtered = savgol_filter(hampel_filtered.filtered_data, window_length=10, polyorder=3)
    filtered_data[col] = sg_filtered
  filtered_dfs.append(filtered_data)

print(len(filtered_dfs))

159


## Feature extraction

In [57]:
def calc_amp_diff(df, adj_num):
    diff = []
    for col in df.columns:
      col_series = df[col].values
      column_differences = []
      for i in range(len(col_series)):
        value = col_series[i]
        if i == 0 or i == 1:
            # First subcarrier
            diffVal = np.abs(value - col_series[i + adj_num])
        elif i == len(col_series) - 1 or i == len(col_series) - 2:
            # Last subcarrier
            diffVal = np.abs(value - col_series[i - adj_num])
        else:
            # Middle subcarriers
            diffVal = np.abs(value - col_series[i - adj_num]) + np.abs(value - col_series[i + adj_num])

        column_differences.append(diffVal)

    # Sum of absolute differences
      total_diff = np.sum(column_differences)
    return total_diff

In [60]:
features_dfs = []
for df in filtered_dfs:
  features = pd.DataFrame()
  minVal = df.min()
  maxVal = df.max()
  stdVal = df.std()
  lqVal = df.quantile(0.25)
  uqVal = df.quantile(0.75)
  avgVal = df.mean()
  iqVal = uqVal - lqVal
  #adjVal = calc_amp_diff(df,2)
  features["min"] = minVal
  features["max"] = maxVal
  features["std"] = stdVal
  features["lq"] = lqVal
  features["uq"] = uqVal
  features["avg"] = avgVal
  features["iq"] = iqVal
  #features["adj"] = adjVal
  features_dfs.append(features)


In [61]:
print(len(features_dfs))
print(features_dfs[0].shape)

print(features_dfs[0].head(20))

159
(52, 7)
         min        max       std         lq         uq        avg        iq
0   0.794435  20.150984  4.026044   8.207010  13.441128  10.644927  5.234118
1   1.913195  20.203171  4.038280   8.350379  13.814869  10.876046  5.464490
2   2.387777  22.292124  4.250665   9.213232  14.895330  11.896222  5.682097
3   1.897739  23.927929  4.397853  10.228765  16.133679  12.869132  5.904914
4   0.791161  22.828176  4.135689  10.534134  15.827909  12.855876  5.293775
5   0.933139  21.085730  3.799204  10.329032  15.400228  12.428349  5.071196
6   2.003213  22.533936  3.764604  11.337510  16.097527  13.433006  4.760017
7   1.277509  21.417206  3.581618  11.587980  16.044465  13.542747  4.456485
8   3.168595  22.537552  3.620057  12.847933  17.441644  14.889524  4.593710
9   3.785301  23.183092  3.601205  13.502414  18.121916  15.636689  4.619502
10  4.296395  24.889286  3.673081  14.769424  19.285284  16.816050  4.515859
11  5.791716  25.440170  3.611933  15.802671  20.266236  17.9085

# Flatten data

In [25]:
print(labels[2])

0


In [62]:
flatten_dfs = []
for index in range(len(features_dfs)):
  df = pd.DataFrame(features_dfs[index].values.flatten()).T
  df["label"] = labels[index]
  flatten_dfs.append(df)

print(len(flatten_dfs))

159


In [63]:
print(flatten_dfs[2])

          0          1         2         3          4          5         6  \
0  6.971779  20.328934  2.760731  11.85622  15.478026  13.608288  3.621806   

          7         8         9  ...       355       356       357        358  \
0  6.966369  20.26018  2.704533  ...  12.48483  2.834635  4.562716  17.514118   

        359        360        361        362      363  label  
0  2.148163  10.698039  13.401989  11.991273  2.70395      0  

[1 rows x 365 columns]


In [33]:
print(flatten_dfs[0]["label"][0] + "t")

0t


## Concat data

In [64]:
dfs_0 = []
dfs_1 = []
dfs_2 = []
dfs_3 = []
dfs_4 = []
dfs_5 = []

for df in flatten_dfs:
  if df["label"][0] == "0":
    dfs_0.append(df)
  elif df["label"][0] == "1":
    dfs_1.append(df)
  elif df["label"][0] == "2":
    dfs_2.append(df)
  elif df["label"][0] == "3":
    dfs_3.append(df)
  elif df["label"][0] == "4":
    dfs_4.append(df)
  elif df["label"][0] == "5":
    dfs_5.append(df)

print(len(dfs_0))

print(len(dfs_1))

print(len(dfs_2))

print(len(dfs_3))

print(len(dfs_4))

print(len(dfs_5))

27
27
28
25
27
25


In [65]:
concatenated_df_0 = pd.concat(dfs_0, axis=0, ignore_index=True)
concatenated_df_1 = pd.concat(dfs_1, axis=0, ignore_index=True)
concatenated_df_2 = pd.concat(dfs_2, axis=0, ignore_index=True)
concatenated_df_3 = pd.concat(dfs_3, axis=0, ignore_index=True)
concatenated_df_4 = pd.concat(dfs_4, axis=0, ignore_index=True)
concatenated_df_5 = pd.concat(dfs_5, axis=0, ignore_index=True)

print(concatenated_df_0.shape)
print(concatenated_df_1.shape)
print(concatenated_df_2.shape)
print(concatenated_df_3.shape)
print(concatenated_df_4.shape)
print(concatenated_df_5.shape)

(27, 365)
(27, 365)
(28, 365)
(25, 365)
(27, 365)
(25, 365)


In [66]:
feature_names = ['min', 'max', 'std', 'lq', 'uq', 'avg', 'iq']
new_columns = [f'sub{subcarrier+1}-{suffix}' for subcarrier in range(52) for suffix in feature_names]
new_columns.append("label")
print(new_columns)
print(len(new_columns))

['sub1-min', 'sub1-max', 'sub1-std', 'sub1-lq', 'sub1-uq', 'sub1-avg', 'sub1-iq', 'sub2-min', 'sub2-max', 'sub2-std', 'sub2-lq', 'sub2-uq', 'sub2-avg', 'sub2-iq', 'sub3-min', 'sub3-max', 'sub3-std', 'sub3-lq', 'sub3-uq', 'sub3-avg', 'sub3-iq', 'sub4-min', 'sub4-max', 'sub4-std', 'sub4-lq', 'sub4-uq', 'sub4-avg', 'sub4-iq', 'sub5-min', 'sub5-max', 'sub5-std', 'sub5-lq', 'sub5-uq', 'sub5-avg', 'sub5-iq', 'sub6-min', 'sub6-max', 'sub6-std', 'sub6-lq', 'sub6-uq', 'sub6-avg', 'sub6-iq', 'sub7-min', 'sub7-max', 'sub7-std', 'sub7-lq', 'sub7-uq', 'sub7-avg', 'sub7-iq', 'sub8-min', 'sub8-max', 'sub8-std', 'sub8-lq', 'sub8-uq', 'sub8-avg', 'sub8-iq', 'sub9-min', 'sub9-max', 'sub9-std', 'sub9-lq', 'sub9-uq', 'sub9-avg', 'sub9-iq', 'sub10-min', 'sub10-max', 'sub10-std', 'sub10-lq', 'sub10-uq', 'sub10-avg', 'sub10-iq', 'sub11-min', 'sub11-max', 'sub11-std', 'sub11-lq', 'sub11-uq', 'sub11-avg', 'sub11-iq', 'sub12-min', 'sub12-max', 'sub12-std', 'sub12-lq', 'sub12-uq', 'sub12-avg', 'sub12-iq', 'sub13

In [67]:
concatenated_df_0.columns = new_columns
concatenated_df_1.columns = new_columns
concatenated_df_2.columns = new_columns
concatenated_df_3.columns = new_columns
concatenated_df_4.columns = new_columns
concatenated_df_5.columns = new_columns

print(concatenated_df_4.head(10))

   sub1-min   sub1-max  sub1-std    sub1-lq    sub1-uq   sub1-avg   sub1-iq  \
0  2.913390  32.625252  5.093618   9.307293  17.132122  13.304955  7.824829   
1  5.978798  24.867867  3.413257  14.380052  18.812234  16.419621  4.432182   
2  7.524392  32.421394  3.346505  15.987507  19.809855  17.779024  3.822348   
3  2.753320  32.745533  4.934467  12.414617  18.495861  15.284134  6.081244   
4  1.130259  25.958572  4.973228   9.027493  16.112347  12.516314  7.084854   
5  1.130259  24.539482  4.236032   9.182666  14.582811  12.027728  5.400145   
6  3.057897  25.825258  4.274248   9.440348  14.927962  12.381883  5.487613   
7  4.261752  25.825258  3.395047  12.320968  16.377016  14.474627  4.056048   
8  5.225776  25.219793  4.203624  10.842809  16.714370  14.070573  5.871561   
9  2.147730  23.553516  3.885442   8.428587  13.054323  11.105939  4.625736   

   sub2-min   sub2-max  sub2-std  ...  sub51-avg  sub51-iq  sub52-min  \
0  3.307907  31.686935  4.802752  ...  11.675468  4.08922

In [69]:
concatenated_df_0.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-0.csv", index=None)
concatenated_df_1.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-1.csv", index=None)
concatenated_df_2.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-2.csv", index=None)
concatenated_df_3.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-3.csv", index=None)
concatenated_df_4.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-4.csv", index=None)
concatenated_df_5.to_csv("/content/drive/MyDrive/01.School_related/DoAnChuyenNganh/dataset/features/features-5.csv", index=None)