# What does the data mean?

According to the paper, the data was collected from 24 people, and each recording is one hour long. Each person has two columns of data, representing non-seizure signals and a seizure event.

We have 48 CSV files. Each file has two columns of data, and each column is mixed: it contains both seizure data and non-seizure data.

# Import and check data

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Mount Google Drive in the Colab runtime, then make the project folder the
# working directory so later cells can use relative paths.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/0-Project/MIT')

Mounted at /content/drive


In [3]:
# Load the last two columns of every csv file in the working directory into
# one wide data frame (48 files in the original run).
# os.listdir() returns names in arbitrary order, so sort them to keep the
# column order reproducible between runs.
csv_files = sorted(f for f in os.listdir() if f.endswith('.csv'))

frames = []
for file in csv_files:
  # skiprows=1 skips the first line of the file before parsing
  df = pd.read_csv(file, skiprows=1)

  # keep only the two right-most columns (copy so relabelling does not touch df)
  df_last_two_cols = df.iloc[:, -2:].copy()

  # label the pair with the first three characters of the file name
  file_prefix = file[:3]
  df_last_two_cols.columns = [f"{file_prefix}-1", f"{file_prefix}-2"]

  frames.append(df_last_two_cols)

# Concatenate once at the end: calling pd.concat inside the loop copies the
# growing frame on every iteration. With no csv files we keep an empty frame.
df_mix = pd.concat(frames, axis=1) if frames else pd.DataFrame()


In [None]:
# inspect the column dtypes and non-null counts of the combined frame
df_mix.info()
# no null values: each listed column reports 3600 non-null entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 96 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   219-1   3600 non-null   float64
 1   219-2   3600 non-null   float64
 2   214-1   3600 non-null   float64
 3   214-2   3600 non-null   float64
 4   210-1   3600 non-null   float64
 5   210-2   3600 non-null   float64
 6   124-1   3600 non-null   float64
 7   124-2   3600 non-null   float64
 8   233-1   3600 non-null   float64
 9   233-2   3600 non-null   float64
 10  203-1   3600 non-null   float64
 11  203-2   3600 non-null   float64
 12  207-1   3600 non-null   float64
 13  207-2   3600 non-null   float64
 14  222-1   3600 non-null   float64
 15  222-2   3600 non-null   float64
 16  217-1   3600 non-null   float64
 17  217-2   3600 non-null   float64
 18  117-1   3600 non-null   float64
 19  117-2   3600 non-null   float64
 20  208-1   3600 non-null   float64
 21  208-2   3600 non-null   float64
 22  

In [None]:
# largest sample value across all columns
df_mix.values.max()

2.51

In [None]:
# smallest sample value across all columns
df_mix.values.min()

-3.28

Values are 3-digit floats, ranging from -3.28 to 2.51. To convert them to a 12-bit binary representation, we can multiply them by a scale factor and round, so that we have integers.

In [4]:
def count_decimal_places(value):
  '''
  Params:
    value : a number (int, float or numpy scalar)
  Return how many digits follow the decimal point in str(value).

  The textual form is parsed as a Decimal instead of being split on '.', so
  values that print in scientific notation are counted correctly
  (1e-05 -> 5, 1.5e-05 -> 6). Values without a fractional part, NaN,
  infinities and non-numeric text all return 0.
  '''
  from decimal import Decimal, InvalidOperation

  try:
    exponent = Decimal(str(value)).as_tuple().exponent
  except InvalidOperation:
    # not a number at all
    return 0
  # NaN and infinity report a letter instead of an integer exponent
  if not isinstance(exponent, int):
    return 0
  # a negative exponent is the number of digits after the decimal point
  return max(0, -exponent)

# For every column, record the largest number of decimal places seen in it.
max_decimal_places = {
  col: df_mix[col].apply(count_decimal_places).max()
  for col in df_mix.columns
}

In [None]:
# largest decimal-place count found in any column
print(max(max_decimal_places.values()))

3


# Preprocess

In the MIT data, we have at most 3 decimal digits. We can define our own method to represent the data in 12 bits.

**Q: In physioNet, what is the usual decimal digit for seizure data?**

We can use a scale factor of 600 to convert the original data into integers.

In [None]:
# largest factor that maps the minimum value (-3.28) onto -2048, the lower signed 12-bit bound
-2048/-3.28

624.390243902439

In [None]:
# largest factor that maps the maximum value (2.51) onto 2047, the upper signed 12-bit bound
2047/2.51

815.5378486055778

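We choose 600 as our scaling factor: it is a round number below the tighter of the two bounds (624.39), so every scaled value fits in the signed 12-bit range [-2048, 2047].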

In [5]:
# Scale every sample by the chosen factor of 600 and round to the nearest integer.
df_mix_scaled = df_mix.mul(600).round().astype(int)
# Smallest scaled sample across all columns.
scaled_min = df_mix_scaled.min().min()
# Shift the data so that its minimum becomes 0. Only a negative minimum needs
# an offset: adding abs(scaled_min) unconditionally would also push data that
# is already non-negative further up and waste part of the 12-bit range.
df_mix_scaled = df_mix_scaled.add(max(0, -scaled_min))

# Apply L2SB algorithm

In [6]:
def compare(str1, str2):
  '''
  Classify how two 12-bit binary strings differ, nibble by nibble.

  Each string is read as three 4-bit nibbles (bits 0-3, 4-7, 8-11) and the
  code names the first nibble, from the left, that differs:
    '11' : nibble 1 differs
    '10' : nibble 1 matches, nibble 2 differs
    '01' : nibbles 1 and 2 match, nibble 3 differs
    '00' : all three nibbles match

  Raises ValueError if either string is not exactly 12 characters long.
  '''
  # Both inputs must be exactly 12 characters long
  if not (len(str1) == 12 and len(str2) == 12):
    raise ValueError("Both strings must be 12-bit binary representations.")

  # Walk the nibbles from the most significant one and stop at the first mismatch
  if str1[:4] != str2[:4]:
    return '11'
  if str1[4:8] != str2[4:8]:
    return '10'
  if str1[8:] != str2[8:]:
    return '01'
  return '00'

In [7]:
def l2sb(input):
  '''
  Params:
    input :  a list of 12-bit binary string
  Use L2SB algorithm and return a list of compressed result.

  The first sample is kept unchanged. Every later sample is compared with the
  sample before it (see compare) and stored as the 2-bit code followed by the
  bits from the first differing nibble onwards:
    '11' + all 12 bits
    '10' + last 8 bits
    '01' + last 4 bits
    '00' alone
  An empty input returns an empty list.
  '''
  if len(input) == 0:
    return []

  # index of the first bit that has to be stored for each code
  first_kept_bit = {'11': 0, '10': 4, '01': 8, '00': 12}

  res = [input[0]]
  for previous, current in zip(input, input[1:]):
    # compare once per sample instead of once per branch
    code = compare(current, previous)
    res.append(code + current[first_kept_bit[code]:])
  return res


In [8]:
def total_length(strings):
  '''
  Params:
    strings : a list of string
  Add up the lengths of all the strings and return the sum (0 for an empty list).
  '''
  return sum(map(len, strings))

In [9]:
def calRatio_avg(input):
  '''
  Params:
    input : a data frame of integers, one signal per column

  Compress every column with l2sb after writing each value as a 12-bit binary
  string. The ratio of a column is its original bit count divided by its
  compressed bit count.

  Returns a tuple: (mean of the per-column ratios, list of per-column ratios).
  '''
  ratios = []
  for _, samples in input.items():
    as_bits = [np.binary_repr(sample, width=12) for sample in samples]
    packed = l2sb(as_bits)
    ratios.append(total_length(as_bits) / total_length(packed))
  mean_ratio = sum(ratios) / len(ratios)
  return mean_ratio, ratios

def calRatio_all(input):
  '''
  Params:
    input : a data frame of integers, one signal per column

  Compress every column with l2sb after writing each value as a 12-bit binary
  string. The original and compressed bit counts are summed over all columns,
  and the overall ratio (total original / total compressed) is returned.
  '''
  bits_before = 0
  bits_after = 0
  for _, samples in input.items():
    as_bits = [np.binary_repr(sample, width=12) for sample in samples]
    bits_before += total_length(as_bits)
    bits_after += total_length(l2sb(as_bits))
  return bits_before / bits_after


In [None]:
# overall compression ratio over all columns of the 600-scaled data
calRatio_all(df_mix_scaled)

1.5784161337025642

In [10]:
# choose the columns at positions 1,3,5.. of df_mix_scaled as mixed_signal_1
# NOTE(review): the loading cell interleaves "<prefix>-1", "<prefix>-2", so these
# odd positions are the "-2" labelled columns — confirm the 1/2 numbering is intended
mixed_signal_1 = df_mix_scaled.iloc[:,1::2]
# choose the columns at positions 0,2,4.. of df_mix_scaled as mixed_signal_2
mixed_signal_2 = df_mix_scaled.iloc[:,::2]

In [None]:
# overall compression ratio of the odd-position columns
calRatio_all(mixed_signal_1)

1.5839744804111489

In [None]:
# overall compression ratio of the even-position columns
calRatio_all(mixed_signal_2)

1.5728966603253203

# Scale the data

Let's assume the data set has a range of [min, max]. Then, for every data point in it, we use the following equation to map it into our desired range:

4095 * (data[i] - min) / (max - min)

We use 4095 because 2^12 = 4096, so a 12-bit binary number is in the range [0, 4095].

In [None]:
def transfer(data):
  '''
  Params:
    data : input data frame (left unmodified)
  Rescale the whole frame onto the integer range [0, 4095]:

    new_data = 4095 * (data[i] - min) / (max - min)

  where min and max are taken over every value in the frame. The minimum is
  always subtracted, so data whose minimum is positive also lands in
  [0, 4095] instead of overflowing 12 bits. A frame whose values are all
  equal maps to all zeros instead of dividing by zero.

  Returns a new data frame of integers with the same shape as data.
  '''
  lowest = data.min().min()
  highest = data.max().max()
  span = highest - lowest

  # shift so that the smallest value becomes 0
  shifted = data - lowest

  if span == 0:
    # constant input: 0 / 0 would give NaN, which cannot be cast to int
    return shifted.round().astype(int)

  return (4095 * shifted / span).round().astype(int)

In [None]:
# scale the data onto the full 12-bit range with transfer()
# NOTE: this rebinds df_mix_scaled, mixed_signal_1 and mixed_signal_2, so any
# cell run after this one sees the transfer()-scaled data, not the 600-scaled data
df_mix_scaled = transfer(df_mix)
# choose the columns at positions 1,3,5.. of df_mix_scaled as mixed_signal_1
mixed_signal_1 = df_mix_scaled.iloc[:,1::2]
# choose the columns at positions 0,2,4.. of df_mix_scaled as mixed_signal_2
mixed_signal_2 = df_mix_scaled.iloc[:,::2]

In [None]:
# overall compression ratio of the odd-position columns after transfer() scaling
calRatio_all(mixed_signal_1)

1.5489098769895455

In [None]:
# overall compression ratio of the even-position columns after transfer() scaling
calRatio_all(mixed_signal_2)

1.5301198062556634

# Explore 3 nibbles combination

Previously we split each 12-bit sample into 4-4-4 nibbles. Now we explore all possible three-part splits.

We use 600 as our scaling factor.

In [11]:
def compare_new(str1, str2, split):
  '''
  Params:
    str1  : input string 1
    str2  : input string 2
    split : a list of 3 integers sum up to 12

  Classify how two 12-bit binary strings differ when each is cut into three
  consecutive parts whose widths are given by split. The code names the
  first part, from the left, that differs:
    '11' : part 1 differs
    '10' : part 1 matches, part 2 differs
    '01' : parts 1 and 2 match, part 3 differs
    '00' : all three parts match

  Raises ValueError if either string is not exactly 12 characters long.
  '''
  # Both inputs must be exactly 12 characters long
  if not (len(str1) == 12 and len(str2) == 12):
    raise ValueError("Both strings must be 12-bit binary representations.")

  # Positions where part 2 and part 3 begin
  first_cut = split[0]
  second_cut = split[1] + split[0]

  # Walk the parts from the most significant one and stop at the first mismatch
  if str1[:first_cut] != str2[:first_cut]:
    return '11'
  if str1[first_cut:second_cut] != str2[first_cut:second_cut]:
    return '10'
  if str1[second_cut:] != str2[second_cut:]:
    return '01'
  return '00'

In [12]:
def l2sb_new(input, split):
  '''
  Params:
    input : a list of 12-bit binary string
    split : a list of 3 integers sum up to 12
  Use L2SB algorithm and return a list of compressed result.

  The first sample is kept unchanged. Every later sample is compared with the
  sample before it (see compare_new) and stored as the 2-bit code followed by
  the bits from the first differing part onwards:
    '11' + all 12 bits
    '10' + the bits after part 1
    '01' + the bits after parts 1 and 2
    '00' alone
  An empty input returns an empty list.
  '''
  # Nothing to compare against: empty input stays empty, a single sample is kept as is
  if len(input) < 2:
    return list(input)

  # The cut positions do not change between samples, so compute them once
  first_kept_bit = {'11': 0, '10': split[0], '01': split[1] + split[0]}

  res = [input[0]]
  for previous, current in zip(input, input[1:]):
    code = compare_new(current, previous, split)
    if code == '00':
      res.append('00')
    else:
      res.append(code + current[first_kept_bit[code]:])
  return res


In [13]:
def calRatio_all_new(input, split):
  '''
  Params:
    input : a data frame of integers, one signal per column
    split : a list of 3 integers sum up to 12

  Compress every column with l2sb_new, using the given split, after writing
  each value as a 12-bit binary string. The original and compressed bit counts
  are summed over all columns, and the overall ratio (total original / total
  compressed) is returned.
  '''
  bits_before = 0
  bits_after = 0
  for _, samples in input.items():
    as_bits = [np.binary_repr(sample, width=12) for sample in samples]
    bits_before += total_length(as_bits)
    bits_after += total_length(l2sb_new(as_bits, split))
  return bits_before / bits_after


In [14]:
# Enumerate every ordered way of cutting 12 bits into three non-empty parts
combinations = []
for x in range(1, 11):
  # y stops at 11 - x, so the remainder 12 - x - y is always at least 1
  combinations.extend([x, y, 12 - x - y] for y in range(1, 12 - x))

In [15]:
# Compression ratio of each signal group for every candidate split,
# keyed by the split as a tuple
mixed_signal_1_dict = {
  tuple(split): calRatio_all_new(mixed_signal_1, split)
  for split in combinations
}
mixed_signal_2_dict = {
  tuple(split): calRatio_all_new(mixed_signal_2, split)
  for split in combinations
}

In [None]:
# Rank the splits of signal group 1 by ratio, best first, and keep the top five
ranked_1 = sorted(mixed_signal_1_dict.items(), key=lambda pair: pair[1], reverse=True)
signal1_top5 = dict(ranked_1[:5])
print(signal1_top5)

{(5, 3, 4): 1.6359518082174704, (5, 2, 5): 1.6284961698611977, (6, 2, 4): 1.6228196491403815, (4, 3, 5): 1.616274991231147, (4, 4, 4): 1.5839744804111489}


In [None]:
# Rank the splits of signal group 2 by ratio, best first, and keep the top five
ranked_2 = sorted(mixed_signal_2_dict.items(), key=lambda pair: pair[1], reverse=True)
signal2_top5 = dict(ranked_2[:5])
print(signal2_top5)

{(5, 3, 4): 1.6110573729416016, (5, 2, 5): 1.6032411201657673, (4, 3, 5): 1.6028965924533378, (6, 2, 4): 1.5929374855002436, (3, 4, 5): 1.5747732504228178}


For the mixed signals, the best combination is (5,3,4), while (4,3,5) also performs well. These results were obtained using 600 as the scaling factor.

In [16]:
import openpyxl
from openpyxl import Workbook

# Write both ratio tables to one workbook, one sheet per signal group
wb = Workbook()
ws_1 = wb.active
ws_1.title = 'Ratio List 1'
ws_2 = wb.create_sheet(title='Ratio List 2')

for sheet, ratio_table in ((ws_1, mixed_signal_1_dict), (ws_2, mixed_signal_2_dict)):
  # header row, then one row per split; the tuple key is stored as text
  sheet.append(['Key', 'Value'])
  for key, value in ratio_table.items():
    sheet.append([str(key), value])

wb.save('3 nibbles MIT.xlsx')