# Import Bonn data

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

os.chdir('/content/drive/My Drive/0-Project')
# Bonn subsets used in this notebook:
#   O: healthy people with eyes closed; F: patients seizure free; S: seizure
folders = ['F', 'O', 'S']
dataframes = {}

for folder_path in folders:
  # One column per file in the folder, named after file_name[1:4].
  data_dict = {}
  for file_name in os.listdir(folder_path):
    with open(os.path.join(folder_path, file_name), 'r') as file:
      # Each line of the file becomes one (still textual) cell of the column.
      data_dict[file_name[1:4]] = [line.strip() for line in file]

  # Build the frame, order its columns by name, and store it under the
  # folder name.
  df = pd.DataFrame(data_dict)
  dataframes[folder_path] = df[sorted(df.columns)]

# The values were read as text; convert every column to numbers.
df_F = dataframes['F'].apply(pd.to_numeric)
df_O = dataframes['O'].apply(pd.to_numeric)
df_S = dataframes['S'].apply(pd.to_numeric)

# Representation 1

We split each 12-bit sample into 4 nibbles, nibble 1 to nibble 4, and compare it nibble by nibble with the previous sample.

Representation:

1. All nibbles are the same, use 000

2. Nibbles 1, 2, 3 are the same, nibble 4 is different, use 001

3. Nibbles 1, 2 are the same, nibble 3 is different (nibbles 3, 4 are sent), use 010

4. Nibble 1 is the same, nibble 2 is different (nibbles 2, 3, 4 are sent), use 011

5. Nibble 1 is different (all four nibbles are sent), use 100

In [2]:
def compare(str1, str2, split):
  '''
  Params:
    str1  : input string 1
    str2  : input string 2
    split : a list of 4 integers that sum up to 12

  Cut str1 and str2 (both 12-bit binary strings) into nibbles 1, 2, 3, 4 whose
  widths are given by split, and report how many leading nibbles agree:
    '000' : all four nibbles are the same
    '001' : nibbles 1, 2, 3 are the same, nibble 4 differs
    '010' : nibbles 1, 2 are the same, nibble 3 differs
    '011' : nibble 1 is the same, nibble 2 differs
    '100' : nibble 1 differs

  Raises ValueError when either string is not exactly 12 characters long.
  '''
  if not (len(str1) == 12 and len(str2) == 12):
    raise ValueError("Both strings must be 12-bit binary representations.")

  # Slice boundaries of the four nibbles; the last one runs to the end.
  first_end = split[0]
  second_end = first_end + split[1]
  third_end = second_end + split[2]
  bounds = [(None, first_end), (first_end, second_end),
            (second_end, third_end), (third_end, None)]

  # Count how many nibbles match before the first mismatch.
  matching = 0
  for start, stop in bounds:
    if str1[start:stop] != str2[start:stop]:
      break
    matching += 1

  # Index = number of leading nibbles that matched.
  return ('100', '011', '010', '001', '000')[matching]

In [3]:
def l2sb(input, split):
  '''
  Params:
    input : a list of 12-bit binary string
    split :  a list of 4 integers that sum up to 12
  Use L2SB algorithm and return a list of compressed result.

  The first sample is stored unchanged. Every later sample is compared with
  the one before it (see compare) and stored as the 3-bit code returned by
  compare followed by the nibbles from the first differing one onwards;
  a sample identical to its predecessor is stored as '000' alone.

  An empty input returns an empty list.
  '''
  # Nothing to compare against: empty input stays empty, a single sample is
  # stored as-is.
  if len(input) < 2:
    return list(input)

  # For each code, the position in the sample from which bits must be kept.
  keep_from = {
    '100': 0,
    '011': split[0],
    '010': split[0] + split[1],
    '001': split[0] + split[1] + split[2],
  }

  res = [input[0]]
  for i in range(1, len(input)):
    code = compare(input[i], input[i-1], split)
    if code in keep_from:
      res.append(code + input[i][keep_from[code]:])
    else:
      # All nibbles equal to the previous sample: the code alone is enough.
      res.append('000')
  return res


In [4]:
def total_length(strings):
  '''
  Params:
    strings : a list of string
  Add up the lengths of all strings in the list and return the sum
  (0 for an empty list).
  '''
  total = 0
  for s in strings:
    total += len(s)
  return total

In [5]:
def calRatio_all(input, split):
  '''
  Params:
    input : a data frame, each column represents a txt file
    split : a list of 4 integers (nibble widths), passed through to l2sb

  Every value of every column is written as a binary string with
  np.binary_repr(val, width=12) and the column is compressed with l2sb.
  The lengths of the original and compressed strings are accumulated over
  all columns, and the overall compression ratio
  (original length / compressed length) is returned.
  '''
  raw_bits = 0
  packed_bits = 0
  for col_idx in range(input.shape[1]):
    samples = input.iloc[:, col_idx]
    as_binary = []
    for val in samples:
      as_binary.append(np.binary_repr(val, width=12))
    encoded = l2sb(as_binary, split)
    raw_bits = raw_bits + total_length(as_binary)
    packed_bits = packed_bits + total_length(encoded)
  ratio = raw_bits / packed_bits
  return ratio

# Test case

In [None]:
# Worked example for l2sb with four 3-bit nibbles. Each sample is annotated
# with how it relates to the previous sample and the code expected for it.
# The list is deliberately not called `input`, which would shadow the
# built-in for the rest of the notebook.
split = [3, 3, 3, 3]
test_samples = [
  '000011001001',  # first sample, stored as-is (no code)
  '000111101101',  # nibble 1 same, nibble 2 differs      -> 011
  '000111001001',  # nibbles 1, 2 same, nibble 3 differs  -> 010
  '000111001101',  # nibbles 1, 2, 3 same                 -> 001
  '000111001101',  # all nibbles same                     -> 000
  '010101101111',  # nibble 1 differs                     -> 100
]
expected = ['000011001001', '011111101101', '010001001', '001101', '000',
            '100010101101111']
res = l2sb(test_samples, split)
# Fail loudly instead of relying on a visual comparison of the printout.
assert res == expected, f"unexpected l2sb output: {res}"
print(res)

['000011001001', '011111101101', '010001001', '001101', '000', '100010101101111']


# Get compression ratio

## Transfer data to given range

Let's assume the data set has a range of [min, max]. After adding an offset so that the data is non-negative, we map every data point into our desired range with the following equation:

4095 * data[i] / (max - min)

We use 4095 because $2^{12} = 4096$, so a 12-bit binary number lies in the range [0, 4095].

In [None]:
def transfer(data):
  '''
  Params:
    data : input data frame
  Rescale the whole frame into the 12-bit range [0, 4095]:
  new_data = 4095 * (data[i] - min) / (max - min)
  where min and max are taken over the entire frame. The result is rounded
  and returned as integers; the input frame is not modified.

  The minimum is always subtracted, so a frame whose minimum is positive is
  also mapped onto [0, 4095] instead of overshooting it. A frame in which
  every value is the same is mapped to all zeros.
  '''
  lowest = data.min().min()
  highest = data.max().max()
  span = highest - lowest

  shifted = data - lowest
  if span == 0:
    # Constant data: there is no range to stretch, and dividing by the zero
    # span would produce NaN/inf that cannot be cast to int.
    return shifted.round().astype(int)

  new_data = 4095 * shifted / span
  return new_data.round().astype(int)

Using the `transfer` function does not give good results, so we only add an offset to the Bonn data.

In [None]:
# df_F_new = transfer(df_F)
# df_O_new = transfer(df_O)
# df_S_new = transfer(df_S)

In [9]:
# Shift each Bonn subset by its own global minimum so its smallest sample is 0.
df_F_new = df_F - df_F.min().min()
df_O_new = df_O - df_O.min().min()
df_S_new = df_S - df_S.min().min()

# Every ordered way of cutting 12 bits into four non-empty nibbles.
# The range bounds already guarantee that the last width is at least 1.
split_list = [
  [a, b, c, 12 - a - b - c]
  for a in range(1, 10)
  for b in range(1, 11 - a)
  for c in range(1, 12 - a - b)
]

# Compression ratio of every split for each subset, keyed by the split tuple.
# ratio_list_o and the combined ratio_list are computed as well, because
# later cells read them (the O export and the per-set top-5 ranking) and
# would otherwise fail with NameError on a fresh kernel.
ratio_list = {}
ratio_list_f = {}
ratio_list_o = {}
ratio_list_s = {}
for split in split_list:
  ratio_f = calRatio_all(df_F_new, split)
  ratio_o = calRatio_all(df_O_new, split)
  ratio_s = calRatio_all(df_S_new, split)
  ratio_list_f[tuple(split)] = ratio_f
  ratio_list_o[tuple(split)] = ratio_o
  ratio_list_s[tuple(split)] = ratio_s
  ratio_list[tuple(split)] = (ratio_f, ratio_o, ratio_s)


In [None]:
import openpyxl
from openpyxl import Workbook

# Export the O-set ratios as a two-column sheet: split (as text) and ratio.
wb = Workbook()
ws = wb.active

# Header row first, then one row per split.
ws.append(['Key', 'Value'])
rows = [[str(key), value] for key, value in ratio_list_o.items()]
for row in rows:
  ws.append(row)

# Written relative to the current working directory.
wb.save('O-output-4.xlsx')

In [None]:
import openpyxl
from openpyxl import Workbook

# Export the F and S ratios to one workbook, one sheet per subset.
wb = Workbook()
ws_f = wb.active
# Name the first sheet too, so it is not left with the default title next to
# 'Ratio List S'.
ws_f.title = 'Ratio List F'
ws_s = wb.create_sheet(title='Ratio List S')

# Same layout on both sheets: a header row, then split (as text) and ratio.
for sheet, ratios in ((ws_f, ratio_list_f), (ws_s, ratio_list_s)):
  sheet.append(['Key', 'Value'])
  for key, value in ratios.items():
    sheet.append([str(key), value])

# Written relative to the current working directory.
wb.save('4 nibbles.xlsx')


In [None]:
# Split the combined results (split -> (F, O, S) ratios) into one
# dictionary per subset.
f_ratio = {}
o_ratio = {}
s_ratio = {}

for key, ratios in ratio_list.items():
  ratio_f, ratio_o, ratio_s = ratios
  f_ratio[key], o_ratio[key], s_ratio[key] = ratio_f, ratio_o, ratio_s

In [None]:
# Five splits with the highest compression ratio on the F set, best first.
top_5_f_ratio = sorted(
  f_ratio.items(),
  key=lambda pair: pair[1],
  reverse=True,
)[:5]
print(top_5_f_ratio)

[((4, 3, 2, 3), 1.4915814118447293), ((4, 2, 2, 4), 1.487426580171397), ((5, 2, 2, 3), 1.482615813905238), ((4, 3, 1, 4), 1.4767218960575694), ((3, 3, 2, 4), 1.4695517275392989)]


In [None]:
# Five splits with the highest compression ratio on the O set, best first.
top_5_o_ratio = sorted(
  o_ratio.items(),
  key=lambda pair: pair[1],
  reverse=True,
)[:5]
print(top_5_o_ratio)

[((4, 2, 2, 4), 1.3034072471606273), ((4, 2, 1, 5), 1.296400355769132), ((4, 1, 2, 5), 1.2866755124953873), ((4, 2, 3, 3), 1.2846199914086922), ((3, 2, 2, 5), 1.2772222209232817)]


In [None]:
# Five splits with the highest compression ratio on the S set, best first.
top_5_s_ratio = sorted(
  s_ratio.items(),
  key=lambda pair: pair[1],
  reverse=True,
)[:5]
print(top_5_s_ratio)

[((3, 2, 2, 5), 1.1260046987861851), ((3, 2, 3, 4), 1.1184994216854751), ((3, 2, 1, 6), 1.1147748251900473), ((2, 2, 2, 6), 1.1137152065936724), ((3, 1, 2, 6), 1.1136299388957749)]


# Import MIT data

In [6]:
os.chdir('/content/drive/My Drive/0-Project/MIT')
# 48 csv files
# Sorted so the column order of df_mix does not depend on the arbitrary
# order in which os.listdir() returns the files.
csv_files = sorted(f for f in os.listdir() if f.endswith('.csv'))

# Collect the two signal columns of every file, then concatenate once;
# concatenating inside the loop re-copies the growing frame on every pass.
signal_frames = []
for file in csv_files:
  file_path = os.path.join(os.getcwd(), file)

  # The first line of each file is skipped when reading.
  df = pd.read_csv(file_path, skiprows=1)

  # Keep only the last two columns, on a copy so renaming does not touch df.
  df_last_two_cols = df.iloc[:, -2:].copy()

  # Name them after the first three characters of the file name.
  file_prefix = file[:3]
  df_last_two_cols.columns = [f"{file_prefix}-1", f"{file_prefix}-2"]

  signal_frames.append(df_last_two_cols)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# csv file was found.
df_mix = pd.concat(signal_frames, axis=1) if signal_frames else pd.DataFrame()


In [7]:
# Scale the data by a fixed factor of 600 and round to integers.
# (Alternative that was tried instead of the fixed factor:
#  df_mix_scaled = transfer(df_mix))
df_mix_scaled = df_mix.mul(600).round().astype(int)

# Shift all data so the smallest value becomes 0. Subtracting the minimum
# (rather than adding its absolute value) is also correct when the minimum
# is positive, where adding would push the data further away from 0.
scaled_min = df_mix_scaled.min().min()
df_mix_scaled = df_mix_scaled.sub(scaled_min)

# choose 1,3,5.. columns in df_mix_scaled as mixed_signal_1
mixed_signal_1 = df_mix_scaled.iloc[:, 1::2]
# choose 0,2,4.. columns in df_mix_scaled as mixed_signal_2
mixed_signal_2 = df_mix_scaled.iloc[:, ::2]

In [10]:
# Compression ratio of every split for the two MIT signal groups, keyed by
# the split tuple.
mit_ratio_list_1 = {}
mit_ratio_list_2 = {}
for split in split_list:
  key = tuple(split)
  mit_ratio_list_1[key] = calRatio_all(mixed_signal_1, split)
  mit_ratio_list_2[key] = calRatio_all(mixed_signal_2, split)

In [None]:
# Five best splits (highest ratio first) for each MIT signal group.
top_5_mix_ratio_1 = sorted(
  mit_ratio_list_1.items(),
  key=lambda pair: pair[1],
  reverse=True,
)[:5]
top_5_mix_ratio_2 = sorted(
  mit_ratio_list_2.items(),
  key=lambda pair: pair[1],
  reverse=True,
)[:5]
print(top_5_mix_ratio_1)
print(top_5_mix_ratio_2)

[((4, 2, 2, 4), 1.5070775080892036), ((5, 2, 1, 4), 1.5066033780872619), ((5, 2, 2, 3), 1.5049970605526162), ((3, 3, 2, 4), 1.4971235820067925), ((4, 3, 1, 4), 1.4961373594936107)]
[((4, 2, 2, 4), 1.4925867328502964), ((3, 3, 2, 4), 1.4892176077574908), ((5, 2, 1, 4), 1.4833053043758795), ((4, 3, 1, 4), 1.483010390942308), ((5, 2, 2, 3), 1.4823923020831844)]


In [11]:
import openpyxl
from openpyxl import Workbook

# Export both MIT ratio tables to one workbook, one sheet per signal group.
wb = Workbook()
ws_1 = wb.active
ws_1.title = 'Ratio List 1'
ws_2 = wb.create_sheet(title='Ratio List 2')

# Header rows.
ws_1.append(['Key', 'Value'])
ws_2.append(['Key', 'Value'])

# One row per split: the split as text and its compression ratio.
rows_1 = [[str(key), value] for key, value in mit_ratio_list_1.items()]
for row in rows_1:
  ws_1.append(row)

rows_2 = [[str(key), value] for key, value in mit_ratio_list_2.items()]
for row in rows_2:
  ws_2.append(row)

# Written relative to the current working directory.
wb.save('4 nibbles MIT.xlsx')