In [1]:
import pandas as pd
from TemporalAbstraction import CategoricalAbstraction
from sklearn.cluster import KMeans
import numpy as np

In [2]:
# Location of the input CSV; read it into a DataFrame.
file_path = '/Users/nag/study/vu_ms_ai/ml4qs/ML4QS_Group125/data_w_features_ver2/data_w_fourier_and_autocorr_0.5.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Columns that will be discretized into categories further down.
columns_of_interest = [
    'amplitude_mean_kalman',
    'pitch_mean_kalman',
    'sound_intensity_mean_kalman',
]

In [4]:
# Function to categorize a feature using k-means clustering
def categorize_feature(data, feature, n_clusters=3, category_names=None):
    """Discretize one numeric column into ordered categories via 1-D k-means.

    The column ``feature`` is clustered with ``KMeans(random_state=0)``; the
    clusters are then ranked by centroid value and each row receives the name
    belonging to the rank of its cluster. The result is written to a new
    column called ``f'{feature}_categorical'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``feature``. It is modified in place (the new column
        is added to it) and also returned.
    feature : str
        Name of the column to cluster.
    n_clusters : int, default 3
        Number of k-means clusters, i.e. number of categories.
    category_names : sequence of str, optional
        Names ordered from the lowest to the highest centroid. When omitted,
        the first ``n_clusters`` entries of ``['low', 'medium', 'high']`` are
        used for up to three clusters, and ``'level_0'`` ... ``'level_{k-1}'``
        for more than three.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the categorical column added.

    Raises
    ------
    ValueError
        If ``category_names`` is given but its length differs from
        ``n_clusters``.
    """
    default_names = ['low', 'medium', 'high']
    if category_names is None:
        if n_clusters <= len(default_names):
            category_names = default_names[:n_clusters]
        else:
            # More clusters than predefined names: fall back to rank-based
            # labels instead of failing with an IndexError.
            category_names = [f'level_{rank}' for rank in range(n_clusters)]
    else:
        category_names = list(category_names)
        if len(category_names) != n_clusters:
            raise ValueError(
                f'category_names has {len(category_names)} entries '
                f'but n_clusters is {n_clusters}'
            )

    # Names are validated before fitting so a bad call never leaves a
    # half-written column behind in `data`.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(data[[feature]])

    # k-means label ids are arbitrary; rank the clusters by centroid so that
    # the first name goes to the cluster with the smallest centre.
    centroids = kmeans.cluster_centers_.flatten()
    sorted_centroids = np.argsort(centroids)
    mapping = {
        int(label): name
        for label, name in zip(sorted_centroids, category_names)
    }

    data[f'{feature}_categorical'] = pd.Series(labels, index=data.index).map(mapping)
    return data


In [5]:
# Add a `<feature>_categorical` column for every column of interest,
# using three clusters per feature.
for feature in columns_of_interest:
    data = categorize_feature(data, feature, n_clusters=3)

In [6]:
# Show how many rows fall into each category, one table per feature
for feature in columns_of_interest:
    category_column = f'{feature}_categorical'
    print(data.groupby([category_column]).size())

amplitude_mean_kalman_categorical
high      3620
low       3047
medium    7472
dtype: int64
pitch_mean_kalman_categorical
high       490
low       8439
medium    5210
dtype: int64
sound_intensity_mean_kalman_categorical
high      4354
low       2390
medium    7395
dtype: int64


In [7]:
# List every column currently in the table as a sanity check
print(list(data.columns))

['time_0.5', 'language', 'tone', 'participant', 'script', 'sound_intensity_max', 'sound_intensity_min', 'sound_intensity_mean', 'sound_intensity_median', 'sound_intensity_std', 'sound_intensity_missing', 'pitch_max', 'pitch_min', 'pitch_mean', 'pitch_median', 'pitch_std', 'pitch_missing', 'amplitude_mean', 'amplitude_max', 'amplitude_min', 'amplitude_median', 'amplitude_std', 'amplitude_missing', 'amplitude_mean_lowpass', 'pitch_mean_lowpass', 'sound_intensity_mean_lowpass', 'amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman', 'sound_intensity_mean_kalman_max_freq', 'sound_intensity_mean_kalman_freq_weighted', 'sound_intensity_mean_kalman_pse', 'sound_intensity_mean_kalman_freq_0.0_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.1_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.2_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.3_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.4_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.5_Hz_ws_10', 'amplitude_mean_kalman_max_fr

In [8]:
#data.to_csv('/Users/macbookair/Desktop/ML4QS_Group125/data_w_features_ver2/data_w_all_features_on_0.5_secs_with_categorical.csv', index=False)

In [8]:
# Replace each categorical column by one indicator column per category,
# named `<column>_<category>`.
categorical_cols = []
for feature in columns_of_interest:
    categorical_cols.append(f'{feature}_categorical')
data = pd.get_dummies(data, prefix=categorical_cols, columns=categorical_cols)


In [9]:
# Show the names of the columns that were one-hot encoded.
categorical_cols

['amplitude_mean_kalman_categorical',
 'pitch_mean_kalman_categorical',
 'sound_intensity_mean_kalman_categorical']

In [10]:
# Inputs for CategoricalAbstraction.abstract_categorical:
# every column whose name contains 'categorical', each matched exactly.
cols = [name for name in data.columns if 'categorical' in name]
match = ['exact'] * len(cols)
min_support = 0.1
window_size = 10
max_pattern_size = 3

# Build the abstraction helper and run it over the table
ca = CategoricalAbstraction()
abstracted_data = ca.abstract_categorical(
    data,
    cols,
    match,
    min_support,
    window_size,
    max_pattern_size,
)
# Define a function that drops the leading rows of a table
def process_group(group, n_rows=10):
    """Return ``group`` without its first ``n_rows`` rows.

    Parameters
    ----------
    group : pandas.DataFrame or pandas.Series
        Object to trim; it is sliced by position and not modified.
    n_rows : int, default 10
        Number of leading rows to discard.
        NOTE(review): 10 presumably mirrors the abstraction window size;
        confirm before changing either value.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The rows of ``group`` from position ``n_rows`` onward.

    Raises
    ------
    ValueError
        If ``n_rows`` is negative (a negative slice start would keep only
        the tail instead of dropping the head).
    """
    if n_rows < 0:
        raise ValueError(f'n_rows must be non-negative, got {n_rows}')
    return group.iloc[n_rows:]

# Trim the leading rows of the abstracted table and renumber the index.
# process_group is called once on the whole table: DataFrame.apply would run
# it once per *column* (not per group), producing the same rows more slowly.
# NOTE(review): nothing here trims per recording/group. If that was the
# intent, an explicit groupby over the recording keys is needed -- confirm.
processed_data = process_group(abstracted_data).reset_index(drop=True)


amplitude_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_low
amplitude_mean_kalman_categorical_medium
pitch_mean_kalman_categorical_low
pitch_mean_kalman_categorical_medium
sound_intensity_mean_kalman_categorical_high
sound_intensity_mean_kalman_categorical_low
sound_intensity_mean_kalman_categorical_medium
Number of patterns of size 1 is 8
amplitude_mean_kalman_categorical_high(b)amplitude_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_high(b)amplitude_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(b)pitch_mean_kalman_categorical_low
amplitude_mean_kalman_categorical_high(b)pitch_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(c)pitch_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(b)sound_intensity_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_high(c)sound_intensity_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_high(b)sound_intensity_mean_kalman_categor

In [13]:
# Display the trimmed table (pandas truncates the rendering of large frames).
processed_data

Unnamed: 0,time_0.5,language,tone,participant,script,sound_intensity_max,sound_intensity_min,sound_intensity_mean,sound_intensity_median,sound_intensity_std,...,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)amplitude_mean_kalman_categorical_low,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)amplitude_mean_kalman_categorical_medium,temp_pattern_sound_intensity_mean_kalman_categorical_medium(c)amplitude_mean_kalman_categorical_medium,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)pitch_mean_kalman_categorical_low,temp_pattern_sound_intensity_mean_kalman_categorical_medium(c)pitch_mean_kalman_categorical_low,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)pitch_mean_kalman_categorical_medium,temp_pattern_sound_intensity_mean_kalman_categorical_medium(c)pitch_mean_kalman_categorical_medium,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)sound_intensity_mean_kalman_categorical_high,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)sound_intensity_mean_kalman_categorical_low,temp_pattern_sound_intensity_mean_kalman_categorical_medium(b)sound_intensity_mean_kalman_categorical_medium
0,5.0,ch,bus,subject1,t1,44.520166,32.332500,38.831527,38.961954,4.027372,...,0,1,1,0,0,1,1,0,0,1
1,5.5,ch,bus,subject1,t1,46.155397,29.116902,35.808502,34.457029,6.467960,...,0,0,0,1,1,0,0,0,0,1
2,6.0,ch,bus,subject1,t1,45.550824,29.670797,36.520156,37.200766,5.327840,...,0,1,1,1,1,0,0,0,0,1
3,6.5,ch,bus,subject1,t1,45.516573,29.934362,37.398395,38.494228,5.381269,...,0,1,1,0,0,1,1,0,0,1
4,7.0,ch,bus,subject1,t1,44.143883,31.525941,37.727902,36.640776,4.988859,...,0,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14124,270.0,en,bus,subject3,t4,46.812593,33.201463,41.220897,40.433230,5.284410,...,0,0,0,0,0,1,0,1,0,0
14125,270.5,en,bus,subject3,t4,43.207692,29.849771,36.441250,36.888085,5.700178,...,0,0,0,0,0,1,0,1,0,0
14126,271.0,en,bus,subject3,t4,44.473441,22.855573,33.845266,33.346548,7.452854,...,0,1,1,1,1,0,0,0,0,1
14127,271.5,en,bus,subject3,t4,23.765770,19.912261,21.420265,20.985575,1.446563,...,1,0,0,1,0,0,0,0,1,0


In [14]:
# Destination CSV for the processed table; the index is not written.
output_path = '/Users/nag/study/vu_ms_ai/ml4qs/ML4QS_Group125/data_w_features_ver2/data_w_categorical_temporal_patterns.csv'
processed_data.to_csv(path_or_buf=output_path, index=False)