In [53]:
import pandas as pd
from TemporalAbstraction import CategoricalAbstraction
from sklearn.cluster import KMeans
import numpy as np

In [54]:
# Read the 10-second-window feature table from disk into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = '/Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_all_features_on_10_secs.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [55]:
# Numeric feature columns that will be discretised into categories below
columns_of_interest = [
    'amplitude_mean_kalman_mean',
    'pitch_mean_kalman_mean',
    'sound_intensity_mean_kalman_mean',
]

In [56]:
# Function to categorize a feature using k-means clustering
def categorize_feature(data, feature, n_clusters=3, category_names=None):
    """Discretise one numeric column into ordered categories using k-means.

    K-means is fitted on the single column ``feature``. Clusters are then
    ranked by their centroid value and each rank is given a label, so the
    first label always belongs to the cluster with the smallest centroid.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is modified in place: a column
        named ``f'{feature}_categorical'`` is added (or overwritten).
    feature : str
        Name of the column to cluster.
    n_clusters : int, default 3
        Number of k-means clusters, i.e. number of categories.
    category_names : sequence of str, optional
        Labels in ascending centroid order; must contain exactly
        ``n_clusters`` entries. When omitted, the first ``n_clusters`` of
        ``['low', 'medium', 'high']`` are used for up to three clusters, and
        ``'level_0' ... 'level_{n_clusters - 1}'`` for more than three.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If ``category_names`` is given and its length differs from
        ``n_clusters``.
    """
    if category_names is None:
        default_names = ['low', 'medium', 'high']
        if n_clusters <= len(default_names):
            category_names = default_names[:n_clusters]
        else:
            # The three default labels cannot cover more clusters; previously
            # this case failed with an IndexError while building the mapping.
            category_names = [f'level_{rank}' for rank in range(n_clusters)]
    elif len(category_names) != n_clusters:
        raise ValueError(
            f'category_names has {len(category_names)} entries but n_clusters is {n_clusters}'
        )

    target_column = f'{feature}_categorical'

    # Fixed random_state keeps the cluster assignment reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    data[target_column] = kmeans.fit_predict(data[[feature]])

    # K-means cluster ids are arbitrary; order them by centroid value so that
    # labels are assigned from the lowest centroid to the highest.
    centroids = kmeans.cluster_centers_.flatten()
    ids_by_ascending_centroid = np.argsort(centroids)
    mapping = dict(zip(ids_by_ascending_centroid, category_names))

    data[target_column] = data[target_column].map(mapping)
    return data


In [57]:
# Add a '<feature>_categorical' label column for every feature of interest
for feature in columns_of_interest:
    data = categorize_feature(data=data, feature=feature)


In [58]:

# Show how many rows fall into each category, one table per feature
for feature in columns_of_interest:
    category_counts = data.groupby(f'{feature}_categorical').size()
    print(category_counts)

amplitude_mean_kalman_mean_categorical
high      172
low       242
medium    295
dtype: int64
pitch_mean_kalman_mean_categorical
high       79
low       327
medium    303
dtype: int64
sound_intensity_mean_kalman_mean_categorical
high      193
low       153
medium    363
dtype: int64


In [59]:
# List every column name so the newly added '*_categorical' columns can be checked
print(list(data.columns))

['amplitude_max_max', 'pitch_max_max', 'sound_intensity_max_max', 'amplitude_min_min', 'pitch_min_min', 'sound_intensity_min_min', 'amplitude_mean_kalman_mean', 'amplitude_mean_kalman_median', 'amplitude_mean_kalman_std', 'pitch_mean_kalman_mean', 'pitch_mean_kalman_median', 'pitch_mean_kalman_std', 'sound_intensity_mean_kalman_mean', 'sound_intensity_mean_kalman_median', 'sound_intensity_mean_kalman_std', 'sound_intensity_mean_kalman_max_freq_mean', 'sound_intensity_mean_kalman_freq_weighted_mean', 'sound_intensity_mean_kalman_freq_0.0_Hz_ws_10_mean', 'sound_intensity_mean_kalman_freq_0.1_Hz_ws_10_mean', 'sound_intensity_mean_kalman_freq_0.2_Hz_ws_10_mean', 'sound_intensity_mean_kalman_freq_0.3_Hz_ws_10_mean', 'sound_intensity_mean_kalman_freq_0.4_Hz_ws_10_mean', 'sound_intensity_mean_kalman_freq_0.5_Hz_ws_10_mean', 'amplitude_mean_kalman_max_freq_mean', 'amplitude_mean_kalman_freq_weighted_mean', 'amplitude_mean_kalman_freq_0.0_Hz_ws_10_mean', 'amplitude_mean_kalman_freq_0.1_Hz_ws_10

In [60]:
# Write the current frame to CSV, leaving out the row index
data.to_csv(
    path_or_buf='/Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_all_features_on_10_secs_with_categorical.csv',
    index=False,
)

In [61]:
# One-hot encode the categorical columns
categorical_cols = [f'{feature}_categorical' for feature in columns_of_interest]

# get_dummies replaces each encoded column with its indicator columns, and the
# result overwrites `data`. Running this cell a second time would therefore
# raise a KeyError for the already-consumed columns. Encoding only the label
# columns that are still present makes the cell safe to re-run.
pending_cols = [col for col in categorical_cols if col in data.columns]
if pending_cols:
    data = pd.get_dummies(data, columns=pending_cols, prefix=pending_cols)



In [62]:
# Define parameters for the CategoricalAbstraction

# Inputs are the one-hot indicator columns, recognised by 'categorical' in
# their name. The generated pattern columns are named
# 'temp_pattern_<...>_categorical_<...>' and match that substring too, so they
# are excluded explicitly. Otherwise a re-run of this cell could feed earlier
# patterns back in as inputs.
# NOTE(review): this matters only if abstract_categorical adds its columns to
# `data` in place; that implementation is not visible here. On a first run
# the extra condition filters nothing.
cols = [
    col for col in data.columns
    if 'categorical' in col and not col.startswith('temp_pattern')
]
match = ['exact'] * len(cols)  # one match mode per input column
min_support = 0.1
window_size = 3
max_pattern_size = 3

# Instantiate the CategoricalAbstraction class
ca = CategoricalAbstraction()

# Perform the temporal abstraction on the categorical data
abstracted_data = ca.abstract_categorical(data, cols, match, min_support, window_size, max_pattern_size)

# Save the resulting dataset with abstracted patterns
output_path = '/Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_temporal_patterns.csv'
abstracted_data.to_csv(output_path, index=False)

abstracted_data.head()


amplitude_mean_kalman_mean_categorical_high
amplitude_mean_kalman_mean_categorical_low
amplitude_mean_kalman_mean_categorical_medium
pitch_mean_kalman_mean_categorical_high
pitch_mean_kalman_mean_categorical_low
pitch_mean_kalman_mean_categorical_medium
sound_intensity_mean_kalman_mean_categorical_high
sound_intensity_mean_kalman_mean_categorical_low
sound_intensity_mean_kalman_mean_categorical_medium
Number of patterns of size 1 is 9
amplitude_mean_kalman_mean_categorical_high(b)amplitude_mean_kalman_mean_categorical_high
amplitude_mean_kalman_mean_categorical_high(b)amplitude_mean_kalman_mean_categorical_medium
amplitude_mean_kalman_mean_categorical_high(b)pitch_mean_kalman_mean_categorical_low
amplitude_mean_kalman_mean_categorical_high(b)pitch_mean_kalman_mean_categorical_medium
amplitude_mean_kalman_mean_categorical_high(c)pitch_mean_kalman_mean_categorical_medium
amplitude_mean_kalman_mean_categorical_high(b)sound_intensity_mean_kalman_mean_categorical_high
amplitude_mean_kalman_

Unnamed: 0,amplitude_max_max,pitch_max_max,sound_intensity_max_max,amplitude_min_min,pitch_min_min,sound_intensity_min_min,amplitude_mean_kalman_mean,amplitude_mean_kalman_median,amplitude_mean_kalman_std,pitch_mean_kalman_mean,...,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(c)amplitude_mean_kalman_mean_categorical_low,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)amplitude_mean_kalman_mean_categorical_medium,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(c)amplitude_mean_kalman_mean_categorical_medium,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)pitch_mean_kalman_mean_categorical_low,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(c)pitch_mean_kalman_mean_categorical_low,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)pitch_mean_kalman_mean_categorical_medium,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(c)pitch_mean_kalman_mean_categorical_medium,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)sound_intensity_mean_kalman_mean_categorical_high,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)sound_intensity_mean_kalman_mean_categorical_low,temp_pattern_sound_intensity_mean_kalman_mean_categorical_medium(b)sound_intensity_mean_kalman_mean_categorical_medium
0,-38.877735,3576.09375,48.890439,-69.445832,0.0,17.70852,-48.693475,-49.075474,2.917551,124.731984,...,0,0,0,0,0,0,0,0,0,0
1,-38.857052,3329.6875,49.471297,-70.190956,0.0,17.558749,-52.416906,-52.003896,1.661057,180.817376,...,0,0,0,0,0,0,0,0,0,0
2,-41.00591,386.875,49.068143,-69.700578,0.0,15.797836,-53.23728,-52.438886,1.797167,106.026268,...,0,0,0,0,0,0,0,0,0,0
3,-39.537541,3735.46875,49.464572,-70.333515,0.0,14.771213,-52.085849,-51.899239,1.359359,152.485909,...,0,0,0,0,0,0,0,0,0,0
4,-39.110809,463.738839,49.638964,-71.337295,0.0,14.771213,-53.236981,-53.220601,1.489945,132.459165,...,0,0,0,0,0,0,0,0,0,0
