In [1]:
import pandas as pd
from TemporalAbstraction import CategoricalAbstraction
from sklearn.cluster import KMeans
import numpy as np

In [2]:
file_path = '/Users/macbookair/Desktop/ML4QS_Group125/data_w_features_ver2/data_w_fourier_and_autocorr_0.5.csv'
data = pd.read_csv(file_path)

In [3]:
# Identify columns of interest
columns_of_interest = [
    'amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman']

In [4]:
# Function to categorize a feature using k-means clustering
def categorize_feature(data, n_clusters=3):
    # Identify columns of interest
    columns_of_interest = ['amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman']
    for feature in columns_of_interest:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        data[f'{feature}_categorical'] = kmeans.fit_predict(data[[feature]])
        
        # Assign meaningful category names
        centroids = kmeans.cluster_centers_.flatten()
        sorted_centroids = np.argsort(centroids)
        category_names = ['low', 'medium', 'high']
        mapping = {sorted_centroids[i]: category_names[i] for i in range(n_clusters)}
        
        data[f'{feature}_categorical'] = data[f'{feature}_categorical'].map(mapping)
    return data


In [5]:
# Group the data by language, tone, participant, and script
grouped_data = data.groupby(['language', 'tone', 'participant', 'script'])

# Define a function to process each group
def process_group(group):
    # Remove the first 10 rows of the group
    group = group.iloc[3:]
    return group

# Apply the process_group function to each group and reset the index
processed_data = grouped_data.apply(categorize_feature).reset_index(drop=True)

# Merge the processed data back into the original data
data = pd.concat([data[['language', 'tone', 'participant', 'script']], processed_data], axis=1)


In [6]:

# Display the number of each category for each column
for feature in columns_of_interest:
    print(data.groupby([f'{feature}_categorical']).size())  

amplitude_mean_kalman_categorical
high      5739
low       2421
medium    5979
dtype: int64
pitch_mean_kalman_categorical
high       911
low       7811
medium    5417
dtype: int64
sound_intensity_mean_kalman_categorical
high      6951
low       1568
medium    5620
dtype: int64


In [7]:
# Print out the column names to verify
print(data.columns.tolist())

['language', 'tone', 'participant', 'script', 'time_0.5', 'language', 'tone', 'participant', 'script', 'sound_intensity_max', 'sound_intensity_min', 'sound_intensity_mean', 'sound_intensity_median', 'sound_intensity_std', 'sound_intensity_missing', 'pitch_max', 'pitch_min', 'pitch_mean', 'pitch_median', 'pitch_std', 'pitch_missing', 'amplitude_mean', 'amplitude_max', 'amplitude_min', 'amplitude_median', 'amplitude_std', 'amplitude_missing', 'amplitude_mean_lowpass', 'pitch_mean_lowpass', 'sound_intensity_mean_lowpass', 'amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman', 'sound_intensity_mean_kalman_max_freq', 'sound_intensity_mean_kalman_freq_weighted', 'sound_intensity_mean_kalman_pse', 'sound_intensity_mean_kalman_freq_0.0_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.1_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.2_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.3_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.4_Hz_ws_10', 'sound_intensity_mean_kalman_freq

In [8]:
data.to_csv('/Users/macbookair/Desktop/ML4QS_Group125/data_w_features_ver2/data_w_all_features_on_0.5_secs_with_categorical.csv', index=False)

In [9]:
# One-hot encode the categorical columns
categorical_cols = [f'{feature}_categorical' for feature in columns_of_interest]
data = pd.get_dummies(data, columns=categorical_cols, prefix=categorical_cols)



In [10]:
# Define parameters for the CategoricalAbstraction
cols = [col for col in data.columns if 'categorical' in col]
match = ['exact' for _ in cols]
min_support = 0.1
window_size = 3
max_pattern_size = 3

# Instantiate the CategoricalAbstraction class
ca = CategoricalAbstraction()

# Perform the temporal abstraction on the categorical data
abstracted_data = ca.abstract_categorical(data, cols, match, min_support, window_size, max_pattern_size)
abstracted_data = data.groupby(['language', 'tone', 'participant', 'script'])
 

# Save the resulting dataset with abstracted patterns
output_path = '/Users/macbookair/Desktop/ML4QS_Group125/data_w_features_ver2/data_w_temporal_patterns.csv'
abstracted_data.to_csv(output_path, index=False)

abstracted_data.head()


amplitude_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_low
amplitude_mean_kalman_categorical_medium
pitch_mean_kalman_categorical_low
pitch_mean_kalman_categorical_medium
sound_intensity_mean_kalman_categorical_high
sound_intensity_mean_kalman_categorical_low
sound_intensity_mean_kalman_categorical_medium
Number of patterns of size 1 is 8
amplitude_mean_kalman_categorical_high(b)amplitude_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_high(b)amplitude_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(b)pitch_mean_kalman_categorical_low
amplitude_mean_kalman_categorical_high(c)pitch_mean_kalman_categorical_low
amplitude_mean_kalman_categorical_high(b)pitch_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(c)pitch_mean_kalman_categorical_medium
amplitude_mean_kalman_categorical_high(b)sound_intensity_mean_kalman_categorical_high
amplitude_mean_kalman_categorical_high(c)sound_intensity_mean_kalman_categorical_high
a

ValueError: Grouper for 'language' not 1-dimensional