In [36]:
import pandas as pd
from TemporalAbstraction import CategoricalAbstraction
from sklearn.cluster import KMeans
import numpy as np

In [37]:
# Load the dataset
file_path = '//Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_all_features_0.5.csv'
data = pd.read_csv(file_path)

In [38]:
# Identify columns of interest
columns_of_interest = [
    'amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman']

In [39]:
# Function to categorize a feature using k-means clustering
def categorize_feature(data, feature, n_clusters=3):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    data[f'{feature}_categorical'] = kmeans.fit_predict(data[[feature]])
    
    # Assign meaningful category names
    centroids = kmeans.cluster_centers_.flatten()
    sorted_centroids = np.argsort(centroids)
    category_names = ['low', 'medium', 'high']
    mapping = {sorted_centroids[i]: category_names[i] for i in range(n_clusters)}
    
    data[f'{feature}_categorical'] = data[f'{feature}_categorical'].map(mapping)
    return data


In [40]:
# Apply categorization to each feature of interest
for feature in columns_of_interest:
    data = categorize_feature(data, feature)


In [41]:

# Display the number of each category for each column
for feature in columns_of_interest:
    print(data.groupby([f'{feature}_categorical']).size())  

amplitude_mean_kalman_categorical
high      3620
low       3047
medium    7472
dtype: int64
pitch_mean_kalman_categorical
high       490
low       8439
medium    5210
dtype: int64
sound_intensity_mean_kalman_categorical
high      4354
low       2390
medium    7395
dtype: int64


In [31]:
# Print out the column names to verify
print(data.columns.tolist())

['time_0.5', 'language', 'tone', 'participant', 'script', 'sound_intensity_max', 'sound_intensity_min', 'sound_intensity_mean', 'sound_intensity_median', 'sound_intensity_std', 'sound_intensity_missing', 'pitch_max', 'pitch_min', 'pitch_mean', 'pitch_median', 'pitch_std', 'pitch_missing', 'amplitude_mean', 'amplitude_max', 'amplitude_min', 'amplitude_median', 'amplitude_std', 'amplitude_missing', 'amplitude_mean_lowpass', 'pitch_mean_lowpass', 'sound_intensity_mean_lowpass', 'amplitude_mean_kalman', 'pitch_mean_kalman', 'sound_intensity_mean_kalman', 'sound_intensity_mean_kalman_max_freq', 'sound_intensity_mean_kalman_freq_weighted', 'sound_intensity_mean_kalman_pse', 'sound_intensity_mean_kalman_freq_0.0_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.1_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.2_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.3_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.4_Hz_ws_10', 'sound_intensity_mean_kalman_freq_0.5_Hz_ws_10', 'amplitude_mean_kalman_max_fr

In [32]:
data.to_csv('/Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_all_features_on_0.5_secs_with_categorical.csv', index=False)

In [42]:
# One-hot encode the categorical columns
categorical_cols = [f'{feature}_categorical' for feature in columns_of_interest]
data = pd.get_dummies(data, columns=categorical_cols, prefix=categorical_cols)



In [44]:
# Define parameters for the CategoricalAbstraction
cols = [col for col in data.columns if 'categorical' in col]
match = ['exact' for _ in cols]
min_support = 0.1
window_size = 10
max_pattern_size = 1

# Instantiate the CategoricalAbstraction class
ca = CategoricalAbstraction()

# Extract unique trials
trials = data[['language', 'tone', 'participant', 'script']].drop_duplicates()
abstracted_data = pd.DataFrame()

# Loop over each unique trial
for trial in trials.itertuples(index=False, name=None):
    language, tone, participant, script = trial
    
    # Select data for the current trial
    trial_data = data[(data['language'] == language) & 
                     (data['tone'] == tone) & 
                     (data['participant'] == participant) & 
                     (data['script'] == script)]
    
    print(trial_data.shape)
    # Perform temporal abstraction on the current trial data
    abstracted_trial = ca.abstract_categorical(trial_data, cols, match, min_support, window_size, max_pattern_size)
    abstracted_data = pd.concat([abstracted_data, abstracted_trial])


# Save the resulting dataset with abstracted patterns
output_path = '/Users/macbookair/Desktop/ML4QS_Group125/data_w_features/data_w_temporal_patterns.csv'
abstracted_data.to_csv(output_path, index=False)

abstracted_data.head()


(346, 75)
amplitude_mean_kalman_categorical_high


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_table[self.pattern_prefix + self.to_string(pattern)] = 0


IndexError: index 346 is out of bounds for axis 0 with size 346