In [44]:
import numpy as np

# Load the numpy array from the file
data = np.load('/kaggle/input/training/training_data.npy')
valid_periods = np.load('/kaggle/input/training/valid_periods.npy')
data.shape

(48000, 2776)

In [45]:
# Calculate the number of valid points for each function
num_valid_points = valid_periods[:, 1] - valid_periods[:, 0] + 1

# Filter out functions with less than 210 valid points
filtered_data = data[num_valid_points >= 219]

# Check the shape of the filtered data
print(filtered_data.shape)
data = filtered_data

(19826, 2776)


In [46]:
# Shuffle the datasets in unison
perm = np.random.permutation(data.shape[0])
data_shuffled = data[perm]
valid_periods_shuffled = valid_periods[perm]

K = 7500
# Split into training and validation sets
validation_data = data_shuffled[:K]
validation_periods = valid_periods_shuffled[:K]

training_data = data_shuffled[K:]
training_periods = valid_periods_shuffled[K:]

In [47]:
def apply_valid_periods(data, valid_periods):
    """
    Modify each function in 'data' based on the corresponding 'valid_periods'.

    Parameters:
    data (numpy.ndarray): Array of functions, shape (n_samples, n_features).
    valid_periods (numpy.ndarray): Array of valid periods, shape (n_samples, 2).

    Returns:
    numpy.ndarray: Modified data array.
    """
    modified_data = np.zeros_like(data)
    n_samples, n_features = data.shape

    for i in range(n_samples):
        left, right = valid_periods[i]
        # Assuming 'left' and 'right' are indices in the range [0, n_features-1]
        # Adjust them if they are in a different format
        modified_data[i, left:right+1] = data[i, left:right+1]

    return modified_data

# Apply the function to your datasets
modified_training_data = apply_valid_periods(training_data, training_periods)
modified_validation_data = apply_valid_periods(validation_data, validation_periods)
#modified_test_data = apply_valid_periods(test_data, test_periods)


In [48]:
modified_training_data.shape

(12326, 2776)

In [49]:
import numpy as np

def extract_non_overlapping_intervals(data, training_periods, interval_length=218):
    all_intervals = []

    for function, (left, right) in zip(data, training_periods):
        # Adjust right boundary to ensure intervals fit within the range
        right = min(right, len(function))

        # Extract intervals within the valid period
        for start_idx in range(left, right, interval_length):
            end_idx = min(start_idx + interval_length, right)
            interval = function[start_idx:end_idx]

            # Pad the interval if it's shorter than the interval length
            if len(interval) < interval_length:
                interval = np.pad(interval, (0, interval_length - len(interval)), 'constant')

            all_intervals.append(interval)

    return np.array(all_intervals)

# Example usage
final_training_data = extract_non_overlapping_intervals(modified_training_data, training_periods)
final_validation_data = extract_non_overlapping_intervals(modified_validation_data, validation_periods)
# final_test_data = extract_non_overlapping_intervals(modified_test_data, test_periods)


In [50]:
final_training_data.shape

(18767, 218)

In [51]:
import tensorflow as tf
import joblib
encoder = tf.keras.models.load_model('/kaggle/input/models/encoder_model.h5')
svm = joblib.load('/kaggle/input/models/svm_model.joblib')

In [52]:
# Assuming final_training_data and final_validation_data have shape (n_samples, 209)
# and you want to select the first 200 columns
X_train_modified = final_training_data[:, :200]
X_val_modified = final_validation_data[:, :200]


def predict_cluster(encoder, svm, data):
    """
    Predict cluster assignments using an encoder and SVM classifier.

    Parameters:
    encoder (tf.keras.Model): Trained Keras encoder model.
    svm (sklearn.svm.SVC or similar): Trained SVM classifier.
    data (numpy.ndarray): Data to be clustered, shape (n_samples, n_features).

    Returns:
    numpy.ndarray: Cluster assignments.
    """
    # Assuming the encoder expects data with an additional dimension
    data_reshaped = data.reshape((data.shape[0], data.shape[1], 1))

    # Encode the data
    encoded_data = encoder.predict(data_reshaped)

    # Flatten the encoded data if necessary (depends on encoder's output shape)
    if len(encoded_data.shape) > 2:
        encoded_data = encoded_data.reshape((encoded_data.shape[0], -1))

    # Predict probabilities
    #probabilities = svm.predict_proba(encoded_data)

    # Select the most probable cluster among the first four
    #cluster_assignments = np.argmax(probabilities[:, :4], axis=1)
    # Use the SVM classifier to predict clusters
    cluster_assignments = svm.predict(encoded_data)

    return cluster_assignments


# Assuming you have an 'encoder' and a 'classifier'
# and a function 'predict_cluster' that returns cluster assignments
cluster_assignments_train = predict_cluster(encoder, svm, X_train_modified)
cluster_assignments_train[:] = 0
cluster_assignments_val = predict_cluster(encoder, svm, X_val_modified)
cluster_assignments_val[:] = 0

def split_data_by_clusters(original_data, clusters):
    cluster_data = {}
    for cluster in set(clusters):
        cluster_data[cluster] = original_data[clusters == cluster]
    return cluster_data

# final_training_data and final_validation_data have the original shape (something, 209)
training_data_by_cluster = split_data_by_clusters(final_training_data, cluster_assignments_train)
validation_data_by_cluster = split_data_by_clusters(final_validation_data, cluster_assignments_val)




In [53]:
import numpy as np
from collections import Counter

# Assuming cluster_assignments_train is your array
# cluster_assignments_train = np.array([...])  # Your array here

# Count occurrences using NumPy
unique, counts = np.unique(cluster_assignments_train, return_counts=True)
cluster_counts = dict(zip(unique, counts))

# Alternatively, using collections.Counter
# cluster_counts = Counter(cluster_assignments_train)

print(cluster_counts)


{0: 18767}


In [62]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, TerminateOnNaN
import tensorflow as tf

def train_model_for_cluster(full_train_cluster_data, full_val_cluster_data, epochs=30, batch_size=64):
    """
    Train a neural network model for a specific data cluster, predicting the last 9 points.

    Parameters:
    full_train_cluster_data (numpy.ndarray): Full training data for the cluster.
    full_val_cluster_data (numpy.ndarray): Full validation data for the cluster.
    epochs (int): Number of epochs to train.
    batch_size (int): Batch size for training.

    Returns:
    tf.keras.Model: Trained Keras model.
    """

    # Split data into features (X) and labels (y)
    print(full_train_cluster_data.shape, full_val_cluster_data.shape)
    X_train_cluster = full_train_cluster_data[:, :-18]
    y_train_cluster = full_train_cluster_data[:, -18:]
    X_val_cluster = full_val_cluster_data[:, :-18]
    y_val_cluster = full_val_cluster_data[:, -18:]

    # Define the model architecture
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(X_train_cluster.shape[1], 1)))
    model.add(Dense(18))  # Predicting 9 points

    # Compile the model
    optimizer = SGD(lr=0.0001)  # Using a lower learning rate with gradient clipping
    model.compile(optimizer=optimizer, loss='mse')

    # Reshape data for LSTM
    X_train_reshaped = X_train_cluster.reshape((X_train_cluster.shape[0], X_train_cluster.shape[1], 1))
    X_val_reshaped = X_val_cluster.reshape((X_val_cluster.shape[0], X_val_cluster.shape[1], 1))

    # Define callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    terminate_on_nan = TerminateOnNaN()

    # Train the model
    model.fit(X_train_reshaped, y_train_cluster, validation_data=(X_val_reshaped, y_val_cluster), epochs=epochs, batch_size=batch_size, callbacks=[early_stopping, terminate_on_nan])

    return model


In [63]:
model = train_model_for_cluster(training_data_by_cluster[0], validation_data_by_cluster[0])
model.save('model.keras')

(18767, 218) (11462, 218)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
