#Step 1: Import

In [89]:
!pip install gdown



In [90]:
from google.colab import drive
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Step 2: Retreiving the data

In [91]:
test_path='/content/drive/MyDrive/ML_PROJECT/test_set.csv'
train_path='/content/drive/MyDrive/ML_PROJECT/training_set.csv'

In [92]:
training_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

#Step 3 : Preprocessing

In [93]:
def build_kmers(sequence, ksize=6):
    return [sequence[i:i + ksize] for i in range(len(sequence) - ksize + 1)]

In [94]:
def preprocess_data(dataset):
    kmer_list = []
    labels = []
    for index, row in dataset.iterrows():
        seq = row['Sequence']
        kmer = build_kmers(seq)
        kmer_list.append(kmer)
        if 'Type' in row:
            labels.append(int(row['Type'][5]) - 1)
    return kmer_list, labels

In [95]:
X_train, train_labels = preprocess_data(training_set)
X_test, _ = preprocess_data(test_set)

In [96]:
print(len(train_labels))
print(len(X_train))
print(len(X_test))

1320
1320
400


In [88]:
w2v_model = Word2Vec(X_train, min_count=1, window=5, vector_size=100, workers=2)

In [97]:
sample_seq = X_train[0]
sample_seq_kmer = sample_seq[0]
print(f'sample kmer: {sample_seq_kmer}')
print(f'Sample kmer w2v: {w2v_model.wv[sample_seq_kmer]}')

sample kmer: TACCAC
Sample kmer w2v: [ 2.3776479   2.6899946  -0.33890477 -1.1045977  -4.639715    1.9600029
  3.7117543  -2.9170442  -2.0395372  -1.5153934  -0.69263804  0.68987876
  1.527658    0.2610632   1.7462404  -0.4663194  -0.18102738 -3.9838674
  0.8829599  -0.19136968 -1.8968443  -0.09358894  2.1116765   4.033797
 -5.2704387  -2.6897018   3.5855355  -0.52097887  3.8898957  -0.57485557
  3.7048821   0.63062936 -1.2361848  -1.8291693  -5.157497    0.9854158
  1.2868885  -0.69918823 -2.1952283  -3.0263798  -2.1148303   1.0708619
 -2.9052565  -1.8970504   5.9707327  -0.6581267  -2.3918142  -1.910047
 -4.0771403  -0.8347278  -4.0109587  -1.831637   -3.4295733   1.6310542
 -2.1554422  -5.786013   -0.31428227 -2.3201156  -3.7162282   1.4674144
  2.8870544  -5.9982514   4.0642004  -5.2227087  -1.5005418   5.4540305
  2.6667504  -0.9510236   2.3661218  -0.9992615  -0.42011404  2.8512185
 -2.096042    2.8722057   0.29571018 -1.7142171   0.8305318   1.8911843
  0.9015202  -0.04330012  1

In [98]:
def create_seqs_embedding(w2v_model, seqs):
  seqs_embedding = np.array([np.zeros(100)])
  for seq in seqs:
    vector = np.zeros(100)
    for word in seq:
      vector += w2v_model.wv[word]
    number_of_word_in_seq = len(seq)
    normalized_vector = vector / number_of_word_in_seq
    seqs_embedding = np.append(seqs_embedding, [normalized_vector], axis=0)

  return seqs_embedding[1: ]

In [139]:
X_train_embedding = create_seqs_embedding(w2v_model, X_train)

In [140]:
X_test_embedding = create_seqs_embedding(w2v_model, X_test)

In [141]:
num_classes = max(train_labels) + 1  # Find the number of unique classes
train_labels = np.array(train_labels)

# Convert to one-hot encoding
train_labels_modified = np.eye(num_classes)[train_labels]

#Step 4 : Data Splitting





In [142]:
X_train_embedding, X_val_embedding, Y_train_labels, Y_val_labels = train_test_split(
    X_train_embedding, train_labels_modified, test_size=0.2, random_state=42)

In [143]:
print(X_train_embedding.shape)
print(X_val_embedding.shape)
print(Y_train_labels.shape)
print(Y_val_labels.shape)

(1056, 100)
(264, 100)
(1056, 6)
(264, 6)



#Step 5 : Building Model and validation Accuracy


In [104]:
def activate(x, activation_fn):
    if activation_fn == 'sigmoid':
        return 1 / (1 + np.exp(-x))
    elif activation_fn == 'tanh':
        return np.tanh(x)
    elif activation_fn == 'relu':
        return np.maximum(0, x)
    else:
        raise ValueError("Activation function not supported.")

def activate_derivative(x, activation_fn):
    if activation_fn == 'sigmoid':
        return x * (1 - x)
    elif activation_fn == 'tanh':
        return 1 - x ** 2
    elif activation_fn == 'relu':
        return np.where(x > 0, 1, 0)
    else:
        raise ValueError("Activation function not supported.")

In [105]:

# Global variable to keep track of model parameters
model_params = {}

def initialize_network(X, Y, hidden_layer_sizes, activation_fn='relu', init_method='he'):
    """
    Initialize network with specified hidden layer sizes.
    hidden_layer_sizes: List of integers representing each hidden layer's size.
    """
    input_size = X.shape[1]
    output_size = Y.shape[1]

    # Store activation function type
    model_params['activation_fn'] = activation_fn
    model_params['input_size'] = input_size
    model_params['output_size'] = output_size
    model_params['hidden_layer_sizes'] = hidden_layer_sizes

    # Initialize weights and biases for each layer
    layer_sizes = [input_size] + hidden_layer_sizes + [output_size]
    for l in range(len(layer_sizes) - 1):
        if init_method == 'he':
            scale = np.sqrt(2 / layer_sizes[l])  # He initialization
        else:
            scale = np.sqrt(1 / layer_sizes[l])  # Default (Glorot/Xavier)

        model_params[f'weights_{l + 1}'] = np.random.randn(layer_sizes[l], layer_sizes[l + 1]) * scale
        model_params[f'bias_{l + 1}'] = np.zeros((1, layer_sizes[l + 1]))
        model_params[f'weights_{l + 1}_adagrad'] = np.zeros_like(model_params[f'weights_{l + 1}'])
        model_params[f'bias_{l + 1}_adagrad'] = np.zeros_like(model_params[f'bias_{l + 1}'])


def forward_pass(X, verbose=False):
    # Perform forward pass through all hidden layers and output layer
    A = X
    caches = {}
    for l in range(1, len(model_params['hidden_layer_sizes']) + 2):
        W = model_params[f'weights_{l}']
        b = model_params[f'bias_{l}']
        Z = np.dot(A, W) + b
        A = activate(Z, model_params['activation_fn'])
        caches[f'Z_{l}'] = Z
        caches[f'A_{l}'] = A

        if verbose:
            print(f"Layer {l}: output shape = {A.shape}")

    return caches

def back_propagate(X, y, caches, learning_rate, l2_reg=0.01):
    """
    Perform backpropagation through all layers, including L2 regularization.
    """
    L = len(model_params['hidden_layer_sizes']) + 1  # Number of layers

    # Output layer calculations
    output = caches[f'A_{L}']
    error_output = y - output
    delta_output = error_output * activate_derivative(output, model_params['activation_fn'])

    # Backpropagate through all hidden layers
    deltas = {L: delta_output}
    for l in range(L - 1, 0, -1):
        Z = caches[f'Z_{l}']
        A = caches[f'A_{l}']
        W_next = model_params[f'weights_{l + 1}']
        deltas[l] = (deltas[l + 1].dot(W_next.T)) * activate_derivative(A, model_params['activation_fn'])

    # Update all layers with Adagrad and L2 regularization
    for l in range(1, L + 1):
        W = model_params[f'weights_{l}']
        b = model_params[f'bias_{l}']
        A_prev = X if l == 1 else caches[f'A_{l - 1}']

        # Adagrad optimization update
        model_params[f'weights_{l}_adagrad'] += np.square(A_prev.T.dot(deltas[l]))
        model_params[f'bias_{l}_adagrad'] += np.square(np.sum(deltas[l], axis=0, keepdims=True))

        weight_update = (A_prev.T.dot(deltas[l]) / (np.sqrt(model_params[f'weights_{l}_adagrad']) + 1e-8)) * learning_rate
        bias_update = (np.sum(deltas[l], axis=0, keepdims=True) / (np.sqrt(model_params[f'bias_{l}_adagrad']) + 1e-8)) * learning_rate

        model_params[f'weights_{l}'] += weight_update - learning_rate * l2_reg * W
        model_params[f'bias_{l}'] += bias_update





In [106]:
def train(X_train, y_train, epochs, learning_rate, l2_reg=0.01):
    for epoch in range(epochs):
        caches = forward_pass(X_train)
        back_propagate(X_train, y_train, caches, learning_rate, l2_reg=l2_reg)
    y_train_normalized = (y_train - y_train.mean()) / y_train.std()
    final_output = caches[f'A_{len(model_params["hidden_layer_sizes"]) + 1}']
    print(f"Train Error MSE: %.5f" % mse(y_train_normalized, final_output))
    print(f"Training Accuracy: {(np.round(final_output) == y_train).mean()}")

def mse(y1, y2):
    return np.mean((y1 - y2) ** 2)

In [154]:
initialize_network(X_train_embedding, Y_train_labels, hidden_layer_sizes=[64, 64, 32,32], activation_fn='relu', init_method='he')
train(X_train_embedding, Y_train_labels, epochs=1000, learning_rate=0.001, l2_reg=0.01)

Train Error MSE: 0.61515
Training Accuracy: 0.944760101010101


In [135]:
def evaluate_model(X, y_true):
    caches = forward_pass(X)
    final_output = caches[f'A_{len(model_params["hidden_layer_sizes"]) + 1}']

    # For classification: Predict classes
    predicted_classes = np.argmax(final_output, axis=1)
    true_classes = np.argmax(y_true, axis=1)

    # Compute accuracy
    accuracy = np.mean(predicted_classes == true_classes)

    # Compute MSE based on class indices for educational purposes (not common in practice for classification)
    mse_value = np.mean((predicted_classes - true_classes) ** 2)

    return accuracy, mse_value


In [155]:
val_accuracy, mse_value = evaluate_model(X_val_embedding, Y_val_labels)

# Print the results
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"TEST ERROR MSE: {mse_value:.4f}")

Validation Accuracy: 0.8295
TEST ERROR MSE: 0.1818



#Step 6 : Testing with new Data

In [110]:
def test(X_test):
    caches = forward_pass(X_test)
    final_output = caches[f'A_{len(model_params["hidden_layer_sizes"]) + 1}']
    predicted_classes = np.argmax(final_output, axis=1)
    print(f"Predicted Classes: {predicted_classes}")

In [156]:
test(X_test_embedding)

Predicted Classes: [4 4 2 0 5 0 5 2 3 0 3 2 0 3 3 2 5 2 2 2 3 0 4 5 0 4 2 2 2 2 0 2 5 0 5 0 2
 4 4 5 4 3 4 4 2 5 3 4 0 0 3 2 3 2 2 5 2 5 4 2 0 2 2 2 2 5 3 4 5 5 5 2 0 5
 0 0 3 5 5 0 4 3 0 0 2 4 5 3 2 4 2 0 0 5 4 2 4 0 3 0 2 2 0 0 2 3 2 2 2 2 2
 0 3 0 3 3 2 2 2 3 5 0 5 3 4 2 5 0 2 2 2 5 4 0 0 2 0 0 2 2 4 0 5 3 0 0 0 2
 0 3 5 5 0 5 2 4 4 2 5 2 0 0 3 3 2 2 4 3 0 0 0 2 5 0 2 0 5 3 4 5 4 5 4 4 5
 2 2 2 5 3 5 2 2 5 5 5 0 0 4 3 0 2 2 0 0 0 4 2 0 0 0 0 0 2 2 5 2 2 3 4 5 3
 0 3 2 3 5 4 3 5 4 4 5 5 3 4 2 5 5 2 0 0 3 3 3 4 0 0 5 0 5 2 0 0 2 0 2 5 5
 4 2 2 5 0 2 2 0 0 4 3 0 2 0 0 0 0 2 2 2 5 0 5 3 0 2 3 2 2 3 2 0 2 5 5 0 0
 5 0 0 5 5 2 3 0 5 4 5 2 3 0 3 2 0 5 4 4 0 2 4 3 4 0 2 2 3 0 0 0 0 0 2 4 0
 2 2 3 0 4 0 4 3 2 0 2 2 0 4 3 3 2 0 4 3 0 0 0 0 0 0 2 4 5 0 5 2 0 5 3 0 0
 0 0 2 4 5 2 0 0 3 0 0 5 2 5 2 3 0 5 5 2 2 0 0 5 2 0 2 0 4 0]
