In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# --- Load and clean ---
def load_sequences(filename):
    """Return the non-blank lines of a text file, stripped of surrounding whitespace.

    Args:
        filename: Path of the text file to read (one record per line).

    Returns:
        A list of stripped line strings; lines that are empty or contain only
        whitespace are skipped.
    """
    records = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                records.append(stripped)
    return records

def clean_sequence(seq):
    """Upper-case a raw sequence and keep only the characters A, C, G and T.

    Every other character (for example N, whitespace, digits or punctuation)
    is dropped rather than replaced, so the result may be shorter than the input.
    """
    return ''.join(base for base in seq.upper() if base in 'ACGT')

# --- k-mer tokenizer ---
def kmer_tokenizer(seq, k=6):
    """Split a sequence into overlapping k-mers joined by single spaces.

    A window of width ``k`` slides over ``seq`` one position at a time, so a
    sequence of length ``n >= k`` yields ``n - k + 1`` k-mers. Joining them with
    spaces lets a word-level tokenizer treat each k-mer as one token.

    Args:
        seq: Sequence string to split.
        k: Window width; must be at least 1. Defaults to 6.

    Returns:
        The k-mers separated by single spaces, or an empty string when ``seq``
        is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1. Without this guard ``k=0`` would
            return a string of spaces and a negative ``k`` would return
            irregular slices instead of failing.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    return ' '.join(seq[start:start + k] for start in range(len(seq) - k + 1))

# Load data: read each species file and strip every record down to A/C/G/T
human = list(map(clean_sequence, load_sequences("human_data.txt")))
chimp = list(map(clean_sequence, load_sequences("chimp_data.txt")))
dog = list(map(clean_sequence, load_sequences("dog_data.txt")))

# Balance data: truncate every species to the size of the smallest one
# (the first min_len records of each file are kept)
min_len = min(map(len, (human, chimp, dog)))
human, chimp, dog = [species[:min_len] for species in (human, chimp, dog)]

# Labels: concatenate in a fixed human -> chimp -> dog order and build the
# label list in that same order so sequences[i] pairs with labels[i]
sequences = [*human, *chimp, *dog]
labels = [name for name in ('human', 'chimp', 'dog') for _ in range(min_len)]

# Apply k-mer: one space-separated string of overlapping 6-mers per sequence
kmers = [kmer_tokenizer(seq, k=6) for seq in sequences]

# Tokenize
# Word-level tokenization: each space-separated k-mer becomes one integer id.
# Note that the vocabulary is fitted on the full dataset, before the split.
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(kmers)
X = tokenizer.texts_to_sequences(kmers)
# pad_sequences pads and truncates at the front by default, so a sequence with
# more than 500 k-mers keeps only its last 500 tokens.
X = pad_sequences(X, maxlen=500)  # pad to uniform length

# Encode labels
# Integer-encode the class names, then one-hot encode them for the
# categorical_crossentropy loss used when the model is compiled.
le = LabelEncoder()
y = le.fit_transform(labels)
y_cat = to_categorical(y)

# Train/test split
# Stratify on the integer labels so the class balance established when the
# data was truncated to min_len per species is preserved in both subsets
# (an unstratified random split can leave the subsets unevenly balanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# --- Build CNN model ---
# Embedding table size: one row per k-mer in the tokenizer vocabulary, plus one
# extra row because word_index ids start at 1 and id 0 is used for padding.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    # Map every k-mer id to a 128-dimensional vector (500 tokens per sample)
    Embedding(input_dim=vocab_size, output_dim=128, input_length=500),
    # 64 filters, each looking at 5 consecutive k-mers
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # Keep only the strongest response of each filter along the sequence
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),  # 3 classes
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# --- Train ---
# Ten epochs in mini-batches of 32. NOTE: the held-out test split is also
# passed as validation data, so the per-epoch val_* metrics and the final
# test score below are computed on the same samples.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)

# --- Evaluate ---
# evaluate() returns the compiled loss followed by the compiled metric (accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"Test Accuracy: {acc:.2%}")




Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 128ms/step - accuracy: 0.3390 - loss: 1.0949 - val_accuracy: 0.5396 - val_loss: 1.0006
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 103ms/step - accuracy: 0.5553 - loss: 0.9640 - val_accuracy: 0.6308 - val_loss: 0.8768
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 127ms/step - accuracy: 0.6864 - loss: 0.7833 - val_accuracy: 0.6227 - val_loss: 0.7979
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 122ms/step - accuracy: 0.7251 - loss: 0.6646 - val_accuracy: 0.6673 - val_loss: 0.7469
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 115ms/step - accuracy: 0.8176 - loss: 0.5076 - val_accuracy: 0.6755 - val_loss: 0.7654
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 122ms/step - accuracy: 0.8383 - loss: 0.4513 - val_accuracy: 0.6815 - val_loss: 0.7626
Epoch 7/10
[1m62/62[0m 

In [None]:
import pickle

# Persist the trained network in the native single-file .keras format
model.save("dna_cnn_model.keras")

# Pickle the fitted tokenizer and label encoder so the same vocabulary and
# class mapping can be reloaded alongside the model
for pkl_path, fitted_obj in (("tokenizer.pkl", tokenizer), ("label_encoder.pkl", le)):
    with open(pkl_path, "wb") as f:
        pickle.dump(fitted_obj, f)



In [None]:
from google.colab import files

# Trigger a browser download for each saved artifact, in order:
# model, tokenizer, label encoder
for artifact in ("dna_cnn_model.keras", "tokenizer.pkl", "label_encoder.pkl"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>