In [29]:
# Import necessary libraries
import os
import pandas as pd
from Bio import SeqIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tqdm import tqdm
from pathos.multiprocessing import ProcessingPool as Pool

# Configure TensorFlow to use GPU
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    pass

# Function to parse a single .fna file
def parse_fna_file(filepath):
    sequences = []
    filenames = []
    for record in SeqIO.parse(filepath, "fasta"):
        sequences.append(str(record.seq))
        filenames.append(os.path.basename(filepath))
    return sequences, filenames

# Function to parse all .fna files using multiprocessing
def parse_fna_files(directory):
    filepaths = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith(".fna")]
    with Pool() as pool:
        results = list(tqdm(pool.imap(parse_fna_file, filepaths), total=len(filepaths), desc="Parsing .fna files"))
    sequences = [seq for result in results for seq in result[0]]
    filenames = [fname for result in results for fname in result[1]]
    return sequences, filenames

# Function to get k-mers from a sequence
def get_kmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

# Function to extract features from sequences using multiprocessing
def extract_features(sequences, k=6):
    with Pool() as pool:
        kmers = list(tqdm(pool.imap(lambda seq: ' '.join(get_kmers(seq, k)), sequences), total=len(sequences), desc="Extracting k-mers"))
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(kmers)
    return X, vectorizer

def main():
    # Directory containing your .fna files
    fna_directory = './fna/'

    # Parse .fna files
    sequences, filenames = parse_fna_files(fna_directory)

    # Extract features
    X, vectorizer = extract_features(sequences)

    # Load labels from a CSV file
    labels_df = pd.read_csv('label.csv')
    labels_df['resistant_phenotype'] = labels_df['resistant_phenotype'].map({'resistant': 1, 'susceptible': 0})

    # Map labels to sequences based on filenames
    labels = []
    for filename in tqdm(filenames, desc="Mapping labels"):
        genbank_id = '.'.join(filename.split('.')[:2])  # Correctly handle filenames with multiple periods
        label_row = labels_df.loc[labels_df['genbank_id'] == genbank_id, 'resistant_phenotype']
        if not label_row.empty:
            label = label_row.values[0]
            labels.append(label)
        else:
            print(f"Warning: GenBank ID {genbank_id} not found in labels file. Skipping this file.")
            # Optionally, you can append a default label or handle it differently
            # labels.append(default_label)

    # Convert labels to a numpy array and ensure they are numerical
    y = pd.Series(labels).astype(float).values

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform PCA for dimension reduction
    pca = PCA(n_components=100)  # Adjust the number of components as needed
    X_train_pca = pca.fit_transform(X_train.toarray())
    X_test_pca = pca.transform(X_test.toarray())

    # Define the deep learning model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train_pca.shape[1],)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model with a progress bar
    history = model.fit(X_train_pca, y_train, epochs=10, batch_size=32, validation_data=(X_test_pca, y_test), verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test_pca, y_test)
    print(f'Test Accuracy: {accuracy:.4f}')

    # Save the model
    model.save('amr_prediction_model.h5')

    # Load the model for future use
    # model = load_model('amr_prediction_model.h5')

    # Make predictions
    # predictions = model.predict(X_test_pca)

if __name__ == '__main__':
    main()


Parsing .fna files: 100%|██████████| 36/36 [00:00<00:00, 67.05it/s]
Extracting k-mers: 100%|██████████| 128718/128718 [02:32<00:00, 845.09it/s] 
Mapping labels: 100%|██████████| 128718/128718 [00:33<00:00, 3891.89it/s]


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.5928 - loss: 0.8397 - val_accuracy: 0.6893 - val_loss: 0.6075
Epoch 2/10
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6659 - loss: 0.6192 - val_accuracy: 0.6877 - val_loss: 0.5895
Epoch 3/10
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6820 - loss: 0.5993 - val_accuracy: 0.6906 - val_loss: 0.5834
Epoch 4/10
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6868 - loss: 0.5911 - val_accuracy: 0.7018 - val_loss: 0.5690
Epoch 5/10
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6911 - loss: 0.5872 - val_accuracy: 0.6889 - val_loss: 0.5747
Epoch 6/10
[1m3218/3218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6928 - loss: 0.5815 - val_accuracy: 0.6990 - val_loss: 0.5702
Epoch 7/10
[1m3218/3



Test Accuracy: 0.7035


In [7]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D

In [8]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [9]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

model.compile(loss=tf.keras.losses.categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adadelta(),
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(x_test, y_test))

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 28ms/step - accuracy: 0.1037 - loss: 2.3006 - val_accuracy: 0.1732 - val_loss: 2.2710
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.1725 - loss: 2.2666 - val_accuracy: 0.3905 - val_loss: 2.2273
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.2656 - loss: 2.2240 - val_accuracy: 0.5631 - val_loss: 2.1692
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.3574 - loss: 2.1669 - val_accuracy: 0.6418 - val_loss: 2.0901
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.4417 - loss: 2.0882 - val_accuracy: 0.6892 - val_loss: 1.9828
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.5007 - loss: 1.9840 - val_accuracy: 0.7179 - val_loss: 1.8410
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x7f8b58675bb0>

In [11]:
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 1.0944098234176636
Test accuracy: 0.8021000027656555


In [12]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
