Cell 1: Import Libraries

In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from karateclub import Graph2Vec


Cell 2: Define File Reading and k-mer Generation Functions

In [4]:
def read_sequences_from_file(filename):
    """Read tab-separated records from a text file.

    Lines that are empty or contain only whitespace are skipped. Every other
    line is stripped of leading/trailing whitespace and split on tab
    characters.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per non-blank line, each entry being the list
        of tab-separated fields found on that line.
    """
    records = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            records.append(stripped.split('\t'))
    return records

def chop_into_kmers(sequences, k):
    """Slide a window of width ``k`` over every sequence.

    Args:
        sequences: Iterable of ``(sequence, label)`` pairs.
        k: Window width in characters.

    Returns:
        A list of ``(kmer, label)`` tuples in sequence order, then in
        left-to-right window order. Each k-mer carries the label of the
        sequence it was cut from. A sequence shorter than ``k`` contributes
        no k-mers.
    """
    return [
        (seq[start:start + k], label)
        for seq, label in sequences
        for start in range(len(seq) - k + 1)
    ]


Cell 3: Define Overlap Finding Function

In [5]:
def find_overlaps(kmers, min_overlap_length):
    """Find suffix/prefix overlaps between k-mers.

    For every ordered pair of distinct entries ``(i, j)``, reports the
    shortest overlap of at least ``min_overlap_length`` characters in which a
    suffix of k-mer ``i`` equals a prefix of k-mer ``j``.

    Instead of comparing every pair of k-mers, the k-mers are indexed by
    their first ``min_overlap_length`` characters. A suffix can only match
    k-mers whose prefix starts with the same characters, so only those
    candidates are verified.

    Args:
        kmers: Iterable of ``(kmer, label)`` pairs. Labels are ignored here.
        min_overlap_length: Smallest overlap length to report; must be >= 1.

    Returns:
        A list of ``(kmer1, kmer2, overlap_length)`` tuples, ordered by the
        position of ``kmer1`` in ``kmers`` and then by the position of
        ``kmer2``. Duplicate k-mer strings at different positions are treated
        as distinct entries, so they can overlap each other.

    Raises:
        ValueError: If ``min_overlap_length`` is smaller than 1. A zero-length
            overlap matches every pair and negative lengths are meaningless.
    """
    if min_overlap_length < 1:
        raise ValueError("min_overlap_length must be a positive integer")

    strings = [kmer for kmer, _ in kmers]

    # Prefix index: first `min_overlap_length` characters -> positions of the
    # k-mers starting with them. K-mers shorter than the minimum overlap can
    # never take part in an overlap, so they are left out.
    prefix_index = {}
    for position, kmer in enumerate(strings):
        if len(kmer) >= min_overlap_length:
            prefix_index.setdefault(kmer[:min_overlap_length], []).append(position)

    overlaps = []
    for i, kmer1 in enumerate(strings):
        shortest = {}  # partner position -> shortest qualifying overlap length
        # Overlap lengths are tried in increasing order, so the first hit for
        # a partner is its shortest overlap.
        for ol in range(min_overlap_length, len(kmer1) + 1):
            suffix = kmer1[len(kmer1) - ol:]
            seed = suffix[:min_overlap_length]
            for j in prefix_index.get(seed, ()):
                if j == i or j in shortest:
                    continue
                # startswith() is False when the partner is shorter than the
                # suffix, which caps the overlap at the shorter k-mer's length.
                if strings[j].startswith(suffix):
                    shortest[j] = ol
        for j in sorted(shortest):
            overlaps.append((kmer1, strings[j], shortest[j]))
    return overlaps


Cell 4: Define Graph Construction and Plotting Functions

In [6]:
def construct_overlap_graph(overlaps):
    """Build an undirected, weighted graph from overlap triples.

    Each distinct k-mer string becomes one node. Nodes are numbered with
    consecutive integers starting at 0, in the order the k-mers first appear
    in ``overlaps``. The k-mer string itself is stored in the node attribute
    ``'label'`` and the overlap length in the edge attribute ``'weight'``.

    Args:
        overlaps: Iterable of ``(kmer1, kmer2, overlap_length)`` triples.

    Returns:
        The populated ``nx.Graph``. It contains only k-mers that occur in at
        least one triple.
    """
    node_ids = {}
    graph = nx.Graph()

    for source_kmer, target_kmer, overlap_len in overlaps:
        # setdefault hands out the next free id only the first time a k-mer is seen.
        source_id = node_ids.setdefault(source_kmer, len(node_ids))
        target_id = node_ids.setdefault(target_kmer, len(node_ids))
        graph.add_edge(source_id, target_id, weight=overlap_len)

    id_to_kmer = {node_id: kmer for kmer, node_id in node_ids.items()}
    nx.set_node_attributes(graph, id_to_kmer, 'label')
    return graph

def plot_graph(graph, file_name='overlap_graph.png', seed=None):
    """Draw the graph with a spring layout and save it as an image file.

    Args:
        graph: Graph to draw. The node attribute ``'label'`` is used as the
            text shown on each node.
        file_name: Path of the image file to write.
        seed: Optional seed forwarded to ``nx.spring_layout``. Pass an integer
            to get the same layout on every run; the default ``None`` keeps
            the layout random.

    Returns:
        None. The figure is written to ``file_name`` and then closed.
    """
    fig = plt.figure(figsize=(10, 10))
    try:
        labels = nx.get_node_attributes(graph, 'label')
        pos = nx.spring_layout(graph, seed=seed)
        # nx.draw is called without ax, so it draws on the current figure,
        # which is the one created above.
        nx.draw(graph, pos, labels=labels, with_labels=True)
        fig.savefig(file_name)
    finally:
        # Close even when layout, drawing or saving fails, so that figures do
        # not accumulate when this is called in a loop.
        plt.close(fig)


Cell 5: Define Graph Embedding and Graph Export Functions


In [7]:
def graph2vec_embedding(graph, dimensions=64):
    """Fit a Graph2Vec model on a single graph and return its embedding.

    Args:
        graph: The graph to embed. It is passed to the model as a one-element
            list.
        dimensions: Embedding size handed to ``Graph2Vec``. Defaults to 64,
            the value this notebook has always used.

    Returns:
        Whatever ``Graph2Vec.get_embedding()`` returns for the fitted model.
        Presumably an array with one row per fitted graph (callers squeeze
        it); TODO confirm against the karateclub documentation.
    """
    model = Graph2Vec(dimensions=dimensions)
    model.fit([graph])
    return model.get_embedding()

def export_graph(graph, export_file_name):
    """Write ``graph`` to ``export_file_name`` in GraphML format.

    Args:
        graph: The networkx graph to serialise.
        export_file_name: Destination path of the GraphML file.
    """
    nx.write_graphml(graph, export_file_name)


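#### Cell 6: Main Function Execution (disabled: the code below is wrapped in an HTML comment and is superseded by the step-by-step cells that follow)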


<!-- def main(filename, k_values, min_overlap_values):
    results = {}
    sequences = read_sequences_from_file(filename)

    for k in k_values:
        kmers = chop_into_kmers(sequences, k)
        for min_overlap in min_overlap_values:
            overlaps = find_overlaps(kmers, min_overlap)
            graph = construct_overlap_graph(overlaps)

            plot_graph(graph, f'overlap_graph_k{k}_min{min_overlap}.png')
            export_graph(graph, f'overlap_graph_k{k}_min{min_overlap}.graphml')

            embeddings = graph2vec_embedding(graph).squeeze()
            embeddings_df = pd.DataFrame([embeddings], index=['Graph'])
            embeddings_df.to_csv(f'embeddings_k{k}_min{min_overlap}.csv')

            y = pd.Series({kmer[:5]+'...'+kmer[-5:]: label for kmer, label in kmers})
            X = embeddings_df
            y = pd.Series([y.iloc[0]])

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            classifier = svm.SVC()
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

            report = classification_report(y_test, y_pred, output_dict=True)
            accuracy = accuracy_score(y_test, y_pred)
            results[(k, min_overlap)] = {'report': report, 'accuracy': accuracy}

    return results

# Example usage
filename = 'hum_med.txt'
k_values = [100, 200, 300]
min_overlap_values = [50, 100, 150]
results = main(filename, k_values, min_overlap_values) -->


Cell 1: Load and Preprocess Data


In [1]:
# Read the input records; the k-mer generation step below consumes `sequences`
filename = 'input_data/hum_med_15.txt'
sequences = read_sequences_from_file(filename=filename)


NameError: name 'read_sequences_from_file' is not defined

Cell 2: Generate k-mers for Each k-value


In [9]:
# Build one k-mer list per window size, keyed by that window size
k_values = [100, 200, 300]
kmers_dict = dict(
    (kmer_size, chop_into_kmers(sequences, kmer_size))
    for kmer_size in k_values
)


Cell 3: Find Overlaps and Construct Graphs


In [None]:
# Find overlaps and construct one graph per (k, min_overlap) combination
min_overlap_values = [50, 100, 150]
graphs = {}
for k, kmers in kmers_dict.items():
    for min_overlap in min_overlap_values:
        # chop_into_kmers only emits windows of exactly k characters, and an
        # overlap can never be longer than the k-mers themselves. For
        # min_overlap > k the search would scan every pair of k-mers and still
        # return nothing, leaving an empty graph, so the combination is skipped.
        if min_overlap > k:
            print(f"Skipping k={k}, min_overlap={min_overlap}: overlap cannot exceed k-mer length")
            continue

        overlaps = find_overlaps(kmers, min_overlap)
        graph = construct_overlap_graph(overlaps)
        graphs[(k, min_overlap)] = graph
        plot_graph(graph, f'overlap_graph_k{k}_min{min_overlap}.png')
        export_graph(graph, f'overlap_graph_k{k}_min{min_overlap}.graphml')


KeyboardInterrupt: 

Cell 4: Embed Graphs, Then Train and Evaluate the Classifier


In [None]:
# Embed graphs and prepare for classification
results = {}

# Placeholder class label for a whole graph: the label of the first k-mer
# generated with the largest k. It does not depend on the loop variables, so it
# is looked up once instead of rebuilding a dict of every k-mer per iteration.
reference_kmers = kmers_dict[max(k_values)]
graph_label = reference_kmers[0][1] if reference_kmers else None

for (k, min_overlap), graph in graphs.items():
    embeddings = graph2vec_embedding(graph).squeeze()
    embeddings_df = pd.DataFrame([embeddings], index=['Graph'])
    embeddings_df.to_csv(f'embeddings_k{k}_min{min_overlap}.csv')

    X = embeddings_df
    y = pd.Series([graph_label] * len(X), index=X.index)

    # A train/test split needs at least two samples and a classifier needs at
    # least two classes. Each configuration currently yields one embedding row
    # with one label, so splitting would raise. The reason is recorded instead,
    # so the embeddings for every configuration are still written.
    # NOTE(review): a meaningful evaluation presumably needs one graph (and one
    # label) per input sequence rather than one graph per configuration.
    if len(X) < 2 or y.nunique() < 2:
        reason = (
            f'need at least 2 samples and 2 classes, '
            f'got {len(X)} sample(s) and {y.nunique()} class(es)'
        )
        print(f"Skipping classification for k={k}, min_overlap={min_overlap}: {reason}")
        results[(k, min_overlap)] = {'report': None, 'accuracy': None, 'skipped': reason}
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate classifier
    classifier = svm.SVC()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    results[(k, min_overlap)] = {'report': report, 'accuracy': accuracy}


Cell 5: Output Results


In [None]:
# Print the accuracy and classification report stored for each configuration
for (k_size, overlap_min), metrics in results.items():
    print(f"Results for k={k_size} and min_overlap={overlap_min}:")
    print(f"Accuracy: {metrics['accuracy']}")
    print("Classification Report:")
    print(metrics['report'])
