INCONSISTENCY HANDLING

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from node2vec import Node2Vec
import logging
import os

# Root logger setup: INFO and above are emitted, and every record is rendered as
# "<timestamp> - <level> - <message>" so pipeline runs are easy to trace and debug.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def read_sequences_from_file(filename):
    """Load the non-blank rows of a tab-separated text file.

    Every line is stripped of surrounding whitespace and then split on tab
    characters. Lines that are empty after stripping are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        tab-separated fields found on that line.
    """
    rows = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            rows.append(stripped.split('\t'))
    return rows

def chop_into_kmers(sequences, k):
    """Slide a window of width ``k`` over every sequence.

    Args:
        sequences: Iterable of ``(sequence, label)`` pairs.
        k: Length of each subsequence (k-mer).

    Returns:
        A list of ``(kmer, label)`` tuples, in sequence order and then by
        start position. Sequences shorter than ``k`` contribute nothing.
    """
    pieces = []
    for seq, label in sequences:
        window_count = len(seq) - k + 1
        for start in range(window_count):
            pieces.append((seq[start:start + k], label))
    return pieces

def find_overlaps(kmers, min_overlap_length):
    """Find suffix/prefix overlaps between every ordered pair of k-mers.

    For each pair of distinct positions ``(i, j)`` the candidate overlap
    lengths are tried in increasing order, starting at
    ``min_overlap_length``. The first length at which the first k-mer ends
    with the matching prefix of the second one is recorded, so the reported
    value is the shortest qualifying overlap.

    Args:
        kmers: Sequence of ``(kmer, label)`` tuples; labels are ignored.
        min_overlap_length: Smallest overlap length to consider.

    Returns:
        A list of ``(kmer1, kmer2, overlap_length)`` tuples.
    """
    found = []
    for left_idx, (left, _) in enumerate(kmers):
        for right_idx, (right, _) in enumerate(kmers):
            if left_idx == right_idx:
                # A k-mer is never compared with its own list position.
                continue
            longest_possible = min(len(left), len(right))
            candidate_sizes = range(min_overlap_length, longest_possible + 1)
            first_match = next(
                (size for size in candidate_sizes if left.endswith(right[:size])),
                None,
            )
            if first_match is not None:
                found.append((left, right, first_match))
    return found

def construct_overlap_graph(overlaps):
    """Turn overlap records into an undirected, weighted networkx graph.

    Args:
        overlaps: Iterable of ``(kmer1, kmer2, overlap_length)`` tuples.

    Returns:
        An ``nx.Graph`` with one node per k-mer and one edge per overlapping
        pair; the overlap length is stored in the ``weight`` edge attribute.
        Because the graph is undirected, a later record for the same pair
        overwrites the weight of an earlier one.
    """
    overlap_graph = nx.Graph()
    # Each 3-tuple is interpreted as (u, v, weight).
    overlap_graph.add_weighted_edges_from(overlaps)
    return overlap_graph

def plot_graph(graph, file_name):
    """Draw the overlap graph with a spring layout and save it to disk.

    Node labels are abbreviated to their first and last five characters so
    long k-mers stay legible. Nodes of ten characters or fewer are shown in
    full, because abbreviating them would repeat characters.

    Args:
        graph: networkx graph whose nodes are k-mer strings.
        file_name: Destination path passed to ``plt.savefig``.
    """
    node_count = graph.order()
    # Spacing shrinks with the square root of the node count. With no nodes
    # the expression would divide by zero, so the layout default (None) is
    # used instead.
    spacing = 1 / (node_count ** 0.5) * 2 if node_count else None

    figure = plt.figure(figsize=(10, 10))
    try:
        pos = nx.spring_layout(graph, scale=2, k=spacing)
        labels = {
            node: node if len(node) <= 10 else node[:5] + '...' + node[-5:]
            for node in graph.nodes()
        }
        nx.draw(graph, pos, labels=labels, with_labels=True, node_size=50, font_size=8)
        plt.savefig(file_name)
    finally:
        # Release the figure even when layout, drawing or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(figure)

def node2vec_embedding(graph):
    """Fit a Node2Vec model on the graph to obtain node embeddings.

    Args:
        graph: Graph exposing ``number_of_nodes()``; it is passed unchanged
            to ``Node2Vec``.

    Returns:
        The object returned by ``Node2Vec.fit`` (64-dimensional embeddings,
        walks of length 30, 200 walks per node), or ``None`` when the graph
        has no nodes.
    """
    if graph.number_of_nodes() != 0:
        walker = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
        return walker.fit(window=10, min_count=1, batch_words=4)

    # Nothing to embed: report it and let the caller decide what to do.
    logging.warning("Graph is empty. Skipping embedding.")
    return None

def export_graph(graph, export_file_name):
    """Write the graph to ``export_file_name`` as a GraphML document.

    Args:
        graph: networkx graph to serialise.
        export_file_name: Destination path passed to ``nx.write_graphml``.
    """
    destination = export_file_name
    nx.write_graphml(graph, destination)

def evaluate_graph(graph):
    """Compute and log summary metrics that describe the graph's structure.

    Args:
        graph: networkx graph to measure.

    Returns:
        A dict with the keys ``'density'``, ``'num_components'`` (number of
        connected components) and ``'avg_clustering'`` (average clustering
        coefficient).
    """
    density = nx.density(graph)
    num_connected_components = nx.number_connected_components(graph)
    avg_clustering_coefficient = nx.average_clustering(graph)

    summary = dict(
        density=density,
        num_components=num_connected_components,
        avg_clustering=avg_clustering_coefficient,
    )
    logging.info(f"Graph Density: {density}, Connected Components: {num_connected_components}, Average Clustering Coefficient: {avg_clustering_coefficient}")
    return summary

def process_kmers(filename, k, min_overlap_values):
    """Run the k-mer pipeline for one ``k`` across several overlap thresholds.

    For each usable ``min_overlap`` the function builds the overlap graph,
    stores a PNG rendering and a GraphML export under
    ``graphs/<input-name>_k<k>_min<min_overlap>/``, embeds the nodes with
    Node2Vec, and trains and evaluates an SVM that predicts each k-mer's
    label from its embedding.

    A threshold is skipped, with a warning, when it exceeds ``k``, when the
    resulting graph has no edges, or when fewer than two distinct labels are
    present. It is also skipped when no embedding model is produced.

    Args:
        filename: Path of the tab-separated input file of sequences and labels.
        k: Length of the k-mers to cut from every sequence.
        min_overlap_values: Iterable of minimum overlap lengths to try.

    Returns:
        A dict keyed by ``(k, min_overlap)``. Each value is a dict with the
        keys ``'report'`` (classification report as a dict), ``'accuracy'``
        and ``'graph_metrics'``.
    """
    base_filename = os.path.splitext(os.path.basename(filename))[0]
    results = {}
    kmers = chop_into_kmers(read_sequences_from_file(filename), k)
    # When the same k-mer occurs more than once, the last label seen wins.
    kmer_dict = dict(kmers)

    for min_overlap in min_overlap_values:
        if min_overlap > k:
            logging.warning(f"Skipping min_overlap={min_overlap} as it is greater than k={k}")
            continue

        graph = construct_overlap_graph(find_overlaps(kmers, min_overlap))
        if graph.number_of_edges() == 0:
            logging.warning(f"Graph is empty for k={k} and min_overlap={min_overlap}. Skipping.")
            continue

        graph_metrics = evaluate_graph(graph)

        # Persist a picture and a GraphML export of this graph.
        graph_dir = f'graphs/{base_filename}_k{k}_min{min_overlap}'
        os.makedirs(graph_dir, exist_ok=True)
        plot_graph(graph, f'{graph_dir}/overlap_graph_k{k}_min{min_overlap}.png')
        export_graph(graph, f'{graph_dir}/graph_k{k}_min{min_overlap}.graphml')

        model = node2vec_embedding(graph)
        if model is None:
            continue

        # Feature matrix: one embedding row per node, indexed by node id.
        vectors = model.wv.vectors
        node_ids = model.wv.index_to_key
        X = pd.DataFrame(vectors, index=node_ids)

        # Target labels, put in the same row order as the features.
        known_labels = {node: kmer_dict[node] for node in node_ids if node in kmer_dict}
        y = pd.Series(known_labels).reindex(X.index)

        if y.nunique() < 2:
            logging.warning(f"Not enough classes to train SVM for k={k} and min_overlap={min_overlap}. Need at least 2 classes.")
            continue

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        classifier = svm.SVC()
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        results[(k, min_overlap)] = {
            'report': classification_report(y_test, y_pred, output_dict=True),
            'accuracy': accuracy_score(y_test, y_pred),
            'graph_metrics': graph_metrics,
        }
        logging.info(f"Completed min_overlap={min_overlap} for k={k}")

    return results

filename = 'input_data/hum_med_5.txt'
k_values = [150, 250, 350]
min_overlap_values = [50, 100, 149]

# Collect the results of every k in one dict. Keys are (k, min_overlap), so
# runs for different k never collide. Rebinding `results` on each iteration
# would leave only the last k available to the summary below.
results = {}
for k in k_values:
    k_results = process_kmers(filename, k, min_overlap_values)
    logging.info(f"Results for k={k}: {k_results}")
    results.update(k_results)

# Print the results
for key, value in results.items():
    print(f"Results for k={key[0]} and min_overlap={key[1]}:")
    print(f"Accuracy: {value['accuracy']}")
    print("Classification Report:")
    print(value['report'])
    print("Graph Metrics:")
    print(value['graph_metrics']['density'], value['graph_metrics']['num_components'], value['graph_metrics']['avg_clustering'])
