## Section 1: Scraping ArXiv for 25 research papers

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_arxiv_papers(search_url, output_csv="arxiv_papers.csv", timeout=30):
    """
    Scrape an arXiv search-results page and save paper titles and PDF links to a CSV.

    Only results that expose a "pdf" link are written; results without one are
    skipped.

    Args:
        search_url: URL of the arXiv search-results page to fetch.
        output_csv: Path of the CSV file to write (columns "Title", "PDF Link").
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        None. Prints a message and returns early (without writing the CSV)
        when the HTTP status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout an unresponsive server would block the notebook forever.
    response = requests.get(search_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve page.")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    paper_data = []
    for paper in soup.find_all("li", class_="arxiv-result"):
        title_tag = paper.find("p", class_="title is-5 mathjax")
        title = title_tag.text.strip() if title_tag else "N/A"

        # The "pdf" anchor lives inside the list-title paragraph of each result.
        link_block = paper.find("p", class_="list-title is-inline-block")
        pdf_anchor = link_block.find("a", string="pdf") if link_block else None
        if not pdf_anchor:
            continue

        pdf_link = pdf_anchor["href"]
        if pdf_link != "N/A":
            paper_data.append([title, pdf_link])

    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "PDF Link"])
        writer.writerows(paper_data)

    print(f"Saved {len(paper_data)} papers to {output_csv}")

# Example usage: run an advanced arXiv search (the URL asks for size=25 results per page)
# and write the titles and PDF links to the default "arxiv_papers.csv".
search_url = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=algorithms&terms-0-field=all&terms-1-operator=OR&terms-1-term=data+processing+&terms-1-field=all&terms-2-operator=OR&terms-2-term=machine+learning&terms-2-field=all&terms-3-operator=OR&terms-3-term=llm&terms-3-field=all&terms-4-operator=OR&terms-4-term=analytics&terms-4-field=all&classification-computer_science=y&classification-physics_archives=all&classification-statistics=y&classification-include_cross_list=exclude&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size=25&order=-announced_date_first"
scrape_arxiv_papers(search_url)


In [None]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub.
# NOTE(review): presumably needed so the Mistral/Llama checkpoints loaded later
# can be downloaded — confirm whether those repos are gated for this account.
notebook_login()

In [None]:
!pip install -q langchain
!pip install -q langchain-community
!pip install -q langchain-chroma
!pip install -q langchain-huggingface
!pip install -q -U bitsandbytes
!pip install -q bs4
!pip install -q rank_bm25
!pip install -q huggingface_hub
!pip install -q requests
!pip install PyPDF2
!pip install bert-score
!pip install faiss-cpu
!pip install networkx matplotlib 
!pip install python-Levenshtein

In [None]:
import os
import requests
import numpy as np
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter

def download_pdf_from_link(pdf_url, save_path, timeout=60):
    """
    Download a PDF over HTTP and write it to disk.

    Args:
        pdf_url: URL of the PDF to fetch.
        save_path: Destination file path; written in binary mode.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        ``save_path`` once the file has been written.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    print("Downloading PDF from the link...")
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. HTTP Status Code: {response.status_code}")

    with open(save_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print(f"PDF downloaded successfully and saved to {save_path}")
    return save_path


def split_text_into_documents(text: str, chunk_size: int = 1000, overlap: int = 100):
    """
    Split raw text on newlines into overlapping chunks wrapped as Documents.

    Args:
        text: The text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        A list with one ``Document`` per chunk, in splitter order.
    """
    newline_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    documents = []
    for piece in newline_splitter.split_text(text):
        documents.append(Document(page_content=piece))
    return documents


## Section 2: LLM Setup 

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
def load_mistral_llm():
    """
    Load Mistral-7B-Instruct-v0.3 in 4-bit and wrap it as a LangChain LLM.

    The model is quantized with bitsandbytes (4-bit weights, float16 compute,
    double quantization) and placed with ``device_map="auto"``.

    Returns:
        A ``(llm, tokenizer)`` tuple where ``llm`` is a ``HuggingFacePipeline``
        around a "text-generation" pipeline and ``tokenizer`` is the matching
        tokenizer.
    """
    model_id = "mistralai/Mistral-7B-Instruct-v0.3"

    four_bit_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=four_bit_config,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        min_length=30
    )
    return HuggingFacePipeline(pipeline=text_generator), tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

def load_llama_llm():
    """
    Load Llama-3.1-8B-Instruct in 4-bit.

    Unlike ``load_mistral_llm``, the model is returned bare (not wrapped in a
    pipeline), so callers drive it through ``model.generate`` themselves.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "meta-llama/Llama-3.1-8B-Instruct"
    four_bit_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=four_bit_config,
        device_map="auto"
    )
    return model, tokenizer

## Section 3: LLM Initialization

In [None]:
from langchain_huggingface import HuggingFaceEndpoint
import torch
import gc
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load both large models once at module level; setup_llm() hands out these
# same instances instead of reloading them on every call.
mistral_llm, mistral_tokenizer = load_mistral_llm()
# NOTE: load_llama_llm() returns the raw model rather than a HuggingFacePipeline,
# so `llama_llm` must be used via `.generate(...)`, not `.invoke(...)`.
llama_llm, llama_tokenizer = load_llama_llm()

def setup_llm(model_name):
    """
    Return the ``(llm, tokenizer)`` pair for a supported model name.

    "MistralAI" and "Llama" return the module-level instances loaded at import
    time. "BART-Base", "T5-Base" and "BigBird" build a fresh Hugging Face
    pipeline (on GPU 0 when CUDA is available, otherwise on CPU) wrapped in a
    ``HuggingFacePipeline``.

    Args:
        model_name: One of "MistralAI", "Llama", "BART-Base", "T5-Base", "BigBird".

    Returns:
        A ``(llm, tokenizer)`` tuple.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "MistralAI":
        return mistral_llm, mistral_tokenizer
    if model_name == "Llama":
        return llama_llm, llama_tokenizer

    # model name -> (pipeline task, checkpoint, max_length, min_length)
    seq2seq_specs = {
        "BART-Base": ("summarization", "facebook/bart-base", 512, 30),
        "T5-Base": ("text2text-generation", "google-t5/t5-base", 512, 100),
        "BigBird": ("summarization", "google/bigbird-pegasus-large-arxiv", 1024, 30),
    }
    if model_name not in seq2seq_specs:
        raise ValueError(f"Unknown model: {model_name}")

    task, checkpoint, max_length, min_length = seq2seq_specs[model_name]
    hf_pipeline = pipeline(
        task,
        model=checkpoint,
        tokenizer=checkpoint,
        max_length=max_length,
        min_length=min_length,
        device=0 if torch.cuda.is_available() else -1
    )
    llm = HuggingFacePipeline(pipeline=hf_pipeline)
    return llm, AutoTokenizer.from_pretrained(checkpoint)

## Section 4: Building RAG Chain

In [None]:
from rank_bm25 import BM25Okapi
from langchain_core.runnables import RunnablePassthrough

class BM25Retriever:
    """Keyword retriever that ranks document texts with BM25 (whitespace tokens)."""

    def __init__(self, documents):
        """
        Index the given documents.

        Args:
            documents: Objects exposing a ``page_content`` string.
        """
        self.documents = documents
        self.corpus = [document.page_content for document in documents]
        # BM25Okapi expects each document as a list of tokens.
        self.tokenize_corpus = [passage.split() for passage in self.corpus]
        self.bm25 = BM25Okapi(self.tokenize_corpus)

    def retrieve(self, query, k=5):
        """
        Return the ``k`` best-matching corpus entries for ``query``.

        The result is whatever ``get_top_n`` yields when ranking ``self.corpus``,
        i.e. raw text entries rather than the original document objects.
        """
        query_tokens = query.split()
        return self.bm25.get_top_n(query_tokens, self.corpus, n=k)


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

def build_faiss(documents: list[Document]) -> FAISS:
    """
    Embed the documents with all-mpnet-base-v2 and index them in FAISS.

    Args:
        documents: Documents to embed and store.

    Returns:
        The FAISS vector store built from ``documents``.
    """
    mpnet_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return FAISS.from_documents(documents, mpnet_embeddings)


In [None]:
from langchain.schema import Document

class EnsembleRetriever:
    """Combine dense (FAISS) and sparse (BM25) retrieval results into one list."""

    def __init__(self, faiss_retriever, bm25_retriever):
        """Store the FAISS vector store and the BM25 retriever to query."""
        self.faiss_retriever = faiss_retriever
        self.bm25_retriever = bm25_retriever

    def get_relevant_documents(self, query: str, k: int = 5):
        """
        Query both retrievers and return up to ``k`` de-duplicated Documents.

        FAISS hits come first, followed by BM25 hits. Two hits count as
        duplicates when the first 60 characters of their text are equal; plain
        string hits are wrapped in ``Document`` before being returned.
        """
        # Never ask either backend for more results than it holds.
        effective_k = min(
            k,
            len(self.faiss_retriever.index_to_docstore_id),
            len(self.bm25_retriever.corpus),
        )
        dense_hits = self.faiss_retriever.similarity_search(query, k=effective_k)
        sparse_hits = self.bm25_retriever.retrieve(query, k=effective_k)

        seen_prefixes = set()
        merged = []
        for hit in dense_hits + sparse_hits:
            text = hit.page_content if isinstance(hit, Document) else hit
            prefix = text[:60]
            if prefix in seen_prefixes:
                continue
            seen_prefixes.add(prefix)
            if isinstance(hit, str):
                hit = Document(page_content=hit)
            merged.append(hit)

        return merged[:k]


In [None]:
from langchain.prompts import PromptTemplate

def format_docs(docs):
    """
    Render documents as a numbered list, one single-line snippet per document.

    Args:
        docs: Objects exposing a ``page_content`` string; may be empty or None.

    Returns:
        "No relevant context found" when ``docs`` is empty/None, otherwise the
        snippets joined by newlines as "1. ...", "2. ...", and so on.
    """
    if not docs:
        return "No relevant context found"

    def _single_line(text):
        # Collapse line breaks so every snippet occupies exactly one line.
        return text.strip().replace("\n", " ").replace("\r", " ")

    return "\n".join(
        f"{position}. {_single_line(doc.page_content)}"
        for position, doc in enumerate(docs, start=1)
    )


# Prompt template for style-conditioned summarisation; takes the target style,
# the retrieved context and the original text.
# NOTE(review): not referenced by the code visible in this notebook section —
# confirm whether it is still needed.
style_prompt = PromptTemplate(
    input_variables=["style", "context", "original_text"],
    template="""
    Summarize the following text in a {style} style:
    Context: {context}
    Original Text: {original_text}
    Summary:
    """
)


In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time
from langchain.prompts import PromptTemplate

def chunk_text(text, chunk_size=2000, overlap=200):
    """
    Split text into overlapping chunks for summarization.

    Args:
        text: The text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunk strings produced by the recursive splitter.
    """
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    pieces = recursive_splitter.split_text(text)
    return pieces

def build_rag_chain(llm, tokenizer, faiss_retriever, bm25_retriever):
    """
    Build a retrieval-augmented summarisation callable for one model.

    Args:
        llm: Object exposing ``invoke(prompt)``.
        tokenizer: Tokenizer used to bound the prompt length, or None to skip
            all token-level truncation.
        faiss_retriever: FAISS vector store used for dense retrieval.
        bm25_retriever: ``BM25Retriever`` used for sparse retrieval.

    Returns:
        A function ``rag_chain(inputs, model_name)`` where ``inputs`` is a dict
        with "question" and "style" keys. It returns the model output, or ""
        when no context is retrieved, when a token id falls outside the
        tokenizer vocabulary, or when the model call raises.
    """
    ensemble_retriever = EnsembleRetriever(
        faiss_retriever=faiss_retriever,
        bm25_retriever=bm25_retriever,
    )

    def rag_chain(inputs, model_name):
        question = inputs["question"]
        style = inputs["style"]

        # Test the retrieved documents themselves: format_docs() returns a
        # non-empty placeholder string for an empty list, so checking the
        # formatted context could never detect missing context.
        context_docs = ensemble_retriever.get_relevant_documents(question, k=5)
        if not context_docs:
            print(f"{model_name}: No valid context, returning empty summary")
            return ""
        context = format_docs(context_docs)

        # Per-model input budget, in tokens.
        max_input_tokens = 400 if model_name == "T5-Base" else 2048 if model_name == "BigBird" else 700

        if tokenizer is not None:
            context_ids = tokenizer(context, return_tensors="pt", truncation=False).input_ids
            # An id outside the vocabulary cannot be handled by the model; give up early.
            if context_ids.max().item() >= tokenizer.vocab_size:
                return ""
            if context_ids.shape[1] > max_input_tokens:
                context = tokenizer.decode(
                    context_ids[0, :max_input_tokens], skip_special_tokens=True
                )

        if "T5" in model_name:
            # T5 checkpoints expect a task prefix.
            prompt_template = "summarize: {context}\n\nQuestion: {question}\nStyle: {style}"
        else:
            prompt_template = "{context}\n\nQuestion: {question}\nStyle: {style}"

        prompt = PromptTemplate(
            input_variables=["context", "question", "style"],
            template=prompt_template
        ).format(context=context, question=question, style=style)

        if tokenizer is not None:
            # Re-truncate the assembled prompt: question and style add tokens
            # on top of the already bounded context.
            prompt_ids = tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens
            ).input_ids
            if prompt_ids.max().item() >= tokenizer.vocab_size:
                return ""
            prompt = tokenizer.decode(prompt_ids[0], skip_special_tokens=True)

        # NOTE(review): fixed 10 s pause before every model call; kept as-is —
        # its purpose is not evident from this file, confirm it is still needed.
        time.sleep(10)
        try:
            result = llm.invoke(prompt)
            if model_name == "MistralAI":
                # Unwrap pipeline-style outputs if the Mistral wrapper returns them.
                if isinstance(result, list) and result:
                    result = result[0].get("generated_text", "")
                elif isinstance(result, dict):
                    result = result.get("generated_text", "")
        except Exception as e:
            print(f"{model_name} Invoke Failed: {str(e)}")
            return ""
        return result

    return rag_chain


## Section 5: Extracting Methodology Sections

In [None]:
import re
from PyPDF2 import PdfReader

def convert_pdf_to_text(pdf_path):
    """
    Extract the text of every page of a PDF and concatenate it.

    Args:
        pdf_path: Path (or file object) accepted by ``PdfReader``.

    Returns:
        The page texts joined in page order, with no separator between pages.
    """
    reader = PdfReader(pdf_path)
    # join() avoids quadratic string building on long papers; ``or ""`` keeps a
    # page that yields no text (None/empty) from breaking the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

def extract_methodology_section(text):
    """
    Return the slice of ``text`` that looks like the methodology section.

    The section starts at the first case-insensitive occurrence of a
    methodology keyword (e.g. "methods", "approach") and ends just before the
    first following line that begins with a closing-section heading
    (results, discussion, conclusion, acknowledgments, references). If no such
    heading follows, everything after the keyword is returned.

    Returns:
        The stripped section text, or "No Methodology section found." when no
        keyword occurs.
    """
    start_match = re.search(
        r"(?i)\b(methodology|methods|materials and methods|study design|experimental setup|approach)\b",
        text,
    )
    if start_match is None:
        return "No Methodology section found."

    tail = text[start_match.start():]
    end_match = re.search(
        r"(?i)\n(results|discussion|conclusion|acknowledgments|references)\b",
        tail,
    )
    section = tail if end_match is None else tail[:end_match.start()]
    return section.strip()

## Section 6: DAG Generation

In [None]:
import re
import json

def clean_node_name(name):
    """
    Normalize a node name: trim it and collapse whitespace runs to one space.

    Non-string values are converted with ``str()`` and returned unchanged
    otherwise.
    """
    if isinstance(name, str):
        return re.sub(r'\s+', ' ', name.strip())
    return str(name)

def clean_category(category):
    """
    Normalize a category label: trim it and collapse whitespace runs to one space.

    Non-string values are converted with ``str()`` and returned unchanged
    otherwise.
    """
    if isinstance(category, str):
        trimmed = category.strip()
        return re.sub(r'\s+', ' ', trimmed)
    return str(category)

def is_ml_component(node_name, category):
    """
    Tell whether a node describes a pipeline component rather than a paper section.

    A node is rejected when its name or its category contains (case-insensitively,
    as a substring) one of the paper-section keywords such as "introduction"
    or "results".

    Returns:
        True when neither the name nor the category contains such a keyword.
    """
    section_keywords = {'introduction', 'results', 'conclusion', 'discussion', 'abstract', 'related work'}
    searched_texts = (node_name.lower(), category.lower())
    for searched in searched_texts:
        for keyword in section_keywords:
            if keyword in searched:
                return False
    return True

def _repair_json_text(candidate):
    """Apply best-effort fixes for common non-JSON habits of LLM output."""
    repaired = re.sub(r'//.*?\n|/\*.*?\*/', '', candidate, flags=re.DOTALL)  # Remove comments
    repaired = re.sub(r',\s*([\]\}])', r'\1', repaired)  # Remove trailing commas
    repaired = repaired.replace("'", '"')  # Replace single quotes
    repaired = re.sub(r'([{,\s])(\w+)(?=\s*:)', r'\1"\2"', repaired)  # Quote unquoted keys
    return repaired


def extract_json_block(text):
    """
    Return the first brace-balanced substring of ``text`` that parses as JSON.

    Starting at each '{' in turn, the text is scanned until the braces balance.
    That candidate is accepted as-is when it is already valid JSON; only if it
    is not are the repair heuristics (comment removal, trailing commas, single
    quotes, unquoted keys) applied. The heuristics are purely textual and can
    corrupt valid JSON (for example a string value containing "1:"), which is
    why the untouched candidate is tried first.

    Brace counting does not understand string literals, so braces inside JSON
    strings may shift the candidate boundaries.

    Args:
        text: Arbitrary text, typically raw LLM output.

    Returns:
        The JSON text (original or repaired) of the first parseable block, or
        None when no block parses.
    """
    text = text.strip()

    for start_idx, opening in enumerate(text):
        if opening != '{':
            continue

        depth = 0
        for idx in range(start_idx, len(text)):
            char = text[idx]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth != 0:
                    continue

                candidate = text[start_idx:idx + 1]
                try:
                    json.loads(candidate)
                    return candidate
                except json.JSONDecodeError:
                    pass

                repaired = _repair_json_text(candidate)
                try:
                    json.loads(repaired)
                    return repaired
                except json.JSONDecodeError:
                    break  # Try next start index if parsing fails
    return None

def _normalize_pipeline_dag(dag):
    """Reduce a parsed LLM response to validated 'nodes' and 'edges' lists."""
    nodes, edges = [], []
    # Accept singular/plural and any letter case for the two top-level keys.
    for key, value in dag.items():
        lowered = key.lower()
        if lowered in ('nodes', 'node'):
            nodes = value
        elif lowered in ('edges', 'edge'):
            edges = value
    if not isinstance(nodes, list):
        nodes = []
    if not isinstance(edges, list):
        edges = []

    # Keep only well-formed nodes that describe pipeline components.
    cleaned_nodes = []
    for node in nodes:
        if not isinstance(node, dict) or 'name' not in node:
            continue
        node['name'] = clean_node_name(node['name'])
        node['category'] = clean_category(node.get('category', 'Unknown'))
        if is_ml_component(node['name'], node['category']):
            cleaned_nodes.append(node)
    node_names = {node['name'] for node in cleaned_nodes}

    # Keep only explicit edges whose endpoints both survived the node filter.
    cleaned_edges = []
    for edge in edges:
        if not (isinstance(edge, dict) and 'source' in edge and 'target' in edge):
            continue
        source = clean_node_name(edge['source'])
        target = clean_node_name(edge['target'])
        if source in node_names and target in node_names:
            cleaned_edges.append({"source": source, "target": target})

    # Add edges implied by each node's 'inputs' list that are not already present.
    known_edges = {(edge['source'], edge['target']) for edge in cleaned_edges}
    for node in cleaned_nodes:
        raw_inputs = node.get('inputs', [])
        if isinstance(raw_inputs, str):
            # A bare string would otherwise be iterated character by character.
            raw_inputs = [raw_inputs]
        elif not isinstance(raw_inputs, list):
            raw_inputs = []
        for input_node in raw_inputs:
            pair = (clean_node_name(input_node), node['name'])
            if pair[0] in node_names and pair not in known_edges:
                known_edges.add(pair)
                cleaned_edges.append({"source": pair[0], "target": pair[1]})

    return {"nodes": cleaned_nodes, "edges": cleaned_edges}


def extract_pipeline(summary):
    """
    Ask the Llama model to turn a pipeline summary into a DAG and sanitise the result.

    Args:
        summary: Text summary of a machine learning pipeline.

    Returns:
        A dict ``{"nodes": [...], "edges": [...]}``. Node names and categories
        are whitespace-normalised, nodes that look like paper sections are
        dropped, and every edge is a ``{"source", "target"}`` dict whose
        endpoints are retained nodes. Both lists are empty when no JSON can be
        recovered from the model output.
    """
    prompt = f"""
          Given the following machine learning pipeline summary:{summary}
            Format the pipeline as a directed acyclic graph (DAG) where:
            Nodes represent Datasets, Data Processing Methods(if available any), Algorithms, and Evaluation Metrics and must contain name, inputs(edge coming from other nodes, [] if null), category only .
            Edges represent the sequence of steps in the pipeline and should be a list which contains dictionary object of source and target string.
            The output should be in a single JSON structure containing all nodes and edges.
          """
    model, tokenizer = setup_llm("Llama")

    # Follow the model's own device instead of hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.3
    )
    # generate() returns prompt + continuation; decode only the continuation so
    # braces inside the echoed summary cannot be mistaken for the answer.
    prompt_length = inputs["input_ids"].shape[1]
    result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    json_str = extract_json_block(result)
    print("Json String", json_str)
    if not json_str:
        print("No valid JSON block found in LLM output")
        return {"nodes": [], "edges": []}

    try:
        dag = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed: {str(e)}")
        return {"nodes": [], "edges": []}

    return _normalize_pipeline_dag(dag)

## Section 7: Pipeline Visualization

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

def show_pipeline(graph_data):
    """
    Draw the pipeline DAG level by level and save it as 'hierarchical_graph.png'.

    Nodes are coloured by category, placed on horizontal levels by their
    longest distance from a source node, and labelled with their name
    (truncated to 30 characters).

    Args:
        graph_data: Dict with a 'nodes' list (dicts with 'name' and optional
            'category') and an 'edges' list (dicts with 'source' and 'target').

    Returns:
        None. Any exception raised while building or drawing the graph is
        reported with a printed message instead of propagating.
    """
    def category_of(node):
        # A missing/None or non-string category used to raise on .rstrip() and
        # abort the whole plot; fall back to 'Unknown' / str() instead.
        raw = node.get('category', 'Unknown')
        if raw is None:
            raw = 'Unknown'
        # NOTE(review): rstrip('s') is a crude singulariser ("Datasets" ->
        # "Dataset") that also strips a genuine trailing 's'; kept for
        # compatibility with existing legends.
        return str(raw).rstrip('s')

    try:
        if 'nodes' not in graph_data or 'edges' not in graph_data:
            print("Missing 'nodes' or 'edges' in graph data")
            return

        G = nx.DiGraph()
        category_colors = {'Unknown': 'gray'}
        color_cycle = cm.tab20(np.linspace(0, 1, 20))
        color_index = 0

        for node in graph_data['nodes']:
            category = category_of(node)

            # Give every new category its own colour: tab20 first, random afterwards.
            if category and category not in category_colors:
                if color_index < len(color_cycle):
                    rgba = tuple(color_cycle[color_index])
                    category_colors[category] = (
                        f"#{int(rgba[0]*255):02x}{int(rgba[1]*255):02x}{int(rgba[2]*255):02x}"
                    )
                    color_index += 1
                else:
                    category_colors[category] = (
                        f"#{np.random.randint(0, 255):02x}{np.random.randint(0, 255):02x}{np.random.randint(0, 255):02x}"
                    )

            node_id = node.get('name')
            if not node_id:
                continue
            display_label = (node_id[:30] + '...') if len(node_id) > 30 else node_id
            G.add_node(node_id, category=category, label=display_label)

        for edge in graph_data['edges']:
            source = edge.get('source')
            target = edge.get('target')
            if source and target:
                G.add_edge(source, target)

        node_colors = [category_colors.get(G.nodes[node].get('category', 'Unknown'), 'gray') for node in G.nodes]

        def get_node_levels(G):
            # Level = longest path length from any source node (in-degree 0).
            levels = {}
            def assign_level(node, level):
                if node not in levels or level > levels[node]:
                    levels[node] = level
                    for successor in G.successors(node):
                        assign_level(successor, level + 1)
            for node in G.nodes:
                if G.in_degree(node) == 0:
                    assign_level(node, 0)
            return levels

        levels = get_node_levels(G)
        max_level = max(levels.values()) if levels else 0

        # Group nodes per level once instead of rescanning all nodes per node.
        nodes_by_level = {}
        for node in G.nodes:
            nodes_by_level.setdefault(levels.get(node, 0), []).append(node)

        pos = {}
        for level, members in nodes_by_level.items():
            count = len(members)
            y_pos = 1 - (level / max_level) if max_level > 0 else 0.5
            for rank, node in enumerate(members):
                x_pos = rank / max(1, count - 1) if count > 1 else 0.5
                pos[node] = (x_pos, y_pos)

        plt.figure(figsize=(15, 12))
        nx.draw(
            G,
            pos,
            with_labels=True,
            labels={node: G.nodes[node].get('label', node) for node in G.nodes},
            node_color=node_colors,
            node_size=4000,
            font_size=8,
            font_weight='bold',
            edge_color='gray',
            arrowsize=20
        )

        legend_labels = {category: plt.Line2D([0], [0], marker='o', color='w', label=category,
                                              markerfacecolor=color, markersize=10)
                         for category, color in category_colors.items()}
        plt.legend(handles=legend_labels.values(), title="Node Categories", loc='best')

        plt.title("Hierarchical Pipeline DAG Visualization", fontsize=14, pad=20)
        plt.savefig('hierarchical_graph.png', bbox_inches='tight')
        plt.show()

    except Exception as e:
        print(f"Plot not available due to {e}")

## Section 8: BERTScore Evaluation

In [None]:
import torch
import json
import datetime
import platform
from bert_score import score


def evaluate_bert_score(methodology_section, summaries):
    """
    Score each summary against the methodology text with BERTScore and pick the best.

    Args:
        methodology_section: Reference text every summary is compared with.
        summaries: Mapping of model name to its summary text.

    Returns:
        ``(best_model, best_summary, best_scores)`` where ``best_scores`` is a
        dict with "Precision", "Recall" and "F1 Score" rounded to 4 decimals,
        and the best model is the one with the highest F1 score.

    Raises:
        ValueError: If ``summaries`` is empty (there is nothing to rank).
    """
    torch.manual_seed(42)
    scorer_model = "roberta-large"
    reference = methodology_section

    per_model = {}
    for name, candidate in summaries.items():
        precision, recall, f1 = score([candidate], [reference], model_type=scorer_model)
        per_model[name] = {
            "Precision": round(precision.item(), 4),
            "Recall": round(recall.item(), 4),
            "F1 Score": round(f1.item(), 4)
        }

    best_model = max(per_model, key=lambda name: per_model[name]["F1 Score"])
    print(f"Best Model: {best_model}")
    print(f"Score: {per_model[best_model]}")
    return best_model, summaries[best_model], per_model[best_model]


## Section 9: Summary Generation

In [None]:
import torch
import gc

def generate_summary(faiss_retriever, bm25_retriever):
    """
    Run every summarisation model over the retrieved context and collect the outputs.

    Args:
        faiss_retriever: FAISS vector store built from the paper's chunks.
        bm25_retriever: ``BM25Retriever`` built from the same chunks.

    Returns:
        Dict mapping model name to its summary; a model whose chain raises
        contributes an empty string.
    """
    summaries = {}
    model_names = ["T5-Base", "MistralAI", "BART-Base", "BigBird"]
    user_text = "Summarize the research paper, focusing on which and how the algorithms, datasets, data analytics methods, and evaluation metrics used, excluding references. Do not return the content as it is return the summary"
    target_style = "in format of paragraph summary of around 500 words"

    for model_name in model_names:
        print(f"Running {model_name}...")
        llm, tokenizer = setup_llm(model_name)
        rag_chain = build_rag_chain(llm=llm, tokenizer=tokenizer, faiss_retriever=faiss_retriever, bm25_retriever=bm25_retriever)
        inputs = {"question": user_text, "style": target_style, "original_text": user_text}
        try:
            styled_result = rag_chain(inputs, model_name)
        except Exception as e:
            print(f"{model_name} Failed: {str(e)}")
            styled_result = ""
        summaries[model_name] = styled_result

        # The chain closes over llm/tokenizer, so it must be dropped as well;
        # otherwise the model stays referenced while the next one is loaded.
        del rag_chain
        if model_name != "MistralAI":
            # Mistral is a shared module-level instance and is kept loaded.
            del llm
            del tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return summaries

## Section 10: Pipeline Evaluation

In [None]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re
from sentence_transformers import SentenceTransformer, util
from Levenshtein import distance as levenshtein_distance
from networkx.algorithms.similarity import graph_edit_distance

def compute_normalized_levenshtein(ref_nodes, gen_nodes):
    """
    Average best-match name similarity between reference and generated nodes.

    Nodes are grouped by cleaned category. For each reference node, the most
    similar generated node name within the same category is found, where
    similarity is ``1 - levenshtein / max(len)``. Reference nodes whose category
    has no generated nodes, or whose best similarity is not above 0, do not
    contribute to the average.

    Args:
        ref_nodes: Reference node dicts (with 'name' and optional 'category').
        gen_nodes: Generated node dicts of the same shape.

    Returns:
        The mean of the collected best similarities, or 0.0 when there are none.
    """
    def group_by_category(nodes):
        grouped = {}
        for node in nodes:
            label = clean_category(node.get('category', 'Unknown'))
            grouped.setdefault(label, []).append(node)
        return grouped

    ref_groups = group_by_category(ref_nodes)
    gen_groups = group_by_category(gen_nodes)

    best_scores = []
    for category, ref_group in ref_groups.items():
        candidates = gen_groups.get(category, [])
        if not candidates:
            continue

        for ref_node in ref_group:
            ref_name = clean_node_name(ref_node['name'])

            scored = []
            for gen_node in candidates:
                gen_name = clean_node_name(gen_node['name'])
                edits = levenshtein_distance(ref_name, gen_name)
                longest = max(len(ref_name), len(gen_name))
                ratio = edits / longest if longest > 0 else 0
                scored.append((1 - ratio, gen_name))

            # max() keeps the first candidate on ties, like a strict '>' scan.
            top_score, top_name = max(scored, key=lambda pair: pair[0])
            if top_score > 0 and top_name:
                best_scores.append(top_score)

    return np.mean(best_scores) if best_scores else 0.0


# Sentence-embedding model shared by the label-similarity helpers below
# (normalize_node_labels, approximate_ged, exact_ged); loaded once at module level.
bert_model = SentenceTransformer('all-MPNet-base-v2')

def normalize_node_labels(graph):
    """
    Merge near-duplicate node names in place.

    Two names are treated as the same label when the cosine similarity of their
    sentence embeddings exceeds 0.9; the later node takes the name of the
    earlier one. References to a renamed node in other nodes' 'inputs' lists
    are rewritten too, so they keep pointing at an existing node name.

    Args:
        graph: Dict with a 'nodes' list of dicts that each have a 'name'.

    Returns:
        The same ``graph`` object, with node names (and inputs) updated.
    """
    nodes = graph['nodes']
    name_map = {}

    # Encode every distinct label at most once instead of once per comparison.
    embedding_cache = {}

    def embed(label):
        if label not in embedding_cache:
            embedding_cache[label] = bert_model.encode(label)
        return embedding_cache[label]

    for node in nodes:
        name = node['name']
        if name in name_map:
            continue
        for other_node in nodes:
            other_name = other_node['name']
            if other_name in name_map or other_name == name:
                continue
            similarity = util.cos_sim(embed(name), embed(other_name)).item()
            if similarity > 0.9:
                name_map[other_name] = name

    for node in nodes:
        if node['name'] in name_map:
            node['name'] = name_map[node['name']]
        # Without this, inputs naming a merged node would dangle and the
        # corresponding edges would be dropped when the graph is built.
        inputs = node.get('inputs')
        if isinstance(inputs, list):
            node['inputs'] = [
                name_map.get(inp, inp) if isinstance(inp, str) else inp
                for inp in inputs
            ]

    return graph

def filter_non_pipeline_nodes(graph):
    """
    Keep only nodes whose category is a pipeline category and prune their inputs.

    A node is kept when its 'category' is exactly one of 'Dataset',
    'DataProcessing', 'Algorithm' or 'EvaluationMetric'. Each kept node's
    'inputs' list is rewritten in place to reference kept nodes only; a node
    without an 'inputs' entry (or with a null one) gets an empty list.

    Args:
        graph: Dict with a 'nodes' list of node dicts.

    Returns:
        A new dict ``{'nodes': [...]}`` holding the kept node dicts.
    """
    pipeline_categories = {'Dataset', 'DataProcessing', 'Algorithm', 'EvaluationMetric'}
    filtered_nodes = [
        node for node in graph['nodes']
        if node.get('category') in pipeline_categories
    ]

    filtered_node_names = {node['name'] for node in filtered_nodes}
    for node in filtered_nodes:
        # LLM-generated nodes do not always carry an 'inputs' key.
        node['inputs'] = [
            inp for inp in (node.get('inputs') or [])
            if inp in filtered_node_names
        ]

    return {'nodes': filtered_nodes}

def create_nx_graph(dag):
    """
    Build a directed graph from a node list whose edges are given by 'inputs'.

    Every node becomes a graph node keyed by its name, with 'label' (the name)
    and 'category' attributes ('Unknown' when absent). For every input of a
    node that names an existing graph node, an edge input -> node is added.
    Nodes without an 'inputs' entry contribute no edges.

    Args:
        dag: Dict with a 'nodes' list of node dicts.

    Returns:
        The resulting ``networkx.DiGraph``.
    """
    G = nx.DiGraph()
    for node in dag['nodes']:
        G.add_node(node['name'], label=node['name'], category=node.get('category', 'Unknown'))
    for node in dag['nodes']:
        # Tolerate nodes that lack an 'inputs' key instead of raising KeyError.
        for input_name in node.get('inputs') or []:
            if input_name in G.nodes:
                G.add_edge(input_name, node['name'])
    return G

def approximate_ged(G1, G2):
    """
    Estimate the graph edit distance with a greedy node matching.

    Node similarity is the cosine similarity of the label embeddings, halved
    when the categories differ. Going through G1's nodes in order, each is
    matched to its most similar G2 node if that node is still free and the
    similarity exceeds 0.5. The estimate is the number of unmatched nodes plus,
    for each matched pair, the number of incident edges that differ once G1's
    edges are expressed in terms of the matched G2 node names.

    Args:
        G1, G2: Directed graphs whose nodes carry 'label' and 'category'.

    Returns:
        The estimated edit distance (node cost + edge cost).
    """
    nodes1 = list(G1.nodes(data=True))
    nodes2 = list(G2.nodes(data=True))

    sim_matrix = np.zeros((len(nodes1), len(nodes2)))
    if nodes1 and nodes2:
        # Encode every label once rather than once per node pair.
        embeddings1 = [bert_model.encode(data['label']) for _, data in nodes1]
        embeddings2 = [bert_model.encode(data['label']) for _, data in nodes2]
        for i, (_, d1) in enumerate(nodes1):
            for j, (_, d2) in enumerate(nodes2):
                label_sim = util.cos_sim(embeddings1[i], embeddings2[j]).item()
                category_sim = 1.0 if d1['category'] == d2['category'] else 0.5
                sim_matrix[i, j] = label_sim * category_sim

    matched = set()
    matches = []
    if nodes2:  # argmax over an empty row would raise
        # Every G1 node gets a chance to match; stopping after len(nodes2)
        # rows would skip later G1 nodes even while G2 nodes are still free.
        for i in range(len(nodes1)):
            j = int(np.argmax(sim_matrix[i, :]))
            if j not in matched and sim_matrix[i, j] > 0.5:
                matches.append((nodes1[i][0], nodes2[j][0]))
                matched.add(j)

    node_cost = len(nodes1) + len(nodes2) - 2 * len(matches)

    # Translate G1 endpoints to their matched G2 names so that edges between
    # corresponding nodes compare equal even when the labels differ.
    to_g2 = dict(matches)
    edge_cost = 0
    for u1, u2 in matches:
        incident1 = set(G1.out_edges(u1)).union(G1.in_edges(u1))
        edges1 = {(to_g2.get(a, a), to_g2.get(b, b)) for a, b in incident1}
        edges2 = set(G2.out_edges(u2)).union(G2.in_edges(u2))
        edge_cost += len(edges1.symmetric_difference(edges2))

    return node_cost + edge_cost

def exact_ged(G1, G2):
    """
    Compute the graph edit distance with semantic node-substitution costs.

    Substituting a node costs ``1 - cosine_similarity(labels)`` plus 1 when the
    categories differ; inserting or deleting a node or an edge costs 1;
    substituting an edge costs 0 when the edge attributes are equal, else 1.

    Args:
        G1, G2: Directed graphs whose nodes carry 'label' and 'category'.

    Returns:
        The value returned by ``networkx.graph_edit_distance``.
    """
    # The substitution cost is requested for many node pairs; cache embeddings
    # so each distinct label is encoded only once.
    embedding_cache = {}

    def embed(label):
        if label not in embedding_cache:
            embedding_cache[label] = bert_model.encode(label)
        return embedding_cache[label]

    def node_subst_cost(n1, n2):
        label_sim = util.cos_sim(embed(n1['label']), embed(n2['label'])).item()
        category_penalty = 0.0 if n1['category'] == n2['category'] else 1.0
        return 1.0 - label_sim + category_penalty

    def node_del_cost(n):
        return 1.0

    def node_ins_cost(n):
        return 1.0

    def edge_subst_cost(e1, e2):
        return 0.0 if e1 == e2 else 1.0

    def edge_del_cost(e):
        return 1.0

    def edge_ins_cost(e):
        return 1.0

    return nx.graph_edit_distance(
        G1, G2,
        node_subst_cost=node_subst_cost,
        node_del_cost=node_del_cost,
        node_ins_cost=node_ins_cost,
        edge_subst_cost=edge_subst_cost,
        edge_del_cost=edge_del_cost,
        edge_ins_cost=edge_ins_cost
    )

def compute_ged(dag_pair):
    """Return the graph edit distance for a pair of pipeline DAG descriptions.

    Both DAGs are passed through ``normalize_node_labels`` and
    ``filter_non_pipeline_nodes`` and then converted with ``create_nx_graph``.
    If either resulting graph has more than 15 nodes the distance comes from
    ``approximate_ged``; otherwise ``exact_ged`` is used.

    Args:
        dag_pair: Two-element iterable ``(dag1, dag2)``.

    Returns:
        The distance reported by the selected GED routine.
    """
    first, second = dag_pair

    # Each preprocessing stage is applied to the first DAG, then the second.
    first, second = normalize_node_labels(first), normalize_node_labels(second)
    first, second = filter_non_pipeline_nodes(first), filter_non_pipeline_nodes(second)
    graph_a, graph_b = create_nx_graph(first), create_nx_graph(second)

    largest = max(len(graph_a.nodes), len(graph_b.nodes))
    solver = approximate_ged if largest > 15 else exact_ged
    return solver(graph_a, graph_b)

def evaluate_pipeline(ref_dag, gen_dag):
    """Score a generated pipeline DAG against its reference.

    Args:
        ref_dag: Reference pipeline; must provide a ``'nodes'`` entry.
        gen_dag: Generated pipeline; must provide a ``'nodes'`` entry.

    Returns:
        Dict with ``'ged'`` (result of ``compute_ged`` on the pair) and
        ``'levenshtein_similarity'`` (result of
        ``compute_normalized_levenshtein`` on the two node collections).
    """
    scores = {}
    scores['ged'] = compute_ged((ref_dag, gen_dag))
    scores['levenshtein_similarity'] = compute_normalized_levenshtein(
        ref_dag['nodes'], gen_dag['nodes']
    )
    return scores

def visualize_evaluation(results):
    """Draw, save and show a grouped bar chart of the per-pipeline metrics.

    Args:
        results: Sequence of records whose element at index 4 is a metrics
            dict containing ``'levenshtein_similarity'`` and either
            ``'normalized_ged'`` or ``'ged'``.

    The figure is written to 'evaluation_metrics_ged_levenshtein.png' and then
    displayed.
    """
    evaluations = [record[4] for record in results]

    # evaluate_pipeline stores the distance under 'ged'; plot 'normalized_ged'
    # only when every record carries it, rather than raising KeyError.
    if all('normalized_ged' in scores for scores in evaluations):
        ged_metric = 'normalized_ged'
    else:
        ged_metric = 'ged'
    metrics = [ged_metric, 'levenshtein_similarity']
    n_pipelines = len(results)

    plt.figure(figsize=(10, 6))
    bar_width = 0.25
    index = np.arange(n_pipelines)

    # Pipelines are labelled 1..n on the x axis.
    x_labels = list(range(1, n_pipelines + 1))

    for i, metric in enumerate(metrics):
        values = [scores[metric] for scores in evaluations]
        plt.bar(index + i * bar_width, values, bar_width, label=metric)

    plt.xlabel('Pipelines')
    plt.ylabel('Scores')
    plt.title('Pipeline Evaluation Metrics')
    # Put each tick under the middle of its bar group, not under the last bar.
    group_centre = index + bar_width * (len(metrics) - 1) / 2
    plt.xticks(group_centre, x_labels, rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.savefig('evaluation_metrics_ged_levenshtein.png')
    plt.show()

In [None]:
import torch
import gc
import pandas as pd
import numpy as np

if __name__ == "__main__":
    # Evaluate generated pipelines against the reference pipelines stored in
    # the dataset CSV (columns read: 'Title', 'Content', 'pipeline').
    df = pd.read_csv("/kaggle/input/pipeline-extract/Pipeline_Dataset - Sheet1.csv")
    results = []
    titles = []

    for _, row in df.iterrows():
        title = row['Title']
        content = row['Content']
        # The reference pipeline is stored as a JSON string in the sheet.
        ref_pipeline = json.loads(row['pipeline'])

        all_docs = split_text_into_documents(content)
        faiss_retriever = build_faiss(all_docs)
        bm25_retriever = BM25Retriever(all_docs)
        summaries = generate_summary(faiss_retriever, bm25_retriever)
        best_model, best_summary, bert_score = evaluate_bert_score(content, summaries)
        pipeline_generated = extract_pipeline(best_summary)
        show_pipeline(pipeline_generated)
        pipeline_evaluation = evaluate_pipeline(ref_pipeline, pipeline_generated)
        results.append([title, best_summary, bert_score, pipeline_generated, pipeline_evaluation])

        # Release GPU and Python memory before processing the next row.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

        print(f"Evaluation for '{title}':")
        print(json.dumps(pipeline_evaluation, indent=2))

    results_df = pd.DataFrame(results, columns=["Title", "Summary", "BERTScore", "Pipeline", "PipelineEvaluation"])
    results_df.to_csv("results.csv", index=False)

    evaluations = [record[4] for record in results]
    # evaluate_pipeline reports the distance under 'ged'; average
    # 'normalized_ged' only when every record has it, avoiding a KeyError.
    if all('normalized_ged' in scores for scores in evaluations):
        ged_metric = 'normalized_ged'
    else:
        ged_metric = 'ged'

    avg_metrics = {
        ged_metric: np.mean([scores[ged_metric] for scores in evaluations]),
        'levenshtein_similarity': np.mean([scores['levenshtein_similarity'] for scores in evaluations]),
    }

    print("\nAverage Metrics Across Pipelines:")
    print(json.dumps(avg_metrics, indent=2))

    visualize_evaluation(results)

In [None]:
# Re-draw the metrics chart. `results` is not defined in this cell, so it must
# already exist in the session before this line runs.
visualize_evaluation(results)

In [None]:
import torch
import gc
import pandas as pd

if __name__ == "__main__":
    # Summarise every paper listed in arxiv_papers.csv (columns read: 'Title',
    # 'PDF Link') and extract a pipeline from the best-scoring summary.
    df = pd.read_csv("arxiv_papers.csv")
    results = []
    temp_pdf_path = "research_paper.pdf"

    for i, (_, row) in enumerate(df.iterrows(), start=1):
        title, pdf_url = row["Title"], row["PDF Link"]
        print(f"{i}. {title}")
        try:
            pdf_path = download_pdf_from_link(pdf_url, temp_pdf_path)
            text = convert_pdf_to_text(pdf_path)
            methodology_section = extract_methodology_section(text)
            all_docs = split_text_into_documents(methodology_section)
            faiss_retriever = build_faiss(all_docs)
            bm25_retriever = BM25Retriever(all_docs)
            summaries = generate_summary(faiss_retriever, bm25_retriever)
            best_model, best_summary, bert_score = evaluate_bert_score(methodology_section, summaries)
            formatted_summary = extract_pipeline(best_summary)
            show_pipeline(formatted_summary)
            results.append([title, pdf_url, best_model, best_summary, formatted_summary, bert_score])
        finally:
            # Clean up even when a paper fails, so the temporary PDF is not
            # left on disk and GPU/Python memory is released.
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

    results_df = pd.DataFrame(results, columns=["Title", "PDF Link", "Best Model", "Summary", "Pipeline", "BERTScore"])
    # NOTE(review): the evaluation run also writes "results.csv" with a
    # different column layout; running both overwrites one with the other.
    results_df.to_csv("results.csv", index=False)

In [None]:
import itertools
import json

# Candidate choices for each stage of a research pipeline, keyed by stage name.
pipeline_components = {
    "Datasets Used": [
        "COCO",
        "ImageNet",
        "MNIST",
        "CIFAR-10",
        "Pascal VOC",
        "UCI ML Repository",
    ],
    "Data Analytics Methods Applied": [
        "Principal Component Analysis (PCA)",
        "t-SNE for Visualization",
        "Statistical Feature Selection",
        "Data Augmentation",
        "Anomaly Detection",
    ],
    "Algorithms Implemented": [
        "YOLOv12",
        "ResNet-50",
        "BERT Transformer",
        "XGBoost",
        "Random Forest",
        "LSTM Networks",
    ],
    "Evaluation Metrics Used": [
        "mAP@50-95",
        "F1-score",
        "Accuracy",
        "ROC-AUC",
        "Mean Squared Error (MSE)",
    ],
}

# Cartesian product: one tuple per possible combination of stage choices.
stage_names = list(pipeline_components)
all_pipelines = list(
    itertools.product(*(pipeline_components[name] for name in stage_names))
)

# Pair every combination with the stage names to get one dict per pipeline.
structured_pipelines = [dict(zip(stage_names, combo)) for combo in all_pipelines]

with open("all_research_pipelines.json", "w") as out_file:
    json.dump(structured_pipelines, out_file, indent=4)

# Preview the first five pipelines.
for i, entry in enumerate(itertools.islice(structured_pipelines, 5)):
    print(f"Pipeline {i+1}:")
    print("\n".join(f"  {step}: {method}" for step, method in entry.items()))
    print("\n")

print(f"Total Pipelines Generated: {len(structured_pipelines)}")
