In [1]:
import os
import re
import time

import openai
import pandas as pd
import requests
import spacy
from openai import OpenAI
from tqdm import tqdm  # ✅ Add this for progress bar

In [None]:
# Credentials are read from the environment so real keys never have to be typed
# into (or saved with) the notebook. The literal fallbacks are the original
# placeholder values, so the cell behaves as before when the variables are unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "my_key1")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "my_key2")

In [3]:
# ----------------------------------------
# 🔧 Configuration
# ----------------------------------------
# NOTE: API_KEY, MODEL, INPUT_CSV and SLEEP_BETWEEN_REQUESTS are assigned again
# by the later rewrite cells, so re-running the stage-1 run cell after those
# cells would pick up their values. Run the notebook top to bottom.
# Key handed to the OpenAI client below.
API_KEY = OPENAI_API_KEY  # Replace with your actual key
# Chat model; generate_summary captures this value as its default at definition time.
MODEL = "gpt-4.1-mini"
# CSV read by summarize_dataset (columns read: abstract, verified_methods_list, domain, title).
INPUT_CSV = "../Extraction/stage 3/methods_with_verified_methods_list.csv"
# CSV written by summarize_dataset.
OUTPUT_CSV = "stage 1/Masked Research Idea & Objective Extraction.csv"
# Seconds passed to time.sleep between requests.
SLEEP_BETWEEN_REQUESTS = 1

# Module-level OpenAI client used by generate_summary.
client = OpenAI(api_key=API_KEY)

# ----------------------------------------
# 🧠 Prompt Generator
# ----------------------------------------
def make_masked_summary_prompt(abstract, extracted_methods, domain):
    """Build the stage-1 prompt asking for a method-free research idea and objective.

    Args:
        abstract: Abstract text; inserted verbatim after "Here is the abstract:".
        extracted_methods: Kept in the signature for backward compatibility.
            The prompt template has no placeholder for it, so its value does
            not influence the returned prompt.
        domain: Domain name interpolated into the instructions (used twice).

    Returns:
        The prompt string, beginning and ending with a newline.
    """
    # The previous version parsed <method>...</method> tags out of
    # extracted_methods into a string that was never inserted into the prompt.
    # That dead work also raised TypeError for non-string input, so it is removed.
    # NOTE(review): the last line of the prompt announces "the following format"
    # but no format follows; the text is left untouched so results stay comparable.
    return f"""
As a research assistant, your task is to extract the research idea and research objective from an abstract which is from the {domain} domain. The extracted content must contain the:

1. Research Idea: Write 2-4 sentences that capture the overarching motivation or problem the study addresses, using the text from the abstract.
2. Research Objective: Write 2-4 sentences that directly state the study’s primary aim or objective, using the text from the abstract.
3. Your response should not mention any Computer Science (CS), Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL) related terms, words or phrases including vague or generic statements such as "algorithm," "model," "computational," "data-driven," "predictive," "system," or "analysis."
4. Your response should not mention any methods, techniques, or tools related to CS/AI/ML/DL, even if they are mentioned in the abstract.
5. Focus exclusively on the {domain}-specific context, problem, or purpose of the research for both the research idea and research objective.
6. If the abstract does not explicitly state the idea or objective, extract the closest equivalent statement describing the study’s motivation (for idea) or primary aim (for objective), staying as close to the original wording as possible.

Here is the abstract: {abstract}

Provide the extracted content below in the following format as a whole paragraph.
"""

# ----------------------------------------
# 🚀 OpenAI API Wrapper
# ----------------------------------------
def generate_summary(prompt, model=MODEL, temperature=0.1):
    """Send one user message to the chat completions endpoint and return the reply.

    Args:
        prompt: Text sent as the single user message.
        model: Model name; defaults to the module-level MODEL at definition time.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first returned choice with surrounding whitespace stripped.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=800,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# ----------------------------------------
# 🔁 Main Processing Loop with Progress Bar
# ----------------------------------------
def summarize_dataset(input_path, output_path):
    """Run the stage-1 extraction for every row of a CSV and save the results.

    For each row the prompt is built with make_masked_summary_prompt and sent
    through generate_summary. Rows whose request raises are reported on stdout
    and left out of the output file.

    Args:
        input_path: CSV with the columns "abstract", "verified_methods_list",
            "domain" and "title".
        output_path: Destination CSV. Its parent folder is created if missing.

    Returns:
        None. The results are written to output_path.
    """
    df = pd.read_csv(input_path)

    # Make sure the destination folder exists *before* spending time on API
    # calls; otherwise a missing folder only fails at the final to_csv.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit column order so that even a run where every row failed still
    # writes a CSV with a header the next stage can read.
    columns = ["domain", "title", "abstract", "verified methods list", "masked extraction"]
    results = []
    failed_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔄 Processing"):  # ✅ Added tqdm
        abstract = str(row["abstract"])
        methods = str(row["verified_methods_list"])
        domain = str(row["domain"])

        try:
            prompt = make_masked_summary_prompt(abstract, methods, domain)
            masked_extraction = generate_summary(prompt)

            results.append({
                "domain": domain,
                "title": row["title"],
                "abstract": abstract,
                "verified methods list": methods,
                "masked extraction": masked_extraction
            })

        except Exception as e:
            # Best-effort: report and move on so one bad row does not end the run.
            print(f"❌ Error on row {idx}: {e}")
            failed_rows.append(idx)

        finally:
            # Pause after failures as well, so an error (e.g. rate limiting)
            # is not followed by an immediate new request.
            time.sleep(SLEEP_BETWEEN_REQUESTS)

    df_out = pd.DataFrame(results, columns=columns)
    df_out.to_csv(output_path, index=False)
    print(f"\n✅ All done! Saved summaries to: {output_path}")
    if failed_rows:
        print(f"⚠️ {len(failed_rows)} row(s) failed and were not saved: {failed_rows}")

In [4]:
# ----------------------------------------
# 🎬 Run
# ----------------------------------------
# Runs stage 1 over INPUT_CSV and writes OUTPUT_CSV. One API request is made per
# input row; the recorded output below shows 792 rows taking roughly 41 minutes.
if __name__ == "__main__":
    summarize_dataset(INPUT_CSV, OUTPUT_CSV)

🔄 Processing: 100%|█████████████████████████████████████████████████████████████████| 792/792 [41:11<00:00,  3.12s/it]


✅ All done! Saved summaries to: stage 1/Masked Research Idea & Objective Extraction.csv





In [None]:
# -------------------------------
# 🔧 Configuration
# -------------------------------
# Reuse the key defined in the credentials cell instead of repeating the literal,
# mirroring how the OpenAI cell sets API_KEY = OPENAI_API_KEY.
API_KEY = ANTHROPIC_API_KEY
API_URL = "https://api.anthropic.com/v1/messages"
# Headers sent with every request to API_URL.
HEADERS = {
    "x-api-key": API_KEY,
    "anthropic-version": "2023-06-01",
    "content-type": "application/json"
}

MODEL = "claude-3-7-sonnet-20250219"
# Stage-1 output is the stage-2 input.
INPUT_CSV = "stage 1/Masked Research Idea & Objective Extraction.csv"
# Capitalised "Rewritten" to match the later copy of this cell and the path
# printed in the recorded run output; on a case-sensitive file system the
# lowercase spelling is a different file.
OUTPUT_CSV_ALL = "stage 2/Rewritten Masked Extraction.csv"
OUTPUT_CSV_CLEAN = "stage 2/leakage free summaries only.csv"
OUTPUT_CSV_LEAKED = "stage 2/leaked summaries only.csv"
# Seconds passed to time.sleep between requests.
SLEEP_BETWEEN_REQUESTS = 1

# -------------------------------
# 🧠 AI/ML/DL Vocabulary List
# -------------------------------
# Example vocabulary handed to make_rewrite_prompt, which joins the entries with
# ", " and embeds them in the prompt as illustrations of wording to remove.
# In the cells shown here the list is only used as prompt text; no code matches
# these strings against the summaries.
term_list = [
    "Support Vector Machine", "Decision Tree", "Random Forest", "Naive Bayes",
    "K-Nearest Neighbors", "Logistic Regression", "Linear Regression", "Gradient Boosting",
    "XGBoost", "LightGBM", "CatBoost", "AdaBoost", "K-Means", "DBSCAN",
    "Hierarchical Clustering", "PCA", "LDA", "Q-Learning", "DQN",
    "GAN", "VAE", "CNN", "RNN", "LSTM", "GRU", "Transformer", "BERT",
    "GPT", "T5", "LLaMA", "Mistral", "Claude", "Autoencoder", "MLP",
    "ResNet", "Inception", "VGG", "EfficientNet", "YOLO", "SSD", "Faster R-CNN",
    "Deep Reinforcement Learning", "Meta-learning", "Few-shot Learning", "Zero-shot Learning",
    "Federated Learning", "Self-supervised Learning", "Semi-supervised Learning",
    "Supervised Learning", "Unsupervised Learning", "Transfer Learning", "Representation Learning",
    "learning algorithm", "computational model", "prediction model", "automated method",
    "data-driven technique", "neural approach", "intelligent system", "adaptive model",
    "classification technique", "regression approach", "optimization-based method",
    "probabilistic framework", "generative model", "discriminative model", "hybrid system",
    "data analytics", "pattern recognition technique", "predictive pipeline",
    "algorithmic solution", "high-capacity model", "state-of-the-art method",
    "novel approach", "ensemble method", "custom architecture", "algorithmic strategy",
    "smart system", "scalable framework", "dynamic learning scheme", "decision-making model",
    "analytical model", "computational strategy",
    "advanced computing", "intelligent analysis", "automated detection",
    "computational intelligence", "smart detection", "enhanced recognition",
    "autonomous system", "technology-driven approach", "model-based analysis",
    "hybrid learning", "deep representation", "intelligent estimation",
    "self-learning mechanism", "automatic classification", "high-performance model",
    "synthetic model generation", "context-aware learning", "domain adaptation",
    "feature extraction technique", "latent variable model",
    "probabilistic inference", "statistical modeling", "parameter tuning",
    "optimization routine", "hyperparameter selection", "loss minimization",
    "backpropagation", "likelihood estimation", "training and testing phase",
    "evaluation metrics", "convergence behavior", "stochastic process",
    "sampling techniques", "cross-validation", "gradient descent",
    "regularization method", "overfitting prevention", "feature engineering",
    "dimensionality reduction", "noise reduction technique"
]

# -------------------------------
# 🧠 Rewrite Prompt Template
# -------------------------------
def make_rewrite_prompt(summary, term_list):
    """Compose the stage-2 prompt that asks for an AI/ML/DL-free rewrite.

    Args:
        summary: Stage-1 extraction to be reviewed; placed at the end of the prompt.
        term_list: Iterable of example phrases, joined with ", " into the prompt.

    Returns:
        The prompt string, beginning and ending with a newline.
    """
    vocabulary = ", ".join(term_list)
    # Assembled piece by piece; each literal below is one line of the prompt.
    prompt = (
        "\n"
        "You are reviewing an extracted summary describing the research idea and objective of a paper. The summary should be free of references related to Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL), or any related computational terms/phrases.\n"
        "\n"
        f"These references can be direct, vague, euphemisms, or synonyms terms/phrases such as: {vocabulary}\n"
        "\n"
        "Your task:\n"
        "1. Go sentence by sentence.\n"
        "2. If a sentence contains any such reference, rewrite it to remove or neutralize the term while keeping the original research idea or objective intact.\n"
        "3. If a sentence is already clean of any Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL) language, keep it unchanged.\n"
        "4. At the end, return a clean paragraph that captures the same research idea and objective without any AI/ML/DL-related language.\n"
        "\n"
        "Only return the final cleaned paragraph. Do not explain your changes.\n"
        "\n"
        "Here is the summary:\n"
        f"{summary}\n"
    )
    return prompt

# -------------------------------
# 🔁 Main Processing Loop
# -------------------------------
def _request_rewrite(prompt, max_attempts=3, timeout=120):
    """POST one prompt to API_URL and return the stripped reply text, retrying transient errors.

    HTTP 429 and 5xx responses (the recorded run lost rows to 529) are retried
    with a growing pause. The last attempt's error is raised to the caller.
    """
    payload = {
        "model": MODEL,
        "max_tokens": 1024,
        "messages": [{"role": "user", "content": prompt}]
    }
    for attempt in range(1, max_attempts + 1):
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
        retryable = response.status_code == 429 or response.status_code >= 500
        if retryable and attempt < max_attempts:
            time.sleep(SLEEP_BETWEEN_REQUESTS * 2 ** attempt)
            continue
        response.raise_for_status()
        return response.json()["content"][0]["text"].strip()
    raise ValueError("max_attempts must be at least 1")


def detect_and_rewrite(input_csv, output_csv_all, output_csv_clean, output_csv_leaked):
    """Ask the model to rewrite every stage-1 summary and split the results by whether it changed.

    A row is flagged "leakage detected" when the returned text differs from the
    stripped input summary. Rows whose request still fails after the retries
    are reported on stdout and left out of all three output files.

    Args:
        input_csv: CSV with the columns "abstract", "verified methods list" and
            "masked extraction" ("domain" and "title" are copied when present).
        output_csv_all: Destination for every processed row.
        output_csv_clean: Destination for rows returned unchanged.
        output_csv_leaked: Destination for rows the model rewrote.

    Returns:
        None. The three CSV files are written as a side effect.
    """
    df = pd.read_csv(input_csv)

    # Create the output folders before the long request loop so a missing
    # folder cannot discard the collected results at the very end.
    for path in (output_csv_all, output_csv_clean, output_csv_leaked):
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Column names as written to disk ("inital" spelling kept: it is the
    # existing output schema).
    columns = [
        "domain", "title", "abstract", "verified methods list",
        "inital masked extraction", "final masked extraction", "leakage detected",
    ]
    results = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔁 Rewriting Summaries"):  # ✅ Add progress bar
        abstract = str(row["abstract"])
        methods = str(row["verified methods list"])
        masked_extraction = str(row["masked extraction"])

        try:
            prompt = make_rewrite_prompt(masked_extraction, term_list)
            cleaned_summary = _request_rewrite(prompt)

            # Any textual difference counts as a rewrite, including pure rephrasing.
            leakage_detected = masked_extraction.strip() != cleaned_summary

            results.append({
                "domain": row.get("domain", ""),
                "title": row.get("title", ""),
                "abstract": abstract,
                "verified methods list": methods,
                "inital masked extraction": masked_extraction,
                "final masked extraction": cleaned_summary,
                "leakage detected": leakage_detected
            })

            time.sleep(SLEEP_BETWEEN_REQUESTS)

        except Exception as e:
            # Best-effort: report and continue so one bad row does not end the run.
            print(f"❌ Error on row {idx}: {e}")
            continue

    # Save all results. Passing the columns explicitly keeps the frame
    # well-formed when no row succeeded, so the filters below cannot raise KeyError.
    df_out = pd.DataFrame(results, columns=columns)
    df_out.to_csv(output_csv_all, index=False)
    print(f"\n✅ All results saved to: {output_csv_all}")

    leak_mask = df_out["leakage detected"].astype(bool)

    # Save clean summaries
    clean_df = df_out[~leak_mask]
    clean_df.to_csv(output_csv_clean, index=False)
    print(f"✅ Clean summaries saved to: {output_csv_clean} ({len(clean_df)} rows)")

    # Save rewritten summaries
    leaked_df = df_out[leak_mask]
    leaked_df.to_csv(output_csv_leaked, index=False)
    print(f"🚨 Rewritten (previously leaked) summaries saved to: {output_csv_leaked} ({len(leaked_df)} rows)")

In [6]:
# -------------------------------
# 🎬 Run
# -------------------------------
# Runs the stage-2 rewrite pass. In the recorded output below, rows 48 and 228
# failed with HTTP 529 and 790 of the 792 rows were written (377 clean + 413 rewritten).
if __name__ == "__main__":
    detect_and_rewrite(INPUT_CSV, OUTPUT_CSV_ALL, OUTPUT_CSV_CLEAN, OUTPUT_CSV_LEAKED)

🔁 Rewriting Summaries:   6%|███▍                                                   | 49/792 [04:38<1:00:34,  4.89s/it]

❌ Error on row 48: 529 Server Error:  for url: https://api.anthropic.com/v1/messages


🔁 Rewriting Summaries:  29%|███████████████▌                                      | 229/792 [22:44<1:25:09,  9.08s/it]

❌ Error on row 228: 529 Server Error:  for url: https://api.anthropic.com/v1/messages


🔁 Rewriting Summaries: 100%|██████████████████████████████████████████████████████| 792/792 [1:12:06<00:00,  5.46s/it]


✅ All results saved to: stage 2/Rewritten Masked Extraction.csv
✅ Clean summaries saved to: stage 2/leakage free summaries only.csv (377 rows)
🚨 Rewritten (previously leaked) summaries saved to: stage 2/leaked summaries only.csv (413 rows)





In [None]:
# -------------------------------
# 🔧 Configuration
# -------------------------------
# Reuse the key defined in the credentials cell instead of repeating the literal,
# mirroring how the OpenAI cell sets API_KEY = OPENAI_API_KEY.
API_KEY = ANTHROPIC_API_KEY
API_URL = "https://api.anthropic.com/v1/messages"
# Headers sent with every request to API_URL.
HEADERS = {
    "x-api-key": API_KEY,
    "anthropic-version": "2023-06-01",
    "content-type": "application/json"
}

MODEL = "claude-3-7-sonnet-20250219"
# Stage-1 output is the stage-2 input.
INPUT_CSV = "stage 1/Masked Research Idea & Objective Extraction.csv"
OUTPUT_CSV_ALL = "stage 2/Rewritten Masked Extraction.csv"
OUTPUT_CSV_CLEAN = "stage 2/leakage free summaries only.csv"
OUTPUT_CSV_LEAKED = "stage 2/leaked summaries only.csv"
# Seconds passed to time.sleep between requests.
SLEEP_BETWEEN_REQUESTS = 1

# -------------------------------
# 🧠 AI/ML/DL Vocabulary List
# -------------------------------
# Same vocabulary as the term_list assigned in the earlier rewrite cell; this
# assignment rebinds the name. make_rewrite_prompt joins the entries with ", "
# and embeds them in the prompt as examples of wording to remove.
term_list = [
    "Support Vector Machine", "Decision Tree", "Random Forest", "Naive Bayes",
    "K-Nearest Neighbors", "Logistic Regression", "Linear Regression", "Gradient Boosting",
    "XGBoost", "LightGBM", "CatBoost", "AdaBoost", "K-Means", "DBSCAN",
    "Hierarchical Clustering", "PCA", "LDA", "Q-Learning", "DQN",
    "GAN", "VAE", "CNN", "RNN", "LSTM", "GRU", "Transformer", "BERT",
    "GPT", "T5", "LLaMA", "Mistral", "Claude", "Autoencoder", "MLP",
    "ResNet", "Inception", "VGG", "EfficientNet", "YOLO", "SSD", "Faster R-CNN",
    "Deep Reinforcement Learning", "Meta-learning", "Few-shot Learning", "Zero-shot Learning",
    "Federated Learning", "Self-supervised Learning", "Semi-supervised Learning",
    "Supervised Learning", "Unsupervised Learning", "Transfer Learning", "Representation Learning",
    "learning algorithm", "computational model", "prediction model", "automated method",
    "data-driven technique", "neural approach", "intelligent system", "adaptive model",
    "classification technique", "regression approach", "optimization-based method",
    "probabilistic framework", "generative model", "discriminative model", "hybrid system",
    "data analytics", "pattern recognition technique", "predictive pipeline",
    "algorithmic solution", "high-capacity model", "state-of-the-art method",
    "novel approach", "ensemble method", "custom architecture", "algorithmic strategy",
    "smart system", "scalable framework", "dynamic learning scheme", "decision-making model",
    "analytical model", "computational strategy",
    "advanced computing", "intelligent analysis", "automated detection",
    "computational intelligence", "smart detection", "enhanced recognition",
    "autonomous system", "technology-driven approach", "model-based analysis",
    "hybrid learning", "deep representation", "intelligent estimation",
    "self-learning mechanism", "automatic classification", "high-performance model",
    "synthetic model generation", "context-aware learning", "domain adaptation",
    "feature extraction technique", "latent variable model",
    "probabilistic inference", "statistical modeling", "parameter tuning",
    "optimization routine", "hyperparameter selection", "loss minimization",
    "backpropagation", "likelihood estimation", "training and testing phase",
    "evaluation metrics", "convergence behavior", "stochastic process",
    "sampling techniques", "cross-validation", "gradient descent",
    "regularization method", "overfitting prevention", "feature engineering",
    "dimensionality reduction", "noise reduction technique"
]

# -------------------------------
# 🧠 Rewrite Prompt Template
# -------------------------------
def make_rewrite_prompt(summary, term_list):
    """Return the rewrite prompt for one summary.

    Args:
        summary: Text to be reviewed; appended after "Here is the summary:".
        term_list: Iterable of example phrases listed in the prompt, comma-separated.

    Returns:
        The prompt string; it starts and ends with a newline.
    """
    # One list entry per prompt line; the empty first and last entries produce
    # the leading and trailing newline once everything is joined.
    prompt_lines = [
        "",
        "You are reviewing an extracted summary describing the research idea and objective of a paper. The summary should be free of references related to Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL), or any related computational terms/phrases.",
        "",
        "These references can be direct, vague, euphemisms, or synonyms terms/phrases such as: " + ", ".join(term_list),
        "",
        "Your task:",
        "1. Go sentence by sentence.",
        "2. If a sentence contains any such reference, rewrite it to remove or neutralize the term while keeping the original research idea or objective intact.",
        "3. If a sentence is already clean of any Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL) language, keep it unchanged.",
        "4. At the end, return a clean paragraph that captures the same research idea and objective without any AI/ML/DL-related language.",
        "",
        "Only return the final cleaned paragraph. Do not explain your changes.",
        "",
        "Here is the summary:",
        f"{summary}",
        "",
    ]
    return "\n".join(prompt_lines)

# -------------------------------
# 🔁 Main Processing Loop
# -------------------------------
def _request_rewrite(prompt, max_attempts=3, timeout=120):
    """POST one prompt to API_URL and return the stripped reply text, retrying transient errors.

    HTTP 429 and 5xx responses are retried with a growing pause. The last
    attempt's error is raised to the caller.
    """
    payload = {
        "model": MODEL,
        "max_tokens": 1024,
        "messages": [{"role": "user", "content": prompt}]
    }
    for attempt in range(1, max_attempts + 1):
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
        retryable = response.status_code == 429 or response.status_code >= 500
        if retryable and attempt < max_attempts:
            time.sleep(SLEEP_BETWEEN_REQUESTS * 2 ** attempt)
            continue
        response.raise_for_status()
        return response.json()["content"][0]["text"].strip()
    raise ValueError("max_attempts must be at least 1")


def detect_and_rewrite(input_csv, output_csv_all, output_csv_clean, output_csv_leaked):
    """Rewrite every stage-1 summary via the API and split rows by whether the text changed.

    This variant prints one line per row instead of showing a progress bar.
    A row is flagged "leakage detected" when the returned text differs from the
    stripped input summary. Rows whose request still fails after the retries
    are reported on stdout and left out of all three output files.

    Args:
        input_csv: CSV with the columns "abstract", "verified methods list" and
            "masked extraction" ("domain" and "title" are copied when present).
        output_csv_all: Destination for every processed row.
        output_csv_clean: Destination for rows returned unchanged.
        output_csv_leaked: Destination for rows the model rewrote.

    Returns:
        None. The three CSV files are written as a side effect.
    """
    df = pd.read_csv(input_csv)

    # Create the output folders before the long request loop so a missing
    # folder cannot discard the collected results at the very end.
    for path in (output_csv_all, output_csv_clean, output_csv_leaked):
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Column names as written to disk ("inital" spelling kept: it is the
    # existing output schema).
    columns = [
        "domain", "title", "abstract", "verified methods list",
        "inital masked extraction", "final masked extraction", "leakage detected",
    ]
    results = []

    for idx, row in df.iterrows():
        abstract = str(row["abstract"])
        methods = str(row["verified methods list"])
        masked_extraction = str(row["masked extraction"])

        try:
            print(f"🔍 [{idx}] Rewriting if needed...")

            prompt = make_rewrite_prompt(masked_extraction, term_list)
            # Claude API call (with timeout and bounded retry)
            cleaned_summary = _request_rewrite(prompt)

            # Detect if any rewrite occurred; any textual difference counts.
            leakage_detected = masked_extraction.strip() != cleaned_summary

            results.append({
                "domain": row.get("domain", ""),
                "title": row.get("title", ""),
                "abstract": abstract,
                "verified methods list": methods,
                "inital masked extraction": masked_extraction,
                "final masked extraction": cleaned_summary,
                "leakage detected": leakage_detected
            })

            time.sleep(SLEEP_BETWEEN_REQUESTS)

        except Exception as e:
            # Best-effort: report and continue so one bad row does not end the run.
            print(f"❌ Error on row {idx}: {e}")
            continue

    # Save all results. Passing the columns explicitly keeps the frame
    # well-formed when no row succeeded, so the filters below cannot raise KeyError.
    df_out = pd.DataFrame(results, columns=columns)
    df_out.to_csv(output_csv_all, index=False)
    print(f"\n✅ All results saved to: {output_csv_all}")

    leak_mask = df_out["leakage detected"].astype(bool)

    # Save clean summaries
    clean_df = df_out[~leak_mask]
    clean_df.to_csv(output_csv_clean, index=False)
    print(f"✅ Clean summaries saved to: {output_csv_clean} ({len(clean_df)} rows)")

    # Save rewritten summaries
    leaked_df = df_out[leak_mask]
    leaked_df.to_csv(output_csv_leaked, index=False)
    print(f"🚨 Rewritten (previously leaked) summaries saved to: {output_csv_leaked} ({len(leaked_df)} rows)")

In [19]:
# -------------------------------
# 🎬 Run
# -------------------------------
# Re-runs the rewrite pass using the definitions from the cell above.
# NOTE(review): the recorded output below shows the prompt text being printed,
# which the detect_and_rewrite shown above does not do; the saved output looks
# like it comes from an earlier version of that cell. Confirm before relying on it.
if __name__ == "__main__":
    detect_and_rewrite(INPUT_CSV, OUTPUT_CSV_ALL, OUTPUT_CSV_CLEAN, OUTPUT_CSV_LEAKED)

🔍 [0] Rewriting if needed...

---------------------

You are reviewing an extracted summary describing the research idea and objective of a paper. The summary should be free of references related to Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL), or any related computational terms/phrases.

These references can be direct, vague, euphemisms, or synonyms terms/phrases such as: Support Vector Machine, Decision Tree, Random Forest, Naive Bayes, K-Nearest Neighbors, Logistic Regression, Linear Regression, Gradient Boosting, XGBoost, LightGBM, CatBoost, AdaBoost, K-Means, DBSCAN, Hierarchical Clustering, PCA, LDA, Q-Learning, DQN, GAN, VAE, CNN, RNN, LSTM, GRU, Transformer, BERT, GPT, T5, LLaMA, Mistral, Claude, Autoencoder, MLP, ResNet, Inception, VGG, EfficientNet, YOLO, SSD, Faster R-CNN, Deep Reinforcement Learning, Meta-learning, Few-shot Learning, Zero-shot Learning, Federated Learning, Self-supervised Learning, Semi-supervised Learning, Supervised Learning, Un