<a href="https://colab.research.google.com/github/ykitaguchi77/AI-Article-Writer/blob/main/Pubmed_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**PubMed論文分類**

1. PubMedで検索したものをsave (all results, abstract<text>)してダウンロードする

2. article_list.txtとリネームしてcontent配下にアップロードする

In [1]:
import re
import csv
from typing import List, Dict
import pandas as pd

def split_into_items(text: str) -> List[str]:
    """Split the text into items based on double newlines."""
    return [item.strip() for item in re.split(r'\n\n+', text) if item.strip()]

def clean_authors(authors: str) -> str:
    """Remove numbered brackets from the authors string."""
    return re.sub(r'\(\d+\)', '', authors).replace('  ', ' ').strip()

def extract_article_info(items: List[str]) -> Dict[str, str]:
    """Extract relevant information from a list of items."""
    info = {
        'Title': '',
        'Authors': '',
        'DOI': '',
        'PMID': '',
        'Abstract': ''
    }

    abstract_keywords = r'PURPOSE|BACKGROUND|OBJECTIVE|ABSTRACT|OBJECTIVES|case|CONCLUSION'

    for i, item in enumerate(items):
        if re.search(r'\(\d+\)[,.]', item):
            info['Authors'] = clean_authors(item)
            if i > 0:
                info['Title'] = items[i-1]
        elif re.match(abstract_keywords, item, re.IGNORECASE) and item != info['Title']:
            info['Abstract'] = item

    # Extract DOI and PMID from the full text
    full_text = '\n'.join(items)
    doi_match = re.search(r'DOI: (.*?)(?:\n|$)', full_text)
    if doi_match:
        info['DOI'] = doi_match.group(1).strip()

    pmid_match = re.search(r'PMID: (\d+)', full_text)
    if pmid_match:
        info['PMID'] = pmid_match.group(1).strip()

    return info

def parse_pubmed_text(text: str) -> List[Dict[str, str]]:
    """Parse the entire PubMed text and extract information for each article."""
    articles = re.split(r'\n\n\d+\.', text)[1:]  # Split the text into individual articles
    return [extract_article_info(split_into_items(article.strip())) for article in articles]

def write_to_csv(df: pd.DataFrame, filename: str):
    """Write the DataFrame to a CSV file."""
    df.to_csv(filename, index=False, encoding='utf-8')

# Main execution
if __name__ == "__main__":
    # Read the input file
    with open('article_list.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Parse the text and extract article information
    articles = parse_pubmed_text(text)

    # Create a DataFrame
    df = pd.DataFrame(articles)

    # # Display the first few rows of the DataFrame
    # print(df.head())

    # # Display basic information about the DataFrame
    # print(df.info())

    # Write the DataFrame to a CSV file
    write_to_csv(df, 'pubmed_articles.csv')

    print(f"CSV file 'pubmed_articles.csv' has been created with {len(articles)} articles.")

    df

FileNotFoundError: [Errno 2] No such file or directory: 'article_list.txt'

#**Seach in PubMed using Biopython**

In [2]:
# 必要なライブラリをインストール
!pip install biopython

# ライブラリをインポート
from Bio import Entrez
import pandas as pd

# Entrezにメールアドレスを設定（必須）
Entrez.email = "your_email@example.com"  # 自分のメールアドレスに変更してください

def search_pubmed(query, max_results=10):
    # PubMedで検索を実行
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()

    # 検索結果からPMIDを取得
    id_list = record["IdList"]

    results = []
    for pmid in id_list:
        # 各論文の詳細情報を取得
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        record = handle.read()
        handle.close()

        # 必要な情報を抽出
        title = next((line for line in record.split('\n') if line.startswith('TI  - ')), 'N/A')[6:]
        authors = next((line for line in record.split('\n') if line.startswith('AU  - ')), 'N/A')[6:]
        journal = next((line for line in record.split('\n') if line.startswith('TA  - ')), 'N/A')[6:]
        pub_date = next((line for line in record.split('\n') if line.startswith('DP  - ')), 'N/A')[6:]

        results.append({
            'PMID': pmid,
            'Title': title,
            'Authors': authors,
            'Journal': journal,
            'Publication Date': pub_date
        })

    return pd.DataFrame(results)

# 検索クエリを設定
search_query = "artificial intelligence in healthcare"  # 検索したいキーワードに変更してください
max_results = 20  # 取得したい結果の最大数

# 検索を実行
results_df = search_pubmed(search_query, max_results)

# 結果を表示
print(results_df)

# 結果をCSVファイルとして保存
results_df.to_csv('pubmed_search_results.csv', index=False)
print("Results saved to 'pubmed_search_results.csv'")

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.84
        PMID                                              Title       Authors  \
0   39042446  Roles, Users, Benefits, and Limitations of Cha...    Laymouna M   
1   39042233  Artificial intelligence-derived left ventricul...    Kuwahara A   
2   39041940  Screening Outcomes of Mammography with AI in D...         Ha SM   
3   39041628  Artificial Intelligence Applications in Oral C...       Viet CT   
4   39041503  The utility of artificial intelligence in iden...    Nxumalo ZZ   
5   39041284  Design, construction, and validation of obstet...     Soares FM   
6   39041057  Automated algorithm for medical data structuri...  Nainamalai V   
7   39039992  Time Series AI Model f

In [12]:
!pip install crossrefapi

Collecting crossrefapi
  Downloading crossrefapi-1.6.0-py3-none-any.whl (14 kB)
Collecting urllib3==1.26.16 (from crossrefapi)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: urllib3, crossrefapi
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.7
    Uninstalling urllib3-2.0.7:
      Successfully uninstalled urllib3-2.0.7
Successfully installed crossrefapi-1.6.0 urllib3-1.26.16


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from crossref.restful import Works
import plotly.graph_objs as go
from Bio import Entrez
import networkx as nx

# Entrezの設定（自分のメールアドレスを入力してください）
Entrez.email = "your_email@example.com"

def search_pubmed(query, max_results=100):
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    return record["IdList"]

def get_doi_from_pmid(pmid):
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    records = Entrez.read(handle)
    try:
        return records['PubmedArticle'][0]['PubmedData']['ArticleIdList'][[x.attributes['IdType'] for x in records['PubmedArticle'][0]['PubmedData']['ArticleIdList']].index('doi')].title()
    except:
        return None

graph_nodes = {}
graph_edges = {}

def create_citation_network(pmids):
    works = Works()
    for pmid in tqdm(pmids):
        doi = get_doi_from_pmid(pmid)
        if doi:
            paper = works.doi(doi)
            if paper is not None:
                graph_nodes[doi] = {
                    "title": paper.get("title", [""])[0],
                    "author": paper.get("author", [{"family": "", "given": ""}])[0]["family"] + ", " + paper.get("author", [{"family": "", "given": ""}])[0]["given"],
                    "citedByCount": paper.get("is-referenced-by-count", 0),
                    "referenceCount": paper.get("reference-count", 0),
                    "inPubMed": True
                }

                if "reference" in paper:
                    for ref in paper["reference"]:
                        if "DOI" in ref:
                            ref_doi = ref["DOI"]
                            if ref_doi not in graph_nodes:
                                ref_paper = works.doi(ref_doi)
                                if ref_paper is not None:
                                    graph_nodes[ref_doi] = {
                                        "title": ref_paper.get("title", [""])[0],
                                        "author": ref_paper.get("author", [{"family": "", "given": ""}])[0]["family"] + ", " + ref_paper.get("author", [{"family": "", "given": ""}])[0]["given"],
                                        "citedByCount": ref_paper.get("is-referenced-by-count", 0),
                                        "referenceCount": ref_paper.get("reference-count", 0),
                                        "inPubMed": False
                                    }

                            if doi not in graph_edges:
                                graph_edges[doi] = []
                            graph_edges[doi].append(ref_doi)

def visualize_graph(is_only_pubmed=False):
    G = nx.Graph()

    impact_factor = []
    impact_size = []
    is_pubmed_color = []

    for key, node in graph_nodes.items():
        if not is_only_pubmed or (is_only_pubmed and node["inPubMed"]):
            G.add_node(key)
            G.nodes[key]["info"] = node
            impact_factor.append(node["citedByCount"])
            impact_size.append(min(node["citedByCount"] + 10, 40))
            is_pubmed_color.append(1 if node["inPubMed"] else 0)

    for key, edges in graph_edges.items():
        if not is_only_pubmed or (is_only_pubmed and graph_nodes[key]["inPubMed"]):
            for edge in edges:
                if edge in graph_nodes:
                    G.add_edge(key, edge)

    pos = nx.spring_layout(G, k=0.3, seed=1)
    for node in G.nodes():
        G.nodes[node]["pos"] = pos[node]

    edge_x, edge_y = [], []
    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        mode='lines')

    node_x, node_y = [], []
    for node in G.nodes():
        x, y = G.nodes[node]['pos']
        node_x.append(x)
        node_y.append(y)

    node_info = []
    for n in G.nodes():
        info = G.nodes[n]["info"]
        node_info.append(f"Title: {info['title']}<br>Author: {info['author']}<br>Cited by: {info['citedByCount']}<br>References: {info['referenceCount']}<br>In PubMed: {info['inPubMed']}")

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_info,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=False,
            color=is_pubmed_color,
            size=impact_size,
            colorbar=dict(
                thickness=15,
                title='1: in PubMed, 0: not in PubMed',
                xanchor='left',
                titleside='right'
            ),
            line_width=2))

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title='PubMed Citation Network',
                        titlefont_size=16,
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20,l=5,r=5,t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    fig.update_layout(height=1000)
    fig.show()

def main():
    query = input("PubMedで検索するキーワードを入力してください: ")
    max_results = int(input("取得する最大論文数を入力してください: "))

    print("PubMedを検索中...")
    pmids = search_pubmed(query, max_results)

    print(f"{len(pmids)}件の論文が見つかりました。引用ネットワークを作成中...")
    create_citation_network(pmids)

    print("ネットワークを可視化中...")
    visualize_graph()

if __name__ == "__main__":
    main()

PubMedで検索するキーワードを入力してください: blepharoplasty
取得する最大論文数を入力してください: 3
PubMedを検索中...
3件の論文が見つかりました。引用ネットワークを作成中...


100%|██████████| 3/3 [00:21<00:00,  7.23s/it]

ネットワークを可視化中...



