<a href="https://colab.research.google.com/github/ykitaguchi77/AI-Article-Writer/blob/main/Pubmed_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**PubMed論文分類**

1. PubMedで検索したものをsave (all results, abstract<text>)してダウンロードする

2. article_list.txtとリネームしてcontent配下にアップロードする

In [13]:
import re
import csv
from typing import List, Dict

def split_into_items(text: str) -> List[str]:
    """Split the text into items based on double newlines."""
    return [item.strip() for item in re.split(r'\n\n+', text) if item.strip()]

def clean_authors(authors: str) -> str:
    """Remove numbered brackets from the authors string."""
    return re.sub(r'\(\d+\)', '', authors).replace('  ', ' ').strip()

def extract_article_info(items: List[str]) -> Dict[str, str]:
    """Extract relevant information from a list of items."""
    info = {
        'Title': '',
        'Authors': '',
        'DOI': '',
        'PMID': '',
        'Abstract': ''
    }

    abstract_keywords = r'PURPOSE|BACKGROUND|OBJECTIVE|ABSTRACT|OBJECTIVES|case|CONCLUSION'

    for i, item in enumerate(items):
        if re.search(r'\(\d+\)[,.]', item):
            info['Authors'] = clean_authors(item)
            if i > 1:
                info['Title'] = items[i-1]
        elif re.match(abstract_keywords, item, re.IGNORECASE) and item != info['Title']:
            info['Abstract'] = item

    # Extract DOI and PMID from the full text
    full_text = '\n'.join(items)
    doi_match = re.search(r'DOI: (.*?)(?:\n|$)', full_text)
    if doi_match:
        info['DOI'] = doi_match.group(1).strip()

    pmid_match = re.search(r'PMID: (\d+)', full_text)
    if pmid_match:
        info['PMID'] = pmid_match.group(1).strip()

    return info

def parse_pubmed_text(text: str) -> List[Dict[str, str]]:
    """Parse the entire PubMed text and extract information for each article."""
    articles = re.split(r'\n\n\d+\.', text)[1:]  # Split the text into individual articles
    return [extract_article_info(split_into_items(article.strip())) for article in articles]

def write_to_csv(articles: List[Dict[str, str]], filename: str):
    """Write the extracted information to a CSV file."""
    fieldnames = ['Title', 'Authors', 'DOI', 'PMID', 'Abstract']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for article in articles:
            writer.writerow(article)

# Main execution
if __name__ == "__main__":
    # Read the input file
    with open('article_list.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Parse the text and extract article information
    articles = parse_pubmed_text(text)

    # Write the extracted information to a CSV file
    write_to_csv(articles, 'pubmed_articles.csv')

    print(f"CSV file 'pubmed_articles.csv' has been created with {len(articles)} articles.")

CSV file 'pubmed_articles.csv' has been created with 7064 articles.
