# SEMANTIC SCHOLAR SEARCH

In [59]:
import requests
import json

# Specify the search term
query = '"WHO/HAI"'

# Define the API endpoint URL
url = "http://api.semanticscholar.org/graph/v1/paper/search/bulk"

# Define the query parameters
query_params = {
    "query": query,
    "limit": 1000,  # Adjust this as needed
    "offset": 0,  # Start from the beginning
    "fields": "paperId,corpusId,externalIds,title,abstract,authors,year,publicationTypes,publicationDate,venue,fieldsOfStudy,isOpenAccess,openAccessPdf,url",
    "year": "1999-"  # Filters papers published in 1999 and onward
}

# Send the API request
response = requests.get(url, params=query_params)

# Check the response status
if response.status_code == 200:
    response_data = response.json()

    # Print the raw response data to check its structure
    print(f"Response Content: {response_data}")

    # Initialize the retrieved counter
    retrieved = 0

    # Write results to a JSON file and handle pagination
    with open("papers1.json", "a") as file:
        while True:
            if "data" in response_data:
                retrieved += len(response_data["data"])
                print(f"Retrieved {retrieved} papers...")

                # Save the papers to file
                for paper in response_data["data"]:
                    json.dump(paper, file, indent=2)
                    file.write("\n")

            # Check for the continuation token to fetch more data
            if "token" not in response_data:
                break  # No more data to fetch

            # Fetch the next page using the continuation token
            response_data = requests.get(f"{url}?token={response_data['token']}").json()

    print(f"Done! Retrieved {retrieved} papers total")

else:
    print(f"Request failed with status code {response.status_code}")


Retrieved 123 papers...
Done! Retrieved 123 papers total


# LOADING DATABASE

In [28]:
from pymongo import MongoClient

# Replace the connection string with your own
client = MongoClient("mongodb+srv://devsiweb:eLvWrjKtsrnJcNBD@whohaitest.oka8zkf.mongodb.net/")

# Access your database and collection
db = client["WHO-HAI_Papers"]  # Replace 'myDatabase' with the name of your database
collection = db["Surveys"]  # Replace 'papers' with the name of your collection


In [25]:
papers = collection.find({})  # Empty filter to get all papers

for paper in papers:
    print(paper)

{'_id': ObjectId('680f74376755013e8ac50c7c'), 'paperId': '13ee1cbd8bcf1ef887b2d1547ebb5b2d1406b2dd', 'url': 'https://www.semanticscholar.org/paper/13ee1cbd8bcf1ef887b2d1547ebb5b2d1406b2dd', 'title': 'Monitoring the impact of regulatory measures on medicine pricing in Thailand: an observation over a 16-year span', 'openAccessPdf': {'url': '', 'status': None, 'license': 'CCBYNC'}, 'publicationTypes': ['JournalArticle', 'Review'], 'publicationDate': '2025-02-24'}
{'_id': ObjectId('680f74376755013e8ac50c7d'), 'paperId': '1ce57a4b3320081b1b577b01ed72803137e94814', 'url': 'https://www.semanticscholar.org/paper/1ce57a4b3320081b1b577b01ed72803137e94814', 'title': 'Affordability of Paediatric Oral Anti-Infective Medicines in a Selected District, Sri Lanka', 'openAccessPdf': {'url': 'https://www.mdpi.com/2813-0618/3/1/11/pdf?version=1710252941', 'status': 'GOLD', 'license': 'CCBY'}, 'publicationTypes': ['JournalArticle'], 'publicationDate': '2024-03-12'}
{'_id': ObjectId('680f74376755013e8ac50c7

In [17]:
query = {
    "year": {"$gte": 2023},  # Papers published in or after 2023
    "isOpenAccess": True  # Papers that are open access
}

papers = collection.find(query)

for paper in papers:
    print(paper)

{'_id': ObjectId('680f74376755013e8ac50c9b'), 'paperId': '1ce57a4b3320081b1b577b01ed72803137e94814', 'url': 'https://www.semanticscholar.org/paper/1ce57a4b3320081b1b577b01ed72803137e94814', 'title': 'Affordability of Paediatric Oral Anti-Infective Medicines in a Selected District, Sri Lanka', 'abstract': 'In this cross-sectional descriptive study conducted in the Ratnapura district, Sri Lanka, we assessed the affordability of oral pediatric anti-infective medicines (OPAIMs). Using a modified WHO/HAI medicinal price methodology, we examined the availability, median price ratios (MPRs), mean percentage difference, and affordability of the standard treatment of the originator brand (OB) and lowest-priced generic (LPG) OPAIMs in 30 private and 2 state-owned pharmacies. The study revealed disparities in availability, with only 50% of private pharmacies offering all 11 medicinal drugs in their generic form. The MPRs of OPAIMs for OB and LPG varied, with three drugs exceeding the financially 

In [18]:
papers = collection.find(query).limit(5)  # Get only 5 results

for paper in papers:
    print(paper)


{'_id': ObjectId('680f74376755013e8ac50c9b'), 'paperId': '1ce57a4b3320081b1b577b01ed72803137e94814', 'url': 'https://www.semanticscholar.org/paper/1ce57a4b3320081b1b577b01ed72803137e94814', 'title': 'Affordability of Paediatric Oral Anti-Infective Medicines in a Selected District, Sri Lanka', 'abstract': 'In this cross-sectional descriptive study conducted in the Ratnapura district, Sri Lanka, we assessed the affordability of oral pediatric anti-infective medicines (OPAIMs). Using a modified WHO/HAI medicinal price methodology, we examined the availability, median price ratios (MPRs), mean percentage difference, and affordability of the standard treatment of the originator brand (OB) and lowest-priced generic (LPG) OPAIMs in 30 private and 2 state-owned pharmacies. The study revealed disparities in availability, with only 50% of private pharmacies offering all 11 medicinal drugs in their generic form. The MPRs of OPAIMs for OB and LPG varied, with three drugs exceeding the financially 

In [19]:
papers = collection.find(query).sort("citationCount", -1)  # -1 for descending order

for paper in papers:
    print(paper)


{'_id': ObjectId('680f74376755013e8ac50cc7'), 'paperId': '73b547d96d0f297b473d6ec3dde68a5141dabb5e', 'externalIds': {'PubMedCentral': '11449349', 'DOI': '10.1371/journal.pone.0309350', 'CorpusId': 273099358, 'PubMed': '39361609'}, 'corpusId': 273099358, 'url': 'https://www.semanticscholar.org/paper/73b547d96d0f297b473d6ec3dde68a5141dabb5e', 'title': 'Availability, price and affordability of insulin, delivery devices and self-monitoring blood glucose devices in Indonesia', 'abstract': 'Insulin is essential for the survival of people with type 1 diabetes and for better management of people with type 2 diabetes. People with diabetes using insulin also require self-monitoring blood glucose (SMBG) devices (e.g., meters, strips, continuous monitoring systems) for day-to-day management. It is essential to ensure that insulin and these devices are available and affordable. This study aimed to evaluate the availability, price, and affordability of insulin and SMBG devices in Indonesia using an 

In [20]:
pipeline = [
    {"$group": {"_id": "$venue", "count": {"$sum": 1}}},  # Group by venue and count papers
    {"$sort": {"count": -1}}  # Sort by count in descending order
]

result = collection.aggregate(pipeline)

for venue in result:
    print(venue)


{'_id': None, 'count': 30}
{'_id': 'PLoS ONE', 'count': 6}
{'_id': 'BMC Health Services Research', 'count': 4}
{'_id': 'Global Health Research and Policy', 'count': 4}
{'_id': 'Journal of Pharmaceutical Policy and Practice', 'count': 4}
{'_id': 'Frontiers in Public Health', 'count': 4}
{'_id': 'Risk Management and Healthcare Policy', 'count': 2}
{'_id': 'Scientific Reports', 'count': 2}
{'_id': 'International Journal for Equity in Health', 'count': 2}
{'_id': 'medRxiv', 'count': 2}
{'_id': 'Indian Journal of Ophthalmology', 'count': 2}
{'_id': 'Annals of the New York Academy of Sciences', 'count': 2}
{'_id': 'Ghana Pharmaceutical Journal', 'count': 2}
{'_id': 'Journal of Chinese Pharmaceutical Sciences', 'count': 2}
{'_id': 'BMC Pulmonary Medicine', 'count': 2}
{'_id': 'Journal of Clinical Pharmacy and Therapeutics', 'count': 2}
{'_id': 'BioMed Research International', 'count': 2}
{'_id': '', 'count': 2}
{'_id': 'FARMAKOEKONOMIKA. Modern Pharmacoeconomics and Pharmacoepidemiology', 'co

In [21]:
query = {"paperId": "649def34f8be52c8b66281af98ae884c09aef38b"}  # The paper you're updating
new_values = {"$set": {"reviewed": True}}  # Set the 'reviewed' field to True

collection.update_one(query, new_values)


UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff00000000000000b6'), 'opTime': {'ts': Timestamp(1745843673, 5), 't': 182}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1745843673, 5), 'signature': {'hash': b'X\xe6\x84n2\xb7\xf2AoD\x80\xc0\x11%\xfb\x07\xdf\xdap\xd2', 'keyId': 7463129830037389313}}, 'operationTime': Timestamp(1745843673, 5), 'updatedExisting': False}, acknowledged=True)

In [22]:
query = {"paperId": "649def34f8be52c8b66281af98ae884c09aef38b"}  # The paper you're updating
new_values = {"$set": {"reviewed": True}}  # Set the 'reviewed' field to True

collection.update_one(query, new_values)


UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff00000000000000b6'), 'opTime': {'ts': Timestamp(1745843685, 14), 't': 182}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1745843685, 14), 'signature': {'hash': b'u\xa7ES\xbf\xe9\x9a.,\xe8\xe2c\xaaJ\xac%&\xe6\x92V', 'keyId': 7463129830037389313}}, 'operationTime': Timestamp(1745843685, 14), 'updatedExisting': False}, acknowledged=True)

In [26]:
import json

papers = collection.find({})  # Or use your query
with open('papers_mongoexport.json', 'w') as file:
    json.dump([paper for paper in papers], file, indent=2)


TypeError: Object of type ObjectId is not JSON serializable

In [27]:
papers = collection.find({}, {"title": 1, "authors": 1})  # Only retrieve the title and authors fields

for paper in papers:
    print(paper)


{'_id': ObjectId('680f74376755013e8ac50c7c'), 'title': 'Monitoring the impact of regulatory measures on medicine pricing in Thailand: an observation over a 16-year span'}
{'_id': ObjectId('680f74376755013e8ac50c7d'), 'title': 'Affordability of Paediatric Oral Anti-Infective Medicines in a Selected District, Sri Lanka'}
{'_id': ObjectId('680f74376755013e8ac50c7e'), 'title': 'Analysis of the availability of bronchodilators and anti-inflammatory drugs for patients with chronic obstructive pulmonary disease'}
{'_id': ObjectId('680f74376755013e8ac50c7f'), 'title': 'Evaluating the affordability of asthma, chronic obstructive pulmonary disease, and cystic fibrosis medicines in a middle-income country'}
{'_id': ObjectId('680f74376755013e8ac50c80'), 'title': 'Availability and affordability of osteoporosis treatment drugs in Wuhan based on the WHO/HAI standard survey method'}
{'_id': ObjectId('680f74376755013e8ac50c81'), 'title': 'Access to essential medicines used in the management of noncommun

In [60]:
import pandas as pd
import json
from fuzzywuzzy import fuzz  # Using fuzzywuzzy for fuzzy string matching
import re  # For regex pattern matching

# Load the country-region mapping Excel file
mapping_file = 'data_export_REFMART_REF_COUNTRY.xlsx'
df_mapping = pd.read_excel(mapping_file)

# Create a normalized dictionary for country name variants to official info
country_info_mapping = {}

# Normalize function
def normalize_country_name(name):
    if isinstance(name, str):
        return name.strip().lower()
    return None

# Build the mapping dictionary with all variants pointing to official info
for idx, row in df_mapping.iterrows():
    short_name = row['NAME_SHORT_EN']
    who_region = row['GRP_WHO_REGION']
    wb_income = row['GRP_WB_INCOME']
    
    # Get all potential name variants for this country
    variants = [
        row['NAME_laymen'], row['NAME_SHORT_EN'], row['NAME_FORMAL_EN'],
        row['NAME_SHORT_AR'], row['NAME_FORMAL_AR'], row['NAME_SHORT_ES'],
        row['NAME_FORMAL_ES'], row['NAME_SHORT_FR'], row['NAME_FORMAL_FR'],
        row['NAME_SHORT_RU'], row['NAME_FORMAL_RU'], row['NAME_SHORT_ZH'],
        row['NAME_FORMAL_ZH']
    ]
    
    # Add each variant to the mapping dictionary
    for variant in variants:
        normalized_variant = normalize_country_name(variant)
        if normalized_variant:
            country_info_mapping[normalized_variant] = {
                'official_name': short_name,
                'who_region': who_region,
                'wb_income': wb_income
            }

# Add specific city/region to country mappings
city_to_country = {
    'wuhan': 'China',
    'south wollo': 'Ethiopia',
    'addis ababa': 'Ethiopia',
    'gondar': 'Ethiopia',
    'gamo zone': 'Ethiopia',
    'arba minch': 'Ethiopia',
    'central ethiopia': 'Ethiopia',
    'shandong': 'China',
    'shaanxi': 'China',
    'juba': 'South Sudan',
    'korle-bu': 'Ghana',
    'haryana': 'India',
    'ratnapura': 'Sri Lanka',
    'tororo': 'Uganda',
    'apac': 'Uganda',
    'kabale': 'Uganda',
    'mbarara': 'Uganda'
}

# Add these city mappings to the country info mapping
for city, country in city_to_country.items():
    # Find the country info from our existing mapping
    country_normalized = normalize_country_name(country)
    if country_normalized in country_info_mapping:
        country_info = country_info_mapping[country_normalized]
        country_info_mapping[city] = country_info

# Load and parse JSON file
with open('papers1.json', 'r') as f:
    file_content = f.read()

# Clean and load JSON data
file_content_cleaned = "[" + file_content.replace("}\n{", "},{") + "]"
papers = json.loads(file_content_cleaned)

# Convert papers data into a DataFrame
papers_df = pd.DataFrame(papers)

# Ensure text fields are strings
papers_df['abstract'] = papers_df['abstract'].fillna("").astype(str)
papers_df['title'] = papers_df['title'].fillna("").astype(str)

# Function to extract country from text using fuzzy matching
def extract_countries_with_info(text):
    if not text.strip():
        return [{
            'extracted': "Unknown",
            'official_name': "Unknown",
            'who_region': "Unknown",
            'wb_income': "Unknown"
        }]
    
    found_countries = []
    seen_official_names = set()  # To avoid duplicates
    
    text_lower = text.lower()
    
    # Special handling for South Sudan vs Sudan
    # Check for "South Sudan" first
    if re.search(r'\bsouth\s+sudan\b', text_lower):
        # If "South Sudan" is found, use its info
        if 'south sudan' in country_info_mapping:
            info = country_info_mapping['south sudan']
            found_countries.append({
                'extracted': 'south sudan',
                'official_name': info['official_name'],
                'who_region': info['who_region'],
                'wb_income': info['wb_income']
            })
            seen_official_names.add(info['official_name'])
    
    # Then check for "Sudan" that's not part of "South Sudan"
    elif re.search(r'\bsudan\b', text_lower) and not re.search(r'\bsouth\s+sudan\b', text_lower):
        if 'sudan' in country_info_mapping:
            info = country_info_mapping['sudan']
            found_countries.append({
                'extracted': 'sudan',
                'official_name': info['official_name'],
                'who_region': info['who_region'],
                'wb_income': info['wb_income']
            })
            seen_official_names.add(info['official_name'])
    
    # List of location terms to check (exact matches)
    locations_to_check = list(city_to_country.keys()) + list(country_info_mapping.keys())
    
    # Check for exact matches
    for location in locations_to_check:
        # Skip Sudan and South Sudan as we've already handled them specially
        if location in ['sudan', 'south sudan']:
            continue
            
        if location in text_lower and location in country_info_mapping:
            info = country_info_mapping[location]
            if info['official_name'] not in seen_official_names:
                found_countries.append({
                    'extracted': location,
                    'official_name': info['official_name'],
                    'who_region': info['who_region'],
                    'wb_income': info['wb_income']
                })
                seen_official_names.add(info['official_name'])
    
    # Try fuzzy matching if no exact matches found
    if not found_countries:
        for country_variant in country_info_mapping:
            # Skip Sudan and South Sudan in fuzzy matching too
            if country_variant in ['sudan', 'south sudan']:
                continue
                
            score = fuzz.partial_ratio(country_variant, text_lower)
            if score > 85:  # Higher threshold for more precision
                info = country_info_mapping[country_variant]
                if info['official_name'] not in seen_official_names:
                    found_countries.append({
                        'extracted': country_variant,
                        'official_name': info['official_name'],
                        'who_region': info['who_region'],
                        'wb_income': info['wb_income']
                    })
                    seen_official_names.add(info['official_name'])
    
    return found_countries if found_countries else [{
        'extracted': "Unknown",
        'official_name': "Unknown",
        'who_region': "Unknown",
        'wb_income': "Unknown"
    }]

# Apply the extraction function to the combined title and abstract
papers_df['country_info'] = papers_df.apply(
    lambda row: extract_countries_with_info(row['title'] + ' ' + row['abstract']), 
    axis=1
)

# Extract the individual components into separate columns
papers_df['extracted_country'] = papers_df['country_info'].apply(
    lambda info_list: [item['extracted'] for item in info_list]
)
papers_df['official_country_name'] = papers_df['country_info'].apply(
    lambda info_list: [item['official_name'] for item in info_list]
)
papers_df['WHO_region'] = papers_df['country_info'].apply(
    lambda info_list: [item['who_region'] for item in info_list]
)
papers_df['WB_income_group'] = papers_df['country_info'].apply(
    lambda info_list: [item['wb_income'] for item in info_list]
)

# Display the results
result_df = papers_df[['title', 'abstract', 'extracted_country', 'official_country_name', 'WHO_region', 'WB_income_group']]
print(result_df.head(10))

# Save to CSV if needed
result_df.to_csv('papers1_with_country_info.csv', index=False)

                                                                                                                                                       title  \
0                                                  WHO key access antibiotics price, availability and affordability in private sector pharmacies in Pakistan   
1                                                                      Access and Affordability of Medicines in Malaysia: Need for a National Pricing Policy   
2                                                                    Price, Availability and Affordability of Anti-Cancer Medicines in Addis Ababa, Ethiopia   
3   Evaluating the impact of the single exit price policy on a basket of originator medicines in South Africa from 1999 to 2014 using a time series analysis   
4                               Availability, Prices and Affordability of Selected Essential Cancer Medicines in a Middle-Income Country: The Case of Mexico   
5                                       

In [62]:
import pandas as pd
import json
import re
from fuzzywuzzy import fuzz

# Load the country-region mapping Excel file
mapping_file = 'data_export_REFMART_REF_COUNTRY.xlsx'
df_mapping = pd.read_excel(mapping_file)

# Create a normalized dictionary for country name variants to official info
country_info_mapping = {}

# Normalize function
def normalize_country_name(name):
    if isinstance(name, str):
        return name.strip().lower()
    return None

# Build the mapping dictionary with all variants pointing to official info
for idx, row in df_mapping.iterrows():
    short_name = row['NAME_SHORT_EN']
    who_region = row['GRP_WHO_REGION']
    wb_income = row['GRP_WB_INCOME']
    
    # Get all potential name variants for this country
    variants = [
        row['NAME_laymen'], row['NAME_SHORT_EN'], row['NAME_FORMAL_EN'],
        row['NAME_SHORT_AR'], row['NAME_FORMAL_AR'], row['NAME_SHORT_ES'],
        row['NAME_FORMAL_ES'], row['NAME_SHORT_FR'], row['NAME_FORMAL_FR'],
        row['NAME_SHORT_RU'], row['NAME_FORMAL_RU'], row['NAME_SHORT_ZH'],
        row['NAME_FORMAL_ZH']
    ]
    
    # Add each variant to the mapping dictionary
    for variant in variants:
        normalized_variant = normalize_country_name(variant)
        if normalized_variant:
            country_info_mapping[normalized_variant] = {
                'official_name': short_name,
                'who_region': who_region,
                'wb_income': wb_income
            }

# Add specific city/region to country mappings
city_to_country = {
    'wuhan': 'China',
    'south wollo': 'Ethiopia',
    'addis ababa': 'Ethiopia',
    'gondar': 'Ethiopia',
    'gamo zone': 'Ethiopia',
    'arba minch': 'Ethiopia',
    'central ethiopia': 'Ethiopia',
    'southern ethiopia': 'Ethiopia',
    'northeast ethiopia': 'Ethiopia',
    'northwest ethiopia': 'Ethiopia',
    'shandong': 'China',
    'shaanxi': 'China',
    'juba county': 'South Sudan',
    'juba': 'South Sudan',
    'korle-bu': 'Ghana',
    'haryana': 'India',
    'ratnapura': 'Sri Lanka',
    'tororo': 'Uganda',
    'apac': 'Uganda',
    'kabale': 'Uganda',
    'mbarara': 'Uganda'
}

# Add these city mappings to the country info mapping
for city, country in city_to_country.items():
    country_normalized = normalize_country_name(country)
    if country_normalized in country_info_mapping:
        country_info = country_info_mapping[country_normalized]
        country_info_mapping[city] = country_info

# Load and parse JSON file
with open('papers1.json', 'r') as f:
    file_content = f.read()

# Clean and load JSON data
file_content_cleaned = "[" + file_content.replace("}\n{", "},{") + "]"
papers = json.loads(file_content_cleaned)

# Function to extract all countries from text with their info
def extract_all_countries_with_info(text):
    if not text or not text.strip():
        return [{
            'extracted': "Unknown",
            'official_name': "Unknown",
            'who_region': "Unknown",
            'wb_income': "Unknown"
        }]
    
    found_countries = []
    seen_official_names = set()  # To avoid duplicates
    
    text_lower = text.lower()
    
    # Special handling for South Sudan vs Sudan
    if re.search(r'\bsouth\s+sudan\b', text_lower):
        if 'south sudan' in country_info_mapping:
            info = country_info_mapping['south sudan']
            found_countries.append({
                'extracted': 'south sudan',
                'official_name': info['official_name'],
                'who_region': info['who_region'],
                'wb_income': info['wb_income']
            })
            seen_official_names.add(info['official_name'])
    
    elif re.search(r'\bsudan\b', text_lower) and not re.search(r'\bsouth\s+sudan\b', text_lower):
        if 'sudan' in country_info_mapping:
            info = country_info_mapping['sudan']
            found_countries.append({
                'extracted': 'sudan',
                'official_name': info['official_name'],
                'who_region': info['who_region'],
                'wb_income': info['wb_income']
            })
            seen_official_names.add(info['official_name'])
    
    # Check for exact matches
    locations_to_check = list(city_to_country.keys()) + list(country_info_mapping.keys())
    
    for location in locations_to_check:
        # Skip Sudan and South Sudan as we've already handled them specially
        if location in ['sudan', 'south sudan']:
            continue
            
        if location in text_lower and location in country_info_mapping:
            info = country_info_mapping[location]
            if info['official_name'] not in seen_official_names:
                found_countries.append({
                    'extracted': location,
                    'official_name': info['official_name'],
                    'who_region': info['who_region'],
                    'wb_income': info['wb_income']
                })
                seen_official_names.add(info['official_name'])
    
    # If no countries found with exact matches, try fuzzy matching
    if not found_countries:
        for country_variant in country_info_mapping:
            # Skip Sudan and South Sudan in fuzzy matching too
            if country_variant in ['sudan', 'south sudan']:
                continue
                
            score = fuzz.partial_ratio(country_variant, text_lower)
            if score > 85:  # Higher threshold for more precision
                info = country_info_mapping[country_variant]
                if info['official_name'] not in seen_official_names:
                    found_countries.append({
                        'extracted': country_variant,
                        'official_name': info['official_name'],
                        'who_region': info['who_region'],
                        'wb_income': info['wb_income']
                    })
                    seen_official_names.add(info['official_name'])
    
    return found_countries if found_countries else [{
        'extracted': "Unknown",
        'official_name': "Unknown",
        'who_region': "Unknown",
        'wb_income': "Unknown"
    }]

# Process each paper
for paper in papers:
    title = paper.get('title', '')
    abstract = paper.get('abstract', '')
    combined_text = f"{title} {abstract}"
    
    # Extract all countries with their info
    country_info_list = extract_all_countries_with_info(combined_text)
    
    # Extract the individual components as lists
    extracted_countries = [item['extracted'] for item in country_info_list]
    official_names = [item['official_name'] for item in country_info_list]
    who_regions = [item['who_region'] for item in country_info_list]
    wb_income_groups = [item['wb_income'] for item in country_info_list]
    
    # Add the extracted information to the paper dictionary
    paper['extracted_country'] = extracted_countries
    paper['official_country_name'] = official_names
    paper['WHO_region'] = who_regions
    paper['WB_income_group'] = wb_income_groups

# Write the updated papers back to a JSON file
with open('papers1_with_country_info.json', 'w') as f:
    # Use ensure_ascii=False to properly handle Unicode characters
    json.dump(papers, f, indent=2, ensure_ascii=False)

# Convert to DataFrame and save as CSV for easier viewing
papers_df = pd.DataFrame(papers)
papers_df.to_csv('papers1_with_country_info.csv', index=False)

print(f"Processed {len(papers)} papers with country information")
print("Sample of extracted country information:")
for i, paper in enumerate(papers[:5]):  # Show first 5 papers
    print(f"\nPaper {i+1}: {paper['title']}")
    print(f"Extracted countries: {paper['extracted_country']}")
    print(f"Official names: {paper['official_country_name']}")
    print(f"WHO regions: {paper['WHO_region']}")
    print(f"WB income groups: {paper['WB_income_group']}")

Processed 246 papers with country information
Sample of extracted country information:

Paper 1: WHO key access antibiotics price, availability and affordability in private sector pharmacies in Pakistan
Extracted countries: ['pakistan']
Official names: ['Pakistan']
WHO regions: ['EMR']
WB income groups: ['LMC']

Paper 2: Access and Affordability of Medicines in Malaysia: Need for a National Pricing Policy
Extracted countries: ['malaysia']
Official names: ['Malaysia']
WHO regions: ['WPR']
WB income groups: ['UMC']

Paper 3: Price, Availability and Affordability of Anti-Cancer Medicines in Addis Ababa, Ethiopia
Extracted countries: ['addis ababa']
Official names: ['Ethiopia']
WHO regions: ['AFR']
WB income groups: ['LIC']

Paper 4: Evaluating the impact of the single exit price policy on a basket of originator medicines in South Africa from 1999 to 2014 using a time series analysis
Extracted countries: ['south africa']
Official names: ['South Africa']
WHO regions: ['AFR']
WB income group

In [64]:
import json

# Load the JSON file
with open('papers1_with_country_info.json', 'r') as f:
    data = json.load(f)

# Remove duplicates based on 'corpusId'
unique_corpus = {}
for paper in data:
    corpus_id = paper.get('corpusId')
    if corpus_id not in unique_corpus:
        unique_corpus[corpus_id] = paper

# Get the deduplicated list
deduplicated_data = list(unique_corpus.values())

# Save the deduplicated JSON
with open('papers1_deduplicated.json', 'w') as f:
    json.dump(deduplicated_data, f, indent=2)

print(f"Finished! {len(data)} -> {len(deduplicated_data)} entries after deduplication.")


Finished! 246 -> 123 entries after deduplication.
