# SNOMED to CUI

### 2023-07-06

SNOMED codes downloaded from UMLS.

In [None]:
# Read the SNOMED table (downloaded from UMLS) into a DataFrame.
import pandas as pd

snomed = pd.read_csv(filepath_or_buffer="SNOMED.csv")

#### Drop unnecessary columns:

In [None]:
# Columns that survive the trim; every other column is left behind.
column_to_keep = ['id']

# Select just those columns (all rows) into a new DataFrame.
snomed_updated = snomed.loc[:, column_to_keep]

# Preview the first 11 rows of the trimmed table.
print(snomed_updated.head(n=11))

### Number of SNOMED codes:

In [None]:
# The first entry of .shape is the row count of the trimmed table.
num_rows = snomed_updated.shape[0]
print("Number of SNOMED:", num_rows)

### From https://documentation.uts.nlm.nih.gov/scripts/crosswalk.py

### 06-27 Update: The code below works (from: https://documentation.uts.nlm.nih.gov/scripts/search-terms.py)

### This step processes the entire dataset, using the "Description" column to produce a CSV file of matching CUIs:
#### (this will take a while)

In [None]:
import csv
import os
from math import ceil

import pandas as pd
import requests

def _iter_cui_rows(full_url, apikey, snomed_code, timeout):
    """Yield one result row per search hit for *snomed_code*, page by page.

    Pages are requested starting at page 1 until the service returns an
    empty result list. Only hits that carry a ``rootSource`` field are
    yielded. Request, HTTP-status, JSON and missing-key errors propagate
    to the caller.
    """
    page = 1
    while True:
        query = {'string': snomed_code, 'apiKey': apikey, 'pageNumber': page}
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(full_url, params=query, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf-8'
        items = r.json()['result']['results']

        if not items:
            # An empty first page means the search matched nothing at all;
            # an empty later page just marks the end of the results.
            if page == 1:
                print(f'No results found for SNOMED code: {snomed_code}\n')
            return

        print(f"Results for SNOMED code: {snomed_code}, page {page}\n")

        for result in items:
            if 'rootSource' not in result:
                continue
            cui = result['ui']
            source_vocab = result['rootSource']
            print(f'SNOMED code: {snomed_code}')
            print(f'CUI: {cui}')
            print(f'Source Vocabulary: {source_vocab}')
            print('\n')

            yield {
                'SNOMED Code': snomed_code,
                'CUI': cui,
                'Source Vocabulary': source_vocab,
            }

        print('*********')
        page += 1


def retrieve_cui(apikey, version, snomed_codes, batch_size,
                 output_path='SNOMED_CUI_FINAL_PART2.csv', timeout=30):
    """Search the UMLS REST API for each entry and save the hits as CSV.

    Every entry of *snomed_codes* is sent as the ``string`` parameter of
    ``https://uts-ws.nlm.nih.gov/rest/search/<version>``, following the
    result pages until an empty page comes back. Each hit that has a
    ``rootSource`` field becomes one output row.

    A failure while processing one entry is reported and that entry is
    skipped; rows already collected for it are kept and the remaining
    entries are still processed.

    Args:
        apikey: UMLS API key, sent as the ``apiKey`` query parameter.
        version: UMLS version segment of the URL (e.g. ``"current"``).
        snomed_codes: Sliceable sequence of search strings.
        batch_size: Number of entries per processing batch; must be >= 1.
        output_path: Destination of the CSV file. Defaults to the file
            name this function has always written.
        timeout: Per-request timeout in seconds, passed to ``requests``.

    Returns:
        The ``pandas.DataFrame`` that was written, with the columns
        ``SNOMED Code``, ``CUI`` and ``Source Vocabulary``.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    full_url = "https://uts-ws.nlm.nih.gov" + "/rest/search/" + version
    result_list = []
    failed_codes = []

    total_batches = ceil(len(snomed_codes) / batch_size)

    for batch in range(total_batches):
        start_index = batch * batch_size
        batch_codes = snomed_codes[start_index:start_index + batch_size]

        for snomed_code in batch_codes:
            # The error boundary is one entry, not one batch: a single bad
            # request must not discard the rest of the batch.
            try:
                # Append row by row so pages fetched before a failure stay.
                for row in _iter_cui_rows(full_url, apikey, snomed_code, timeout):
                    result_list.append(row)
            except Exception as except_error:
                # Deliberately broad: this is a long best-effort job and any
                # per-entry failure should be reported, not abort the run.
                message = str(except_error)
                if apikey:
                    # HTTP error messages contain the request URL, which
                    # carries the API key as a query parameter.
                    message = message.replace(str(apikey), '<redacted>')
                print(f'Lookup failed for SNOMED code {snomed_code}: {message}')
                failed_codes.append(snomed_code)

    if failed_codes:
        print(f'{len(failed_codes)} SNOMED code(s) could not be fully processed.')

    # Convert the collected rows to a DataFrame
    result_df = pd.DataFrame(result_list, columns=['SNOMED Code', 'CUI', 'Source Vocabulary'])

    # Save the DataFrame to a CSV file
    result_df.to_csv(output_path, index=False)
    print("CSV file saved successfully.")
    return result_df

# UMLS API key and version.
# The key is a credential, so it is read from the UMLS_API_KEY environment
# variable instead of being stored in this file.
# NOTE(review): any key that was previously committed here should be rotated.
apikey = os.environ.get("UMLS_API_KEY")
if not apikey:
    raise RuntimeError("Set the UMLS_API_KEY environment variable to your UMLS API key.")
version = "current"

# Path to CSV file containing SNOMED codes
file_path = r"C:\Users\sungh\SNOMED2.csv"
# newline='' is what the csv module asks for when it is given a file object.
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file)
    # Take the first column of every row; blank lines come back as empty
    # rows and are skipped instead of raising IndexError.
    # NOTE(review): if the file has a header row, that header is also sent
    # as a query -- confirm against the actual file.
    snomed_codes = [row[0] for row in reader if row]

batch_size = 200  # Number of codes to process in each batch

# Call retrieve_cui function
retrieve_cui(apikey, version, snomed_codes, batch_size)