# ICD-10-CM and CUI (Latest Version)

### 2023-06-27

ICD-10-CM codes downloaded from https://hcup-us.ahrq.gov/toolssoftware/ccsr/dxccsr.jsp

In [None]:
# Load the ICD-10-CM code table from the CSV file in the working directory
import pandas as pd
icd10cm = pd.read_csv(filepath_or_buffer="icd10cm.csv")

In [None]:
# Map the original CSV headers to the shorter names used in the rest of the notebook
new_column_names = {
    'ICD-10-CM CODE': 'ICD10CM',
    'ICD-10-CM CODE DESCRIPTION': 'Description',
}

# Apply the mapping and keep the renamed table under the same name
icd10cm = icd10cm.rename(columns=new_column_names)

#### Drop unnecessary columns:

In [None]:
# Specify columns to keep
columns_to_keep = ['ICD10CM', 'Description']

# Create a new, independent DataFrame with only the desired columns.
# The explicit .copy() matters: later cells assign new values into
# icd10cm_updated, and assigning into a selection of another DataFrame
# raises pandas' SettingWithCopyWarning and may not behave as intended.
icd10cm_updated = icd10cm[columns_to_keep].copy()

# Print the first 11 rows of the modified DataFrame
print(icd10cm_updated.head(11))

### Number of ICD-10-CM Codes: 

In [None]:
# The row count of the table is the number of ICD-10-CM codes it holds
num_rows = icd10cm_updated.shape[0]
print("Number of ICD-10-CM Codes:", num_rows)

In [None]:
# Remove apostrophes from ICD-10-CM codes.
# Work on an own copy so the assignment below never writes into a slice of
# another DataFrame (which triggers SettingWithCopyWarning).
icd10cm_updated = icd10cm_updated.copy()
# regex=False: the apostrophe is a literal character; being explicit avoids
# depending on the default of `regex`, which differs between pandas versions.
icd10cm_updated['ICD10CM'] = icd10cm_updated['ICD10CM'].str.replace("'", "", regex=False)

In [None]:
# Show the first 11 rows to check the cleaned codes
print(icd10cm_updated.head(n=11))

#### Add the decimal point to format ICD-10-CM codes:

In [None]:
def add_decimal(icd_code):
    """Insert the decimal point after the third character of an ICD-10-CM code.

    Args:
        icd_code: Code without a decimal point, e.g. ``"A000"``.

    Returns:
        The code with a ``"."`` after the third character (``"A00.0"``).
        Codes of three characters or fewer, codes that already contain a
        ``"."``, and non-string values (such as NaN for a missing cell) are
        returned unchanged.
    """
    # Missing cells arrive from pandas as float NaN; len() would fail on them.
    if not isinstance(icd_code, str):
        return icd_code
    # Nothing to insert for a bare 3-character category, and inserting again
    # into an already formatted code would produce a double dot ("A00..0").
    if len(icd_code) <= 3 or '.' in icd_code:
        return icd_code
    return icd_code[:3] + '.' + icd_code[3:]

# Build the 'ICD10CM_WITH_DECIMAL' column by formatting every code in 'ICD10CM'
icd10cm_updated['ICD10CM_WITH_DECIMAL'] = [
    add_decimal(raw_code) for raw_code in icd10cm_updated['ICD10CM']
]

In [None]:
# Keep everything except the undotted 'ICD10CM' column in a new DataFrame
final_icd10cm = icd10cm_updated.drop(columns=['ICD10CM'])

# Preview the first five rows of the result
print(final_icd10cm.head(5))

### Sample of resulting CSV file, including CUI: (sample_results_updated.csv)

### From https://documentation.uts.nlm.nih.gov/scripts/crosswalk.py

### 06-27 Update: The code below works (from: https://documentation.uts.nlm.nih.gov/scripts/search-terms.py)

### Final step: Updated to process the codes in batches to prevent memory issues

In [None]:
import requests
import pandas as pd
from math import ceil

def _iter_cui_rows(full_url, apikey, icd_code, timeout):
    """Yield one ``{'ICD-10-CM', 'CUI'}`` row per search hit for *icd_code*.

    Requests result pages 1, 2, ... from *full_url* until a page comes back
    empty. Request and parsing errors propagate to the caller.
    """
    page = 0
    while True:
        page += 1
        query = {'string': icd_code, 'apiKey': apikey, 'pageNumber': page}
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(full_url, params=query, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf-8'
        items = r.json()['result']['results']

        # An empty page means there are no (more) results for this code.
        if not items:
            if page == 1:
                print(f'No results found for ICD code: {icd_code}\n')
            return

        print(f"Results for ICD code: {icd_code}, page {page}\n")

        for result in items:
            cui = result['ui']
            print(f'ICD-10-CM: {icd_code}')
            print(f'CUI: {cui}')
            print('\n')
            yield {'ICD-10-CM': icd_code, 'CUI': cui}

        print('*********')


def retrieve_cui(apikey, version, icd_codes, batch_size, *, timeout=30,
                 output_path='ICD10CM_CUI_FINAL.csv'):
    """Query the UMLS search endpoint for each ICD code and save the CUIs to CSV.

    Every code is sent as the ``string`` parameter to
    ``https://uts-ws.nlm.nih.gov/rest/search/<version>``; the ``ui`` of every
    returned result is recorded next to the code. Progress is printed as the
    codes are processed.

    NOTE(review): the query carries no source or input-type filter, so the
    recorded CUIs are whatever the search endpoint returns for the code
    string -- confirm this is the intended code-to-CUI mapping.

    Args:
        apikey: UMLS API key sent with every request.
        version: UMLS release segment of the URL, e.g. ``"current"``.
        icd_codes: Iterable of ICD code strings (list, pandas Series, ...).
        batch_size: Number of codes handled per batch; must be positive.
        timeout: Seconds to wait for each HTTP request.
        output_path: Destination of the resulting CSV file.

    Returns:
        None. The results are written to *output_path* with the columns
        ``ICD-10-CM`` and ``CUI``.

    Raises:
        ValueError: If *batch_size* is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    uri = "https://uts-ws.nlm.nih.gov"
    content_endpoint = "/rest/search/" + version
    full_url = uri + content_endpoint
    result_list = []

    # Materialize the input so len() and positional slicing work for any
    # iterable, independent of a pandas index.
    icd_codes = list(icd_codes)
    total_batches = ceil(len(icd_codes) / batch_size)

    for batch in range(total_batches):
        start_index = batch * batch_size
        end_index = (batch + 1) * batch_size
        batch_codes = icd_codes[start_index:end_index]

        for icd_code in batch_codes:
            # Isolate failures per code: one bad request must not silently
            # skip the remaining codes of the batch. Rows already collected
            # for the failing code are kept.
            try:
                for row in _iter_cui_rows(full_url, apikey, icd_code, timeout):
                    result_list.append(row)
            except (requests.RequestException, KeyError, ValueError) as except_error:
                print(f'Failed to retrieve ICD code {icd_code}: {except_error}')

    # Convert result_list to DataFrame
    result_df = pd.DataFrame(result_list, columns=['ICD-10-CM', 'CUI'])

    # Save DataFrame to CSV file
    result_df.to_csv(output_path, index=False)
    print("CSV file saved successfully.")

# Set UMLS API key and version.
# The key is a secret: read it from the environment instead of storing it in
# the notebook, where it would be shared with everyone who sees the file.
import os

apikey = os.environ.get("UMLS_API_KEY")
if not apikey:
    raise RuntimeError(
        "Set the UMLS_API_KEY environment variable to your UMLS API key "
        "before running this cell."
    )
version = "current"

# ICD codes (with decimal point) taken from the prepared DataFrame
icd_codes = final_icd10cm['ICD10CM_WITH_DECIMAL']
batch_size = 200  # Number of codes to process in each batch

# Call retrieve_cui function
retrieve_cui(apikey, version, icd_codes, batch_size)