In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### **Load the ICD codes dataset**

In [33]:

# Read the ICD-10 code workbook from its relative path into a DataFrame.
icd_data = pd.read_excel("../data/icd/icd.xlsx")
# Preview the first five rows to confirm the sheet parsed as expected.
display(icd_data.head(5))

Unnamed: 0,CODE,SHORT DESCRIPTION (VALID ICD-10 FY2025),LONG DESCRIPTION (VALID ICD-10 FY2025),NF EXCL
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol...",
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor",
2,A009,"Cholera, unspecified","Cholera, unspecified",
3,A0100,"Typhoid fever, unspecified","Typhoid fever, unspecified",
4,A0101,Typhoid meningitis,Typhoid meningitis,


#### Display the columns in the dataset

In [13]:
# List the column labels of the raw ICD frame. As the last expression in the
# cell, the resulting Index is rendered as the cell's output.
icd_data.columns

Index(['CODE', 'SHORT DESCRIPTION (VALID ICD-10 FY2025)',
       'LONG DESCRIPTION (VALID ICD-10 FY2025)', 'NF EXCL'],
      dtype='object')

#### Display the information about the dataset

In [14]:
# Print a per-column summary of the raw ICD frame: non-null counts, dtypes
# and approximate memory usage.
icd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73427 entries, 0 to 73426
Data columns (total 4 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   CODE                                     73427 non-null  object
 1   SHORT DESCRIPTION (VALID ICD-10 FY2025)  73427 non-null  object
 2   LONG DESCRIPTION (VALID ICD-10 FY2025)   73425 non-null  object
 3   NF EXCL                                  4599 non-null   object
dtypes: object(4)
memory usage: 2.2+ MB


#### Drop the column `NF EXCL` as it is not relevant

In [18]:
# Build the working frame without the `NF EXCL` column. `drop` returns a new
# DataFrame, so the raw `icd_data` frame is left untouched.
cleaned_df = icd_data.drop(columns=["NF EXCL"])

#### Check for null values in the cleaned dataset

In [21]:
# Count missing values column by column, then derive the overall total
# from those per-column counts.
nulls_by_column = cleaned_df.isna().sum()
print("Null values per column:")
print(nulls_by_column)
print("\nTotal null values in dataset:", nulls_by_column.sum())

Null values per column:
CODE                                       0
SHORT DESCRIPTION (VALID ICD-10 FY2025)    0
LONG DESCRIPTION (VALID ICD-10 FY2025)     2
dtype: int64

Total null values in dataset: 2


#### Check for duplicate rows in the cleaned dataset

In [20]:
# Count rows that exactly repeat an earlier row; the count is reused below.
duplicate_count = cleaned_df.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)
print("Total rows in dataset:", len(cleaned_df))

# When duplicates exist, show every member of each duplicate group
# (keep=False), ordered by the first column so identical rows sit together.
if duplicate_count > 0:
    print("\nDuplicate rows:")
    first_column = cleaned_df.columns[0]
    duplicate_rows = cleaned_df[cleaned_df.duplicated(keep=False)]
    display(duplicate_rows.sort_values(by=first_column))
else:
    print("No duplicate rows found.")

Number of duplicate rows: 0
Total rows in dataset: 73427
No duplicate rows found.


#### Data Quality Summary

In [22]:
# Gather the headline quality metrics first, then report them in one place.
row_count, column_count = cleaned_df.shape
total_nulls = cleaned_df.isna().sum().sum()
total_duplicates = cleaned_df.duplicated().sum()
# deep=True measures the actual string contents of object columns, in bytes.
memory_kb = cleaned_df.memory_usage(deep=True).sum() / 1024

print("=== DATA QUALITY SUMMARY ===")
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")
print(f"Total null values: {total_nulls}")
print(f"Total duplicate rows: {total_duplicates}")
print(f"Memory usage: {memory_kb:.2f} KB")

# Shape and column labels of the cleaned frame
print(f"\nCleaned dataset shape: {cleaned_df.shape}")
print(f"Columns in cleaned dataset: {list(cleaned_df.columns)}")

=== DATA QUALITY SUMMARY ===
Total rows: 73427
Total columns: 3
Total null values: 2
Total duplicate rows: 0
Memory usage: 21868.79 KB

Cleaned dataset shape: (73427, 3)
Columns in cleaned dataset: ['CODE', 'SHORT DESCRIPTION (VALID ICD-10 FY2025)', 'LONG DESCRIPTION (VALID ICD-10 FY2025)']


#### Apply TF-IDF Vectorization

In [38]:
# Fit a TF-IDF vocabulary on the short descriptions and encode each one as a
# weighted term vector. The result is a sparse matrix with one row per ICD
# entry and one column per vocabulary term.
short_descriptions = icd_data['SHORT DESCRIPTION (VALID ICD-10 FY2025)']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(short_descriptions)

# Report (number of descriptions, vocabulary size)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (73427, 7790)


#### Test by matching user input

In [45]:
# Example free-text query to look up
user_input = "Cholera"

# Encode the query with the already-fitted vocabulary (transform only, no refit)
query_vec = vectorizer.transform([user_input])

# One similarity value per ICD description; [0] selects the single query row
sim_scores = cosine_similarity(query_vec, tfidf_matrix)[0]

# Position and value of the highest-scoring description
top_idx = sim_scores.argmax()
top_score = sim_scores[top_idx]

# Only report a match when the best score reaches the 0.5 cut-off
is_confident = top_score >= 0.5
if is_confident:
    # Look the winning row up once and read both fields from it
    best_row = icd_data.iloc[top_idx]
    top_match = best_row['SHORT DESCRIPTION (VALID ICD-10 FY2025)']
    top_code = best_row['CODE']
    print("🔍 Top Match:", top_match)
    print("🧾 ICD Code:", top_code)
    print("📊 Similarity Score:", round(top_score, 4))
else:
    print(f"❌ No confident match found. Score = {top_score:.4f}")


🔍 Top Match: Cholera, unspecified
🧾 ICD Code: A009
📊 Similarity Score: 0.9588
