# Approach (FuzzyLogic)

In [3]:
df.head()

Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen


In [None]:

import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from fuzzywuzzy import fuzz

# Load the CSV file into a DataFrame
df = pd.read_csv("disease.csv")

# Create a dictionary to map disease names to their codes
disease_name_to_code = dict(zip(df["disease"], df["diseaseid"]))

# Load the pre-trained GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# User-provided input string
user_input = "Im having a severe head fresh ache seems like Plasma cell"

# Initialize variables to keep track of the best match
best_match_ratio = 0
best_matching_disease_code = None

# Tokenize the input string
input_ids = tokenizer.encode(user_input, return_tensors="pt")

# Search for the best-matching disease name within the input string
for disease_name in disease_name_to_code:
    match_ratio = fuzz.ratio(disease_name.lower(), user_input.lower())
    if match_ratio > best_match_ratio:
        best_match_ratio = match_ratio
        best_matching_disease_code = disease_name_to_code[disease_name]

# Generate a response with the best-matching disease code
if best_matching_disease_code:
    response = f"The best-matching disease code for your symptoms is: {best_matching_disease_code}."
else:
    response = "I couldn't find a best-matching disease code for your symptoms."

# Tokenize the response
response_ids = tokenizer.encode(response, return_tensors="pt")

# Generate a model response
output = model.generate(response_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

# Extract and decode the generated response
generated_response = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated response
print("Generated Response:")
print(generated_response)


In [None]:
df.head()

# CURRENT (Max Word Approach) --> Working

In [6]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer


df = pd.read_csv("diseaseOne.csv")


disease_name_to_code = dict(zip(df["Disease"], df["Code"]))


model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


user_input = "Im having fever and i hate my Autologous blood op salvage life Plasma fresh and he is enumeration also good blood process"


max_matched_words = 0
best_matching_disease_code = None


user_input_tokens = user_input.lower().split()


for disease_name, disease_id in disease_name_to_code.items():
    disease_tokens = disease_name.lower().split()
    matched_words = len(set(user_input_tokens).intersection(disease_tokens))
    if matched_words > max_matched_words:
        max_matched_words = matched_words
        best_matching_disease_code = disease_id


if best_matching_disease_code:
    response = f"The disease code with the most matched words is: {best_matching_disease_code}."
else:
    response = "I couldn't find a matching disease code for your symptoms."


response_ids = tokenizer.encode(response, return_tensors="pt")


output = model.generate(response_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)


generated_response = tokenizer.decode(output[0], skip_special_tokens=True)


print("Generated Response:")
print(generated_response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response:
The disease code with the most matched words is: 86891.

The most common form of the disease is the "C" word. The most commonly used form is "D" or "E" in the English language. In the United States, the word "disease" is used in a number of different ways. For example, in English, it is called "cervical cancer" because it causes a "cancerous tumor" on the cervix. It is


In [4]:
df.head(467)


Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen
...,...,...
462,73650,X-ray exam of heel
463,73660,X-ray exam of toe(s)
464,73700,Ct lower extremity w/o dye
465,73701,Ct lower extremity w/dye


# A Bit Of Data Cleaning

In [102]:
import pandas as pd
df = pd.read_csv('diseaseOne.csv')

In [103]:
#Dropping whole columns that has ALL Nan values
import pandas as pd
df = df.dropna(axis=1, how='all')


In [104]:
#Dropping Rows with Nan value (Whole row with whole na)
import pandas as pd
df = df.dropna()


In [106]:
# Removing operators/symbols from df
import pandas as pd
import re

def remove_specific_characters(text):
  # -<>/ NOT REMOVED as they provide meaning to the data
    cleaned_text = re.sub(r'[$%#&@!]', '', str(text))
    return cleaned_text


df = df.applymap(remove_specific_characters)


In [107]:

import pandas as pd
# Insert column names to the dataset to provide it meaning
df = pd.read_csv("diseaseOne.csv", header=None)
df.columns = ["Code", "Disease"]  




In [108]:

#Removed first 2 unmeaningful rows from DS
import pandas as pd

df = df.drop([0, 0]).reset_index(drop=True)



In [109]:
df.head(5)

Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen


In [110]:
#Saved/Update the csv
df.to_csv("diseaseOne.csv", index=False) 

In [111]:
gt=pd.read_csv('diseaseOne.csv')

In [121]:
gt.head(917)

Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen
...,...,...
912,77401,Radiation treatment delivery
913,77402,Radiation treatment delivery
914,77407,Radiation treatment delivery
915,77412,Radiation treatment delivery


In [120]:
gt.shape

(1101, 2)

# Search In Model & CSV (With Max Word)  --> Working

In [7]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer


df = pd.read_csv("diseaseOne.csv")

disease_name_to_code = dict(zip(df["Disease"], df["Code"]))


model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


user_input = "Im having fever and i hate my Cell enumeration phys interp life Plasma fresh frozen and he is enumeration also good blood process"


max_matched_words = 0
best_matching_disease_code = None


user_input_tokens = user_input.lower().split()


for disease_name, disease_id in disease_name_to_code.items():
    disease_tokens = disease_name.lower().split()
    matched_words = len(set(user_input_tokens).intersection(disease_tokens))
    if matched_words > max_matched_words:
        max_matched_words = matched_words
        best_matching_disease_code = disease_id


if not best_matching_disease_code:
    model_input = "Do you have information about " + user_input
    input_ids = tokenizer.encode(model_input, return_tensors="pt")
    model_response = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)
    model_response_text = tokenizer.decode(model_response[0], skip_special_tokens=True)

   
    for disease_name in disease_name_to_code:
        if disease_name.lower() in model_response_text.lower():
            best_matching_disease_code = disease_name_to_code[disease_name]
            break


if best_matching_disease_code:
    response = f"The disease code with the most matched words is: {best_matching_disease_code}."
else:
    response = "I couldn't find a matching disease code for your symptoms."


print("Generated Response:")
print(response)


Generated Response:
The disease code with the most matched words is: 86153.


# Training GPT-2 (Finetuning With Dataset)

In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load your CSV data using pandas
df = pd.read_csv("diseaseOne.csv")  # Replace "diseaseOne.csv" with the path to your CSV file

# Extract the text data and labels from the CSV
data = df["Disease"].tolist()  # Replace "Disease" with the name of the text column in your CSV
labels = df["Code"].tolist()  # Replace "Code" with the name of the label column in your CSV

# Tokenize the data using the model's tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
data = [tokenizer.encode(text) for text in data]

# Flatten the tokenized data into a single list
flat_data = [item for sublist in data for item in sublist]

# Create a dataset file with tokenized data
with open("text_dataset.txt", "w", encoding="utf-8") as file:
    for text_ids in data:
        text = tokenizer.decode(text_ids)
        file.write(text + "\n")

# Create a PyTorch dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="text_dataset.txt",
    block_size=128,
)

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Adjust this as needed
    per_device_train_batch_size=4,  # Adjust this as needed
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-gpt2")
tokenizer.save_pretrained("fine-tuned-gpt2")




Step,Training Loss


('fine-tuned-gpt2\\tokenizer_config.json',
 'fine-tuned-gpt2\\special_tokens_map.json',
 'fine-tuned-gpt2\\vocab.json',
 'fine-tuned-gpt2\\merges.txt',
 'fine-tuned-gpt2\\added_tokens.json')

# Testing GPT-2 With Disease Name As Input

In [12]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("fine-tuned-gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("fine-tuned-gpt2")

# User-provided input disease name
user_input_disease_name = "Autologous blood op salvage "

# Generate a response
input_ids = tokenizer.encode(user_input_disease_name, return_tensors="pt")
output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

# Extract and decode the generated response
generated_response = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated response
print("Generated Response:")
print(generated_response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response:
Autologous blood op salvage ileum

Liver transplant
Cerebrovascular transplant ial
Maternal transplant (cov)
Fetal transplant/corticosteroid
Tumor transplantation
Nuclear transplant transplant or cortical transplant, or
Neoplastic transplant(s) 
Vascular implant
Ectopic transplant of the brain
Brain transplant with a brain transplant and/or
Bilateral brain implant ia
Gastrointestinal transplant in the abdomen
Hematocrit transplant to the liver
Oral transplant for the kidney
Pulmonary transplant on the thorax
Radiologic transplant at the lung
Surgical transplant after a pulmonary transplant


In [7]:
df = pd.read_csv('diseaseOne.csv')

In [14]:
df.head()

Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen


# Traning Bert For Prediction Now

In [16]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

# Load your CSV data using pandas
df = pd.read_csv("diseaseOne.csv")  # Replace with the path to your CSV file

# Extract the text data and labels from the CSV
disease_names = df["Disease"].tolist()
disease_codes = df["Code"].tolist()

# Tokenize the disease names
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs = tokenizer(disease_names, truncation=True, padding=True, return_tensors="pt")

# Use LabelEncoder to encode labels as integers
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(disease_codes)

# Define the fine-tuned BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-bert",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Adjust as needed
    per_device_train_batch_size=4,  # Adjust as needed
    save_steps=10_000,
    save_total_limit=2,
)

# Define a function to properly collate data
def collate_fn(data):
    input_ids, labels = zip(*data)
    return {
        'input_ids': torch.stack(input_ids),
        'labels': torch.tensor(labels)
    }

# Initialize a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=list(zip(tokenized_inputs["input_ids"], labels)),
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-bert")
tokenizer.save_pretrained("fine-tuned-bert")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss


('fine-tuned-bert\\tokenizer_config.json',
 'fine-tuned-bert\\special_tokens_map.json',
 'fine-tuned-bert\\vocab.txt',
 'fine-tuned-bert\\added_tokens.json')

In [17]:
df.head()

Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen


# Testing Bert With Disease Name As Input

In [6]:
from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Load your CSV data using pandas
df = pd.read_csv("diseaseOne.csv")  # Replace with the path to your CSV file

# Extract the text data and labels from the CSV
disease_names = df["Disease"].tolist()
disease_codes = df["Code"].tolist()

# Train or load your BERT model here

# Initialize a label encoder
label_encoder = LabelEncoder()
label_encoder.fit(disease_codes)

# Define a function to predict the disease code
def predict_disease_code(disease_name):
    # Tokenize the input text
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    inputs = tokenizer(disease_name, return_tensors="pt")

    # Forward pass through your BERT model
    outputs = model(**inputs)
    logits = outputs.logits

    # Get the predicted label
    predicted_label = torch.argmax(logits, dim=1).item()

    # Reverse the label encoding to get the disease code
    predicted_disease_code = label_encoder.inverse_transform([predicted_label])[0]

    return predicted_disease_code

# Example usage:
disease_name = "Heart first pass multiple"
predicted_code = predict_disease_code(disease_name)
print(f"Disease Name: {disease_name}")
print(f"Predicted Disease Code: {predicted_code}")


Disease Name: Heart first pass multiple
Predicted Disease Code: A9570


In [7]:
df.head(667)


Unnamed: 0,Code,Disease
0,86152,Cell enumeration id
1,86153,Cell enumeration phys interp
2,86890,Autologous blood process
3,86891,Autologous blood op salvage
4,86927,Plasma fresh frozen
...,...,...
662,78472,Gated heart planar single
663,78473,Gated heart multiple
664,78481,Heart first pass single
665,78483,Heart first pass multiple
