# Introduction

The project aims to develop natural language processing and machine learning techniques to 
automate the identification of patients with active Inflammatory Bowel Disease (IBD). The project 
will analyze text-based data from electronic health records (EHR) to identify keywords and multiword sequences associated with active IBD. The output of this process will be used to predict 
the likelihood that a patient has active disease and to adapt the company's internal processes 
so that manual review of images can be avoided for some exams.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Loading all the datasets

# Single place to point the notebook at the project data folder; every input
# file below is resolved relative to it.
DATA_DIR = Path('C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project')

# Dataset 1: exam notes
exam_notes_df = pd.read_csv(DATA_DIR / 'Exam_notes.csv')
# Dataset 2: image records for Crohn's disease (CD) exams
image_CD_df = pd.read_csv(DATA_DIR / 'Images CD.csv')
# Dataset 3: image records for ulcerative colitis (UC) exams
image_UC_df = pd.read_csv(DATA_DIR / 'Images UC.csv')

# Dataset 4: GI reviewer responses for Crohn's disease
GI_chrons_df = pd.read_excel(DATA_DIR / "GI responses - Crohn's disease.xlsx")

# Dataset 5: GI reviewer responses for ulcerative colitis
GI_ulcerative_df = pd.read_excel(DATA_DIR / "GI responses - Ulcerative collitis.xlsx")

## Exam Notes

In [3]:
# Function to parse exam notes
def parse_exam_notes(html_text):
    """Return the Item/Text values found after the first <h3> heading of a note.

    The note is parsed as HTML. Every <li> inside the <ul> siblings that
    follow the first <h3> is inspected; entries starting with ``Item:`` or
    ``Text:`` contribute the part after the first ``': '``. The collected
    parts are joined with single spaces.

    Parameters
    ----------
    html_text : str or bytes
        Raw HTML of one exam note. Any other value (for example the NaN that
        pandas uses for a missing cell) is treated as an empty note.

    Returns
    -------
    str
        Space-joined Item/Text values, or '' when the note is missing,
        blank, or contains no <h3> heading.
    """
    # Guard first: missing cells reach this function as NaN/None, which are
    # not markup, and a blank note has nothing to extract.
    if not isinstance(html_text, (str, bytes)) or not html_text.strip():
        return ''

    soup = BeautifulSoup(html_text, 'html.parser')

    # Only the first heading's content is returned, so look it up directly
    # instead of collecting every heading into a dict keyed by heading text
    # (where a later heading with the same text would overwrite the first).
    first_heading = soup.find('h3')
    if first_heading is None:
        return ''

    collected = []
    # NOTE(review): find_next_siblings('ul') yields every later <ul> sibling,
    # including lists that sit under subsequent headings - confirm that this
    # is the intended scope.
    for ul_tag in first_heading.find_next_siblings('ul'):
        for li_tag in ul_tag.find_all('li'):
            li_text = li_tag.text.strip()
            # 'Item:' and 'Text:' entries are handled identically.
            if li_text.startswith(('Item:', 'Text:')):
                label_and_value = li_text.split(': ', 1)
                # Skip entries with a label but no ': '-separated value.
                if len(label_and_value) == 2:
                    collected.append(label_and_value[1])

    return ' '.join(collected)


# Replace each raw note in the 'Exam Notes' column with the text returned by parse_exam_notes
exam_notes_df['Exam Notes'] = exam_notes_df['Exam Notes'].map(parse_exam_notes)

In [4]:
# Switch to underscore-style column names (no spaces)
exam_notes_df = exam_notes_df.rename(
    columns={
        "Exam ID": "Exam_ID",
        "Exam Notes": "Exam_Notes",
    }
)

In [5]:
# Remove rows that are identical across all columns
exam_notes_df = exam_notes_df.drop_duplicates()

In [6]:
# Number of distinct exam IDs; dropna=False counts a missing ID as one value
exam_notes_df['Exam_ID'].nunique(dropna=False)

5161

In [7]:
# (rows, columns) after de-duplication; compare the row count with the distinct Exam_ID count
exam_notes_df.shape

(5161, 2)

## Image CD

In [8]:
# Use the same key column name (Exam_ID) as the other tables
image_CD_df = image_CD_df.rename(
    columns={"Exam ID": "Exam_ID"},
)

In [9]:
# Merge the free-text comment and the location text into one field.
# The space separator keeps the last word of the comment from fusing with the
# first word of the location; .str.strip() removes the stray space left when
# either side is missing.
image_CD_df['Image_Info'] = (
    image_CD_df['Image Comments'].fillna('')
    + ' '
    + image_CD_df['Image Location Text'].fillna('')
).str.strip()
image_CD_df = image_CD_df.drop(['Image Comments', 'Image Location Text'], axis=1)
# drop_duplicates() returns a new frame, so assign it; otherwise the repeated
# rows are still present when the texts are joined per exam.
image_CD_df = image_CD_df.drop_duplicates()
image_CD_df

Unnamed: 0,Exam_ID,Image_Info
0,1BABA39D733748FF820090BCB6FB504E,Ascending Colon
2,1BABA39D733748FF820090BCB6FB504E,Cecum
3,44DDBA011EDD4E3DBA2E86803FB418DA,Transverse Colon
4,44DDBA011EDD4E3DBA2E86803FB418DA,Descending Colon
8,44DDBA011EDD4E3DBA2E86803FB418DA,Descending Colon -lipoma
...,...,...
21719,bc4a275088d44b9e82999cc8cbe2df02,Transverse Colon
21720,bc4a275088d44b9e82999cc8cbe2df02,Splenic Flexure
21721,bc4a275088d44b9e82999cc8cbe2df02,Sigmoid Colon
21722,bc4a275088d44b9e82999cc8cbe2df02,Rectum


In [10]:
# Collapse to one row per exam: space-join every Image_Info string of that exam
image_CD_df = (
    image_CD_df
    .groupby('Exam_ID')['Image_Info']
    .agg(' '.join)
    .reset_index()
)

In [11]:
# Remove rows that are identical across all columns
# (after the groupby each Exam_ID should already appear once, so this is a safeguard)
image_CD_df = image_CD_df.drop_duplicates()

In [12]:
# Number of distinct exam IDs; dropna=False counts a missing ID as one value
image_CD_df['Exam_ID'].nunique(dropna=False)

2146

In [13]:
# (rows, columns) of the per-exam table; compare the row count with the distinct Exam_ID count
image_CD_df.shape

(2146, 2)

## Image UC

In [14]:
# Use the same key column name (Exam_ID) as the other tables
image_UC_df = image_UC_df.rename(
    columns={"Exam ID": "Exam_ID"},
)

In [15]:
# Merge the free-text comment and the location text into one field.
# The space separator keeps the last word of the comment from fusing with the
# first word of the location; .str.strip() removes the stray space left when
# either side is missing.
image_UC_df['Image_Info'] = (
    image_UC_df['Image Comments'].fillna('')
    + ' '
    + image_UC_df['Image Location Text'].fillna('')
).str.strip()
image_UC_df = image_UC_df.drop(['Image Comments', 'Image Location Text'], axis=1)
# drop_duplicates() returns a new frame, so assign it; otherwise the repeated
# rows are still present when the texts are joined per exam.
image_UC_df = image_UC_df.drop_duplicates()
image_UC_df

Unnamed: 0,Exam_ID,Image_Info
0,287276D88BAF4924ADCC80CBC85AF21F,Descending Colon
4,287276D88BAF4924ADCC80CBC85AF21F,Cecum
7,cf499e833b50464c9fe1371fc3e708b0,Appendiceal Orifice
8,cf499e833b50464c9fe1371fc3e708b0,Ileo-cecal Valve
9,cf499e833b50464c9fe1371fc3e708b0,Rectum
...,...,...
25920,4974d2dc90c34213b7d20b42100a3e3a,Transverse Colon
25923,4974d2dc90c34213b7d20b42100a3e3a,Descending Colon
25924,4974d2dc90c34213b7d20b42100a3e3a,Sigmoid Colon
25925,4974d2dc90c34213b7d20b42100a3e3a,Rectum


In [16]:
# Collapse to one row per exam: space-join every Image_Info string of that exam
image_UC_df = (
    image_UC_df
    .groupby('Exam_ID')['Image_Info']
    .agg(' '.join)
    .reset_index()
)

In [17]:
# Remove rows that are identical across all columns
# (after the groupby each Exam_ID should already appear once, so this is a safeguard)
image_UC_df = image_UC_df.drop_duplicates()

In [18]:
# Number of distinct exam IDs; dropna=False counts a missing ID as one value
image_UC_df['Exam_ID'].nunique(dropna=False)

2823

In [19]:
# (rows, columns) of the per-exam table; compare the row count with the distinct Exam_ID count
image_UC_df.shape

(2823, 2)

## GI Response Crohn's

In [20]:
# Discard the survey questions that are not kept for the cleaned output
GI_chrons_df.drop(
    columns=[
        'Do you have any other comments or feedback you would like to add? ',
        "Is this patient's disease confined to the ileum?",
        'Is this patient post-surgical?',
        'Does the quality of the images hinder you from accurately scoring this exam?',
        'Do you agree with the diagnosis of CD?',
        'Holistically speaking, do you judge this patient to be a likely candidate for inclusion in a clinical trial?',
        'Why, in brief?',
        'What SES-CD score would you give this exam?',
    ],
    inplace=True,
)

In [21]:
# Give the remaining columns short, underscore-style names
GI_chrons_df = GI_chrons_df.rename(
    columns={
        "Exam ID": "Exam_ID",
        "How would you categorize this patient's disease severity?": "Disease_Severity",
        "Did you detect ulcers in your review of this patient's case?": "Ulcer",
    }
)

In [22]:
# Remove rows that are identical across all columns
GI_chrons_df = GI_chrons_df.drop_duplicates()

In [23]:
# (rows, columns) after column selection and de-duplication
GI_chrons_df.shape

(396, 3)

## GI Response Ulcerative

In [24]:
# Discard the survey questions that are not kept for the cleaned output
GI_ulcerative_df.drop(
    columns=[
        'Is this patient post-surgical?',
        'Does the quality of the images hinder you from accurately scoring this exam?',
        'What MES would you give this exam?',
        'Please assess the extent of colonic involvement:',
        'Do you have any other comments or feedback you would like to add? ',
        'Do you agree with the diagnosis of UC?',
        'Holistically speaking, do you judge this patient to be a likely candidate for inclusion in a clinical trial?',
        'Why, in brief?',
        "Did you detect erosions in your review of this patient's case?",
    ],
    inplace=True,
)

In [25]:
# Give the remaining columns short, underscore-style names
GI_ulcerative_df = GI_ulcerative_df.rename(
    columns={
        "Exam ID": "Exam_ID",
        "How would you categorize this patient's disease severity?": "Disease_Severity",
        "Did you detect ulcers in your review of this patient's case?": "Ulcer",
    }
)

In [26]:
# Remove rows that are identical across all columns
GI_ulcerative_df = GI_ulcerative_df.drop_duplicates()

In [27]:
# (rows, columns) after column selection and de-duplication
GI_ulcerative_df.shape

(398, 3)

In [28]:
# Destination path for the cleaned exam-notes CSV file
exam_notes_csv_file_path = 'C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project/cleaned_exam_notes.csv'

# Write the DataFrame to the CSV file; index=False leaves out the row-index column
exam_notes_df.to_csv(exam_notes_csv_file_path, index=False)

In [29]:
# Destination path for the cleaned CD image-info CSV file
image_cd_csv_file_path = 'C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project/cleaned_image_cd.csv'

# Write the DataFrame to the CSV file; index=False leaves out the row-index column
image_CD_df.to_csv(image_cd_csv_file_path, index=False)

In [30]:
# Destination path for the cleaned UC image-info CSV file
image_UC_csv_file_path = 'C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project/cleaned_image_uc.csv'

# Write the DataFrame to the CSV file; index=False leaves out the row-index column
image_UC_df.to_csv(image_UC_csv_file_path, index=False)

In [31]:
# Destination path for the cleaned Crohn's disease GI-response CSV file
GI_chrons_csv_file_path = 'C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project/cleaned_GI_response_chrons.csv'

# Write the DataFrame to the CSV file; index=False leaves out the row-index column
GI_chrons_df.to_csv(GI_chrons_csv_file_path, index=False)

In [32]:
# Destination path for the cleaned ulcerative colitis GI-response CSV file
GI_ulcerative_csv_file_path = 'C:/Users/neeha/OneDrive/Documents/Northeastern University Assignmnets/Experiencial Learning Project/project/cleaned_GI_response_ulcerative.csv'

# Write the DataFrame to the CSV file; index=False leaves out the row-index column
GI_ulcerative_df.to_csv(GI_ulcerative_csv_file_path, index=False)