## Load the dataset

In [None]:
import pandas as pd
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Fetch the NLTK data used further down: the Punkt tokenizer models for
# word_tokenize and the stop-word lists for stopwords.words. Recent NLTK
# releases load the tokenizer from 'punkt_tab' instead of 'punkt', so both are
# requested to keep the notebook working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def load_reports(directory):
    """Load every ``.txt`` report stored one level below *directory*.

    Each immediate sub-directory of *directory* is treated as one patient and
    its name is used as the ``patient_id``. Every entry inside it whose name
    ends with ``.txt`` is read as UTF-8 text. Anything else (loose files in
    *directory*, non-``.txt`` files) is ignored.

    Args:
        directory: Path of the folder that contains the per-patient folders.

    Returns:
        A ``pandas.DataFrame`` with the columns ``patient_id`` and ``report``,
        one row per report file, ordered by folder name and then file name.
        The two columns are present even when no report is found.
    """
    patient_reports = []
    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and everything derived from it) is reproducible between runs.
    for folder in sorted(os.listdir(directory)):
        patient_folder = os.path.join(directory, folder)
        if not os.path.isdir(patient_folder):
            continue
        for file_name in sorted(os.listdir(patient_folder)):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(patient_folder, file_name)
            # Distinct name for the handle so the loop variable is not shadowed.
            with open(file_path, 'r', encoding='utf-8') as handle:
                patient_reports.append(
                    {'patient_id': folder, 'report': handle.read()}
                )
    # Explicit columns keep the schema stable when the list is empty.
    return pd.DataFrame(patient_reports, columns=['patient_id', 'report'])

# Root folder scanned for per-patient sub-folders; the path is relative to the
# current working directory.
directory = 'sample-patient'
# DataFrame with one row per .txt report ('patient_id' and 'report' columns).
reports_df = load_reports(directory)

# Text Preprocessing

In [None]:
# Text preprocessing
# Translation table that deletes every ASCII punctuation character; it never
# changes, so it is built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Holds the English stop-word set after its first use.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalize a report into a space-separated string of content words.

    The text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenized with NLTK's ``word_tokenize`` and
    filtered against the NLTK English stop-word list.

    Args:
        text: Raw report text.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing is left).
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    # NOTE(review): apostrophes are removed here, so contractions such as
    # "don't" become "dont" before the stop-word filter - confirm this is
    # the intended behavior.
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords (set is cached; rebuilding it per report is wasteful)
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Apply preprocessing to each report and store the result in a new
# 'cleaned_report' column; the original 'report' column is left untouched.
reports_df['cleaned_report'] = reports_df['report'].apply(preprocess_text)

# Similarity Calculation

In [None]:
# Turn the cleaned reports into TF-IDF vectors (one row per report).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(reports_df['cleaned_report'])

# Pairwise cosine similarity of every report against every other report;
# with a single argument the matrix is compared with itself.
similarity_matrix = cosine_similarity(tfidf_matrix)

# Label both axes with the patient id of the corresponding report.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=reports_df['patient_id'],
    columns=reports_df['patient_id'],
)