<a href="https://colab.research.google.com/github/veselm73/BP/blob/main/diabetes_LLM_preproces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install pandas tqdm scikit-learn nltk




In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
import re
import nltk

# Fetch the NLTK stop-word corpus that FastTFIDFKeywordExtractor reads via
# stopwords.words("english"); NLTK reports "already up-to-date" when it is present.
nltk.download("stopwords")


class FastTFIDFKeywordExtractor:
    """Shorten free-text values to their highest-scoring TF-IDF keywords.

    A TF-IDF model is fitted on the whole corpus (one document per text), and
    each text is replaced by up to ``top_n`` of its own best-scoring terms,
    skipping stop words and terms whose stem was already selected.
    """

    # Everything except ASCII letters, digits and whitespace is stripped.
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")

    def __init__(self, top_n=5, custom_stopwords=None):
        """Configure the extractor.

        Args:
            top_n: Maximum number of keywords kept per text.
            custom_stopwords: Iterable of extra words to exclude. ``None``
                selects a built-in list of generic diagnosis filler words;
                an empty iterable disables custom stop words entirely.
        """
        self.top_n = top_n
        self.tokenizer = RegexpTokenizer(r"\w+")
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words("english"))
        # Compare against None (not truthiness) so that an explicitly empty
        # collection is respected instead of silently replaced by the defaults.
        if custom_stopwords is None:
            custom_stopwords = {
                "type", "unspecified", "complication", "disease", "stage",
                "syndrome", "disorder", "with", "without", "of"
            }
        self.custom_stopwords = set(custom_stopwords)

    @staticmethod
    def _is_missing(value):
        """Return True for None / NaN / pd.NA scalars, False for anything else."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar or otherwise un-checkable values are treated as present.
            return False

    def preprocess_texts(self, texts):
        """Lower-case each text and strip everything but letters, digits and whitespace.

        Missing values (None / NaN) become the empty string rather than the
        literal words "none" / "nan", which would otherwise surface as keywords.

        Args:
            texts: Iterable of values; non-strings are converted with ``str``.

        Returns:
            List of cleaned strings, one per input item, in input order.
        """
        return [
            "" if self._is_missing(text)
            else self._NON_ALNUM.sub("", str(text).lower())
            for text in texts
        ]

    def extract_keywords_batch(self, corpus):
        """Return one space-joined keyword string per document in ``corpus``.

        Args:
            corpus: Iterable of texts; each item is treated as one document.

        Returns:
            List of strings aligned with ``corpus``. A document with no usable
            terms yields the empty string.
        """
        preprocessed = self.preprocess_texts(corpus)
        no_keywords = [""] * len(preprocessed)

        # Nothing to fit on (empty corpus or only blank/missing documents);
        # TfidfVectorizer would raise "empty vocabulary" here.
        if not any(text.strip() for text in preprocessed):
            return no_keywords

        # Fit TF-IDF. token_pattern=None states explicitly that the custom
        # tokenizer replaces the default pattern (avoids sklearn's UserWarning).
        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenizer.tokenize,
            token_pattern=None,
            stop_words="english"
        )
        try:
            tfidf_matrix = vectorizer.fit_transform(preprocessed)
        except ValueError as exc:
            # Raised when every document consists solely of stop words.
            if "empty vocabulary" in str(exc):
                return no_keywords
            raise
        feature_names = np.array(vectorizer.get_feature_names_out())

        results = []
        for i in tqdm(range(tfidf_matrix.shape[0]), desc="Extracting top keywords"):
            row = tfidf_matrix.getrow(i)
            scores = row.data
            indices = row.indices

            # Terms of this document, best TF-IDF score first.
            sorted_idx = np.argsort(scores)[::-1]
            sorted_terms = feature_names[indices[sorted_idx]]

            seen_stems = set()
            keywords = []

            for word in sorted_terms:
                # Check the budget before appending so top_n <= 0 yields nothing.
                if len(keywords) >= self.top_n:
                    break
                if word in self.stop_words or word in self.custom_stopwords:
                    continue
                # Keep only the first (highest-scoring) word of each stem family.
                stem = self.stemmer.stem(word)
                if stem in seen_stems:
                    continue
                seen_stems.add(stem)
                keywords.append(word)

            results.append(" ".join(keywords))

        return results

    def extract_and_replace(self, df, columns):
        """Return a copy of ``df`` with each listed column replaced by its keywords.

        Columns that are not present are reported with a printed message and
        skipped. Missing cells are passed on as empty text, so they produce an
        empty keyword string instead of the keyword "nan". ``df`` itself is
        not modified.

        Args:
            df: Input DataFrame.
            columns: Names of the text columns to shorten.

        Returns:
            A new DataFrame with the same rows as ``df``.
        """
        df_cleaned = df.copy()
        for col in columns:
            if col not in df_cleaned.columns:
                print(f"Column '{col}' not found.")
                continue

            print(f"\nExtracting keywords from: {col}")
            column = df_cleaned[col]
            # astype(str) would turn NaN into the text "nan"; map missing
            # cells to "" explicitly (works for any column dtype).
            texts = [
                "" if missing else str(value)
                for value, missing in zip(column, column.isna())
            ]
            df_cleaned[col] = self.extract_keywords_batch(texts)

        return df_cleaned


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Read the preprocessed diabetes CSV directly from its remote URL.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
file_path = "https://kmlinux.fjfi.cvut.cz/~veselm73/diabetes_preprocessed.csv"
df = pd.read_csv(file_path, low_memory=False)

In [24]:
# Diagnosis description columns to be condensed into keywords.
text_columns = [
    "diag_2_desc",
    "diag_3_desc",
    "primary_diag_desc"
]

# Keep at most five keywords per description.
extractor = FastTFIDFKeywordExtractor(top_n=5)

# Works on a copy: `df` keeps the original descriptions, `df_cleaned` holds the keywords.
df_cleaned = extractor.extract_and_replace(df, text_columns)

# Persist the shortened dataset to the working directory and report its row count.
df_cleaned.to_csv("diabetes_short_diag.csv", index=False)
print(len(df_cleaned))


Extracting keywords from: diag_2_desc


Extracting top keywords: 100%|██████████| 101766/101766 [00:24<00:00, 4194.96it/s]



Extracting keywords from: diag_3_desc


Extracting top keywords: 100%|██████████| 101766/101766 [00:15<00:00, 6412.82it/s]



Extracting keywords from: primary_diag_desc


Extracting top keywords: 100%|██████████| 101766/101766 [00:15<00:00, 6650.43it/s]


101766
