In [8]:
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Register tqdm's `progress_apply` on pandas objects.
# (Nothing in this cell calls `progress_apply`; the registration is harmless here.)
tqdm.pandas()

# File paths, relative to the notebook's working directory.
# data_path holds the per-category input files; output_path receives the merged CSV.
data_path = "data/"
output_path = "output/"
output_file_path = os.path.join(output_path, "merged_output_tdf.csv")

# Create the output directory up front so the final to_csv() cannot fail on a
# missing folder; exist_ok=True makes re-running the cell safe.
os.makedirs(output_path, exist_ok=True)

# Category codes. Each code selects the input file "<code>_inter.csv" under
# data_path and is also stored in the "category" column of the merged output.
categories = ['chd', 'cold', 'depr', 'diab', 'lung', 'pneu']

# TF-IDF Keyword Extraction Function
def extract_tfidf_keywords(text_series, top_n=10):
    """Return the top-scoring TF-IDF terms of every document as a joined string.

    A TfidfVectorizer (English stop words, unigrams and bigrams, max_df=0.9,
    min_df=2) is fitted on the whole series, so the scores of one row depend on
    every other row passed in.

    Args:
        text_series: pandas Series of documents. Missing values are treated as
            empty strings.
        top_n: Maximum number of keywords kept per document.

    Returns:
        list[str]: One entry per row of ``text_series``, in order. Each entry
        is the row's keywords joined with ", ", highest score first; rows with
        no scored term yield "".

    NOTE(review): errors raised by ``fit_transform`` (for example when no term
    survives the min_df/max_df pruning) are not handled here and propagate to
    the caller.
    """
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_df=0.9, min_df=2)

    # Keep the matrix sparse: only the stored entries of a row can have a
    # non-zero score, so there is no need to expand each row to vocabulary size.
    tfidf_matrix = vectorizer.fit_transform(text_series.fillna('')).tocsr()
    feature_names = vectorizer.get_feature_names_out()

    indptr = tfidf_matrix.indptr
    indices = tfidf_matrix.indices
    data = tfidf_matrix.data

    keywords_list = []
    for row in range(tfidf_matrix.shape[0]):
        start, stop = indptr[row], indptr[row + 1]
        cols = indices[start:stop]
        scores = data[start:stop]

        # Drop any explicitly stored zeros, mirroring the "score > 0" filter.
        positive = scores > 0
        cols = cols[positive]
        scores = scores[positive]

        # Highest score first; equal scores are ordered by feature index so the
        # result is reproducible from run to run.
        order = np.lexsort((cols, -scores))[:top_n]
        keywords_list.append(", ".join(feature_names[cols[order]]))

    return keywords_list

# Process CSV data and extract keywords
def process_category_to_jsonl():
    """Merge the per-category input files, add TF-IDF keywords, and save one CSV.

    Despite its name, this function writes a CSV file (``output_file_path``),
    not JSONL. It reads the module-level ``categories``, ``data_path`` and
    ``output_file_path``.

    For every category the tab-separated file ``<category>_inter.csv`` is
    loaded. A file that cannot be read is reported and skipped so the remaining
    categories are still processed. Keywords are extracted from the
    ``text_all_patient`` column of the merged data.

    Returns:
        None. Prints a message and returns early when no file could be loaded
        or when an expected column is missing while building the output.
    """
    # 'gender' and 'pregnancy situation' use the nullable integer dtype: with a
    # plain ``int`` a single missing value makes read_csv raise, which would
    # silently drop the whole category and leave the fillna(0) below unreachable.
    column_dtypes = {
        'pregnancy situation': 'Int64',
        'gender': 'Int64',
        'age': float,
        'height': float,
        'weight': float,
        'duration of illness': float
    }

    all_dfs = []

    # Load every category file, tagging its rows with the category code.
    for category in categories:
        input_file = os.path.join(data_path, f"{category}_inter.csv")
        try:
            df = pd.read_csv(input_file, sep='\t', dtype=column_dtypes)
            df["category"] = category
            all_dfs.append(df)
        except Exception as e:
            # Best effort: report the failure and continue with the other files.
            print(f"Error reading {input_file}: {e}")

    if not all_dfs:
        print("No data loaded. Exiting.")
        return

    df = pd.concat(all_dfs, ignore_index=True)

    # Extract keywords from `text_all_patient`
    df["keywords"] = extract_tfidf_keywords(df["text_all_patient"])

    # Build the output frame in one step; the dict order fixes the column order.
    try:
        out_df = pd.DataFrame({
            'description': df['text_all_patient'].fillna(""),
            # Missing codes default to 0, then go back to a plain integer column.
            'gender': df['gender'].fillna(0).astype(int),
            'age': df['age'].fillna(0.0),
            'height': df['height'].fillna(0.0),
            'weight': df['weight'].fillna(0.0),
            'pregnancy situation': df['pregnancy situation'].fillna(0).astype(int),
            'duration of illness': df['duration of illness'].fillna(0.0),
            'category': df['category'],
            'disease': df['disease_tag'].fillna(""),
            'keywords': df['keywords'],
        })
    except Exception as e:
        print(f"Error during data processing: {e}")
        return

    # Save final results
    out_df.to_csv(output_file_path, index=False, encoding='utf-8')
    print(f"Final merged results saved: {output_file_path}")

# Main workflow
def main():
    """Entry point of the cell: run the merge-and-export pipeline once."""
    process_category_to_jsonl()

# When the cell is executed in the notebook this guard passes (the captured
# output below shows the pipeline ran), so the export starts immediately.
if __name__ == "__main__":
    main()


Final merged results saved: output/merged_output_tdf.csv


In [3]:
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Register tqdm's `progress_apply` on pandas objects; it is used below to show
# a progress bar while the combined descriptions are built.
tqdm.pandas()

# File paths, relative to the notebook's working directory.
# data_path holds the per-category input files; output_path receives the merged CSV.
data_path = "data/"
output_path = "output/"
output_file_path = os.path.join(output_path, "merged_output_tdf_new.csv")

# Create the output directory up front so the final to_csv() cannot fail on a
# missing folder; exist_ok=True makes re-running the cell safe.
os.makedirs(output_path, exist_ok=True)

# Category codes. Each code selects the input file "<code>_inter.csv" under
# data_path and is also stored in the "category" column of the merged output.
categories = ['chd', 'cold', 'depr', 'diab', 'lung', 'pneu']

# TF-IDF Keyword Extraction Function
def extract_tfidf_keywords(text_series, top_n=10):
    """Return the top-scoring TF-IDF terms of every document as a joined string.

    A TfidfVectorizer (English stop words, unigrams and bigrams, max_df=0.9,
    min_df=2) is fitted on the whole series, so the scores of one row depend on
    every other row passed in.

    Args:
        text_series: pandas Series of documents. Missing values are treated as
            empty strings.
        top_n: Maximum number of keywords kept per document.

    Returns:
        list[str]: One entry per row of ``text_series``, in order. Each entry
        is the row's keywords joined with ", ", highest score first; rows with
        no scored term yield "".

    NOTE(review): errors raised by ``fit_transform`` (for example when no term
    survives the min_df/max_df pruning) are not handled here and propagate to
    the caller.
    """
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_df=0.9, min_df=2)

    # Keep the matrix sparse: only the stored entries of a row can have a
    # non-zero score, so there is no need to expand each row to vocabulary size.
    tfidf_matrix = vectorizer.fit_transform(text_series.fillna('')).tocsr()
    feature_names = vectorizer.get_feature_names_out()

    indptr = tfidf_matrix.indptr
    indices = tfidf_matrix.indices
    data = tfidf_matrix.data

    keywords_list = []
    for row in range(tfidf_matrix.shape[0]):
        start, stop = indptr[row], indptr[row + 1]
        cols = indices[start:stop]
        scores = data[start:stop]

        # Drop any explicitly stored zeros, mirroring the "score > 0" filter.
        positive = scores > 0
        cols = cols[positive]
        scores = scores[positive]

        # Highest score first; equal scores are ordered by feature index so the
        # result is reproducible from run to run.
        order = np.lexsort((cols, -scores))[:top_n]
        keywords_list.append(", ".join(feature_names[cols[order]]))

    return keywords_list

# Function to combine structured data into textual description
def combine_descriptions(row):
    """Merge a row's structured fields and free text into one description.

    The structured fields are rendered as short "Label: value" sentences and
    placed in front of the patient's own text, e.g.
    "Gender: male. Age: 30.0 years. I have a cough."

    Args:
        row: Mapping-like record (a DataFrame row or a dict) with the keys
            'text_all_patient', 'gender', 'age', 'height', 'weight',
            'pregnancy situation' and 'duration of illness'.

    Returns:
        str: The combined, whitespace-stripped description. When no structured
        field qualifies, the stripped free text is returned unchanged.

    Field rules:
        * 'gender': 0 -> "male", 1 -> "female"; any other value is omitted.
        * 'pregnancy situation': 0 -> "not pregnant", 1 -> "pregnant"; any
          other value is omitted.
        * 'age', 'height', 'weight', 'duration of illness': included only when
          greater than 0.
        * Missing values (None / NaN / NA) are always omitted.
    """
    raw_text = row['text_all_patient']
    # str() keeps a non-string but non-null cell from crashing on .strip().
    base_text = str(raw_text).strip() if pd.notnull(raw_text) else ""

    parts = []

    def add_flag(column, label, when_zero, when_one):
        # Binary code -> wording; the null check keeps NA comparisons from raising.
        value = row[column]
        if pd.notnull(value) and value in (0, 1):
            parts.append(f"{label}: {when_zero if value == 0 else when_one}")

    def add_measure(column, label, unit):
        # Numeric measurement; zero and negative values are ignored.
        value = row[column]
        if pd.notnull(value) and value > 0:
            parts.append(f"{label}: {value} {unit}")

    # The call order below fixes the order of the sentences in the output.
    add_flag('gender', 'Gender', 'male', 'female')
    add_measure('age', 'Age', 'years')
    add_measure('height', 'Height', 'cm')
    add_measure('weight', 'Weight', 'kg')
    add_flag('pregnancy situation', 'Pregnancy situation', 'not pregnant', 'pregnant')
    add_measure('duration of illness', 'Duration of illness', 'days')

    if not parts:
        return base_text

    # Terminate the free text exactly once; text that already ends a sentence
    # is left alone so no doubled punctuation appears.
    if base_text and not base_text.endswith(('.', '!', '?')):
        base_text += '.'

    # Close the structured prefix with its own period before the free text.
    additional_info = ". ".join(parts)
    return f"{additional_info}. {base_text}".strip()

# Process CSV data and extract keywords
def process_category_to_jsonl():
    """Merge all category files, derive keywords and descriptions, write a CSV.

    Reads the module-level ``categories``, ``data_path`` and
    ``output_file_path``. Each category's tab-separated ``<category>_inter.csv``
    is loaded and tagged with its category code; unreadable files are reported
    and skipped. The result is saved as CSV (not JSONL, despite the name).

    Returns:
        None. Returns early, after printing a message, when nothing was loaded.
    """
    read_dtypes = {
        'pregnancy situation': int,
        'gender': int,
        'age': float,
        'height': float,
        'weight': float,
        'duration of illness': float
    }

    # Collect one frame per readable category file.
    frames = []
    for category in categories:
        input_file = os.path.join(data_path, f"{category}_inter.csv")
        try:
            frame = pd.read_csv(input_file, sep='\t', dtype=read_dtypes)
            frame["category"] = category
            frames.append(frame)
        except Exception as e:
            print(f"Error reading {input_file}: {e}")

    if len(frames) == 0:
        print("No data loaded. Exiting.")
        return

    merged = pd.concat(frames, ignore_index=True)

    # Keywords come from the raw patient text, not from the combined description.
    merged["keywords"] = extract_tfidf_keywords(merged["text_all_patient"])

    # Row-wise build of the combined description, with a tqdm progress bar.
    merged["description"] = merged.progress_apply(combine_descriptions, axis=1)

    output_columns = [
        'description', 'category', 'disease_tag', 'keywords',
        'age', 'height', 'weight', 'duration of illness',
        'gender', 'pregnancy situation',
    ]
    text_defaults = {
        'description': '',
        'disease': '',
        'category': '',
        'keywords': ''
    }

    # Select, rename and fill in a single chain; only the text columns get a default.
    out_df = (
        merged[output_columns]
        .rename(columns={'disease_tag': 'disease'})
        .fillna(text_defaults)
    )

    out_df.to_csv(output_file_path, index=False, encoding='utf-8')
    print(f"Final merged results saved: {output_file_path}")

# Main workflow
def main():
    """Entry point of the cell: run the merge-and-export pipeline once."""
    process_category_to_jsonl()

# When the cell is executed in the notebook this guard passes (the captured
# output below shows "start" followed by the pipeline's messages).
if __name__ == "__main__":
    # Marker printed before the pipeline starts.
    print('start')
    main()

start


100%|█████████████████████████████████████████████████████████████████████████| 30739/30739 [00:01<00:00, 21093.37it/s]


Final merged results saved: output/merged_output_tdf_new.csv
