In [None]:
import pandas as pd
# Read the raw conversation export. The python parser engine is used together
# with on_bad_lines='skip', so malformed rows are dropped instead of aborting.
data = pd.read_csv("gpt-4.csv", on_bad_lines='skip', engine='python')
print(data.head())

                                                data  \
0  This 60-year-old male was hospitalized due to ...   
1  A 39-year-old man was hospitalized due to an i...   
2  One week after a positive COVID-19 result this...   
3  This 69-year-old male was admitted to the ICU ...   
4  This 57-year-old male was admitted to the ICU ...   

                                        conversation  
0  Doctor: Good morning, how are you feeling toda...  
1  Doctor: Hello, how are you feeling today?\nPat...  
2  Doctor: Hello, how are you feeling today?\nPat...  
3  Doctor: Hello, how are you feeling today?\nPat...  
4  Doctor: Good morning, how are you feeling toda...  


In [None]:
!pip install dash jupyter-dash


Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl.metadata (3.6 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting ansi2html (from jupyter-dash)
  Downloading ansi2html-1.9.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython->jupyter-dash)
  Downloading je

In [None]:
!pip install dash plotly wordcloud




In [None]:
!pip install dash-bootstrap-components

Collecting dash-bootstrap-components
  Downloading dash_bootstrap_components-1.7.1-py3-none-any.whl.metadata (17 kB)
Downloading dash_bootstrap_components-1.7.1-py3-none-any.whl (229 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.3/229.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dash-bootstrap-components
Successfully installed dash-bootstrap-components-1.7.1


In [None]:
from dash import dcc, html, Input, Output, Dash
import pandas as pd
import plotly.express as px
from collections import Counter
from wordcloud import WordCloud
import io
import base64
import dash_bootstrap_components as dbc

# Load the data.
# The dashboard should still start when the CSV is missing or unreadable, so a
# failed load falls back to an empty frame. The reason is reported rather than
# silently discarded, otherwise an empty dashboard is impossible to diagnose.
try:
    data = pd.read_csv("gpt-4.csv", engine='python', on_bad_lines='skip')
except Exception as e:
    print(f"Could not load gpt-4.csv ({type(e).__name__}: {e}); starting with an empty dataset.")
    data = pd.DataFrame({"conversation": [], "other_columns": []})  # Empty placeholder

# Create the Dash application, styled with the stock Bootstrap theme
app = Dash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
)

# Keyword lists that pre-populate the symptom and disease text inputs
default_symptoms = [
    "fever", "cough", "pain", "shortness of breath",
    "headache", "fatigue", "dizziness", "nausea",
]
default_diseases = [
    "COVID-19", "diabetes", "hypertension", "asthma",
    "malaria", "pneumonia", "flu", "cancer",
]

# Layout
# Page structure, top to bottom: title, the two keyword inputs, the statistics
# and insights cards, the two bar charts, and the word-cloud image. Component
# ids declared here are the ones the dashboard callback reads from / writes to.
# Style dictionaries use camelCased CSS property names throughout.
app.layout = dbc.Container([
    html.H1("Doctor-Patient Conversations Dashboard", style={'textAlign': 'center', 'marginBottom': '30px', 'color': '#2c3e50'}),

    # Input fields for symptoms and diseases
    dbc.Row([
        dbc.Col([
            html.Label("Enter Symptoms (comma-separated):", style={'fontWeight': 'bold', 'marginBottom': '10px'}),
            dcc.Input(
                id="symptom-input", type="text",
                placeholder="e.g., fever, cough",
                value=", ".join(default_symptoms),
                style={"width": "100%", "padding": "10px", "borderRadius": "5px", "border": "1px solid #ccc"}
            ),
        ], width=6),
        dbc.Col([
            html.Label("Enter Diseases (comma-separated):", style={'fontWeight': 'bold', 'marginBottom': '10px'}),
            dcc.Input(
                id="disease-input", type="text",
                placeholder="e.g., COVID-19, diabetes",
                value=", ".join(default_diseases),
                style={"width": "100%", "padding": "10px", "borderRadius": "5px", "border": "1px solid #ccc"}
            ),
        ], width=6),
    ], style={'marginBottom': '20px'}),

    # Summary statistics and findings
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("Key Statistics", style={'backgroundColor': '#3498db', 'color': 'white', 'fontWeight': 'bold'}),
                dbc.CardBody(id="statistics", style={'padding': '20px'})
            ], style={'marginBottom': '20px', 'boxShadow': '0 4px 8px rgba(0,0,0,0.1)', 'border': 'none'})
        ], width=6),
        dbc.Col([
            dbc.Card([
                dbc.CardHeader("Key Insights", style={'backgroundColor': '#2ecc71', 'color': 'white', 'fontWeight': 'bold'}),
                dbc.CardBody(id="findings-summary", style={'padding': '20px'})
            ], style={'marginBottom': '20px', 'boxShadow': '0 4px 8px rgba(0,0,0,0.1)', 'border': 'none'})
        ], width=6),
    ]),

    # Charts
    dbc.Row([
        dbc.Col([dcc.Graph(id="symptom-chart")], width=6),
        dbc.Col([dcc.Graph(id="disease-chart")], width=6),
    ], style={'marginBottom': '20px'}),

    # Word cloud
    dbc.Row([
        dbc.Col([
            html.H3("Disease Word Cloud", style={'textAlign': 'center', 'marginTop': '20px', 'color': '#34495e'}),
            html.Div(
                [html.Img(id="wordcloud", style={'width': '100%', 'borderRadius': '10px', 'boxShadow': '0 4px 8px rgba(0,0,0,0.1)'})],
                style={'textAlign': 'center', 'marginTop': '20px'}
            ),
        ], width=12),
    ]),
], fluid=True)


def _parse_keywords(raw):
    """Split a comma-separated input value into stripped, de-duplicated keywords.

    De-duplication ignores case and keeps the first spelling the user typed, so
    a keyword entered twice is not counted twice per conversation.
    """
    seen = set()
    keywords = []
    # A cleared text input can arrive as None rather than "".
    for part in (raw or "").split(","):
        keyword = part.strip()
        if keyword and keyword.lower() not in seen:
            seen.add(keyword.lower())
            keywords.append(keyword)
    return keywords


def _find_keywords(text, keywords):
    """Return the keywords (as typed) that occur in `text`, ignoring case."""
    if not isinstance(text, str):  # e.g. NaN for a missing conversation
        return []
    lowered = text.lower()
    # Lowercase the keyword as well: comparing "COVID-19" as typed against
    # lowercased text can never match.
    return [keyword for keyword in keywords if keyword.lower() in lowered]


# Callback for updating dashboard
@app.callback(
    [Output("statistics", "children"),
     Output("symptom-chart", "figure"),
     Output("disease-chart", "figure"),
     Output("findings-summary", "children"),
     Output("wordcloud", "src")],
    [Input("symptom-input", "value"),
     Input("disease-input", "value")]
)
def update_dashboard(symptom_input, disease_input):
    """Recompute every dashboard panel from the two keyword inputs.

    Args:
        symptom_input: Value of the "symptom-input" field, a comma-separated
            string of symptom keywords (may be None or empty).
        disease_input: Value of the "disease-input" field, a comma-separated
            string of disease keywords (may be None or empty).

    Returns:
        A 5-tuple in the order of the declared outputs: statistics list,
        symptom bar chart, disease bar chart, findings list, and the
        word-cloud image as a base64 PNG data URI ("" when there is nothing
        to draw).
    """
    # Imported locally: the empty-data fallback needs graph_objects, which the
    # notebook's import cell does not bring into scope.
    import plotly.graph_objects as go

    if data.empty:
        return "No data loaded.", go.Figure(), go.Figure(), "No insights available.", ""

    # Process user inputs
    symptom_keywords = _parse_keywords(symptom_input)
    disease_keywords = _parse_keywords(disease_input)

    # Count matches without writing columns back to the shared module-level
    # frame; a callback should not modify state outside its own scope.
    conversations = data['conversation']
    symptom_counts = Counter(
        keyword for text in conversations for keyword in _find_keywords(text, symptom_keywords)
    )
    disease_counts = Counter(
        keyword for text in conversations for keyword in _find_keywords(text, disease_keywords)
    )

    total_conversations = len(data)

    def build_summary():
        # Each output gets its own component instance.
        return html.Ul([
            html.Li(f"Total Conversations: {total_conversations}"),
            html.Li(f"Unique Symptoms Identified: {len(symptom_counts)}"),
            html.Li(f"Unique Diseases Identified: {len(disease_counts)}")
        ])

    # Statistics
    stats = build_summary()

    # Charts
    symptom_fig = px.bar(
        x=list(symptom_counts.keys()), y=list(symptom_counts.values()),
        labels={"x": "Symptom", "y": "Frequency"}, title="Most Common Symptoms"
    )
    disease_fig = px.bar(
        x=list(disease_counts.keys()), y=list(disease_counts.values()),
        labels={"x": "Disease", "y": "Frequency"}, title="Most Common Diseases"
    )

    # Findings summary
    # NOTE(review): this card shows the same three figures as the statistics
    # card; confirm whether distinct insights were intended here.
    findings_summary = build_summary()

    # Word Cloud
    # Feed the counts directly so multi-word or hyphenated keywords stay
    # intact, and skip rendering when nothing matched (WordCloud raises on
    # empty input).
    if disease_counts:
        cloud = WordCloud(width=800, height=400).generate_from_frequencies(dict(disease_counts))
        buffer = io.BytesIO()
        cloud.to_image().save(buffer, format='PNG')
        img_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        wordcloud_src = f"data:image/png;base64,{img_b64}"
    else:
        wordcloud_src = ""

    return stats, symptom_fig, disease_fig, findings_summary, wordcloud_src


if __name__ == "__main__":
    # Dash.run is the supported entry point; run_server is its deprecated alias.
    # NOTE(review): debug=True together with host='0.0.0.0' serves the
    # development server (and its debugger) on every network interface —
    # confirm that exposure is intended for this environment.
    app.run(debug=True, port=8056, host='0.0.0.0')


<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification

# Ensure necessary NLTK resources are downloaded.
# Two tokenizer data packages are fetched ('punkt' and 'punkt_tab'); presumably
# they back the word_tokenize import above — TODO confirm, no call to it is
# visible in this notebook. The second call is the cell's final expression, so
# its return value is what the notebook displays as the cell output.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# @title processed dataset for risk and urgency levels

# Load the dataset
df = pd.read_csv("gpt-4.csv", engine='python', on_bad_lines='skip')
# Work on an explicit copy so the label columns added later can never alias
# the raw frame (wrapping a DataFrame in pd.DataFrame(...) leaves that unclear).
data = df.copy()

# Define more comprehensive keywords for risk and urgency levels.
# Each tier is a list of phrases searched for in the conversation text.
# NOTE(review): the classifier compares these against lowercased conversation
# text, so an entry containing uppercase letters (e.g. "ICU") only matches if
# the matcher also normalises the keyword's case.
low_risk_keywords = ["mild", "better", "normal", "routine", "no symptoms"]
medium_risk_keywords = ["fever", "cough", "body aches", "headache", "fatigue", "chills"]
high_risk_keywords = ["difficulty breathing", "chest pain", "emergency", "severe", "critical", "intense", "unconscious", "high fever"]

non_urgent_keywords = ["check-up", "follow-up", "routine", "monitor"]
urgent_keywords = ["severe", "noticeable symptoms", "trouble breathing", "headache worsening", "nausea", "dizziness"]
critical_keywords = ["life-threatening", "ICU", "critical", "can't move", "unresponsive", "hospitalization"]

# Load the tokenizer for the DistilBERT model; the same tokenizer instance is
# handed to the sentiment pipeline and reused for text truncation.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", tokenizer=tokenizer)

# Helper function to truncate long texts using the tokenizer
def truncate_text(text, max_length=512):
    """Cut `text` down to at most `max_length` tokens and return it as a string.

    The text is encoded with the module-level `tokenizer` (truncation enabled),
    and the ids of the first encoded sequence are decoded back to text with
    special tokens left out.

    NOTE(review): the result comes from a decode step, so it is not guaranteed
    to be a character-for-character prefix of the input — confirm before
    relying on exact spelling, casing or punctuation spacing.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    first_sequence_ids = encoded["input_ids"][0]
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

def _mentions_any(text, keywords):
    """Return True if any keyword starts at a word boundary in `text`, ignoring case.

    Both sides are lowercased so mixed-case keywords such as "ICU" can match.
    The leading word boundary keeps short keywords from firing inside unrelated
    words ("icu" in "difficult", "normal" in "abnormal") while still accepting
    inflected forms ("severely", "coughing").
    """
    import re  # local import: the notebook's import cells do not load re

    lowered = text.lower()
    return any(
        re.search(r"(?<!\w)" + re.escape(keyword.lower()), lowered)
        for keyword in keywords
    )


def _level_from_score(score, high_label, mid_label, low_label):
    """Map a 0-6 tier score onto one of three labels (>=5 high, 3-4 mid, else low)."""
    if score >= 5:
        return high_label
    if score >= 3:
        return mid_label
    return low_label


# Function to classify risk, urgency, and sentiment (added more detailed classifications)
def classify_risk_urgency(conversation):
    """Label one conversation with a risk level, an urgency level and a sentiment.

    Args:
        conversation: Conversation text. Missing values, non-string values and
            blank strings are treated as "no conversation".

    Returns:
        A tuple ``(risk, urgency, sentiment)``:
        - risk: "High Risk", "Medium Risk", "Low Risk" or "Unknown"
        - urgency: "Critical", "Urgent", "Non-Urgent" or "Unknown"
        - sentiment: the label reported by `sentiment_model`, or "Neutral"
          when there is no text or no model result.

    Scoring: each keyword tier contributes its weight once if any of its
    keywords is mentioned (top tier 3, middle tier 2, bottom tier 1).
    NOTE(review): with these weights the top label requires both the top and
    the middle tier to be present (3 + 2); a top-tier keyword alone scores 3
    and yields the middle label — confirm this is the intended policy.
    """
    # Handle empty or missing conversations
    if not isinstance(conversation, str) or conversation.strip() == "":
        return "Unknown", "Unknown", "Neutral"

    # Keywords are searched in the original text: truncation goes through a
    # tokenizer decode step that can alter the text, and the token limit is a
    # constraint of the sentiment model only.
    risk_score = sum(
        weight
        for weight, keywords in (
            (3, high_risk_keywords),
            (2, medium_risk_keywords),
            (1, low_risk_keywords),
        )
        if _mentions_any(conversation, keywords)
    )
    risk = _level_from_score(risk_score, "High Risk", "Medium Risk", "Low Risk")

    urgency_score = sum(
        weight
        for weight, keywords in (
            (3, critical_keywords),
            (2, urgent_keywords),
            (1, non_urgent_keywords),
        )
        if _mentions_any(conversation, keywords)
    )
    urgency = _level_from_score(urgency_score, "Critical", "Urgent", "Non-Urgent")

    # Truncate only the model input to avoid token length issues
    sentiment_result = sentiment_model(truncate_text(conversation, max_length=512))
    sentiment = sentiment_result[0]['label'] if sentiment_result else "Neutral"

    return risk, urgency, sentiment

# Apply the classification function to the dataset.
# Each call returns a (risk, urgency, sentiment) tuple; the tuples are turned
# into the three label columns in one step, aligned on the frame's index,
# instead of wrapping every row's result in its own pd.Series.
labels = [classify_risk_urgency(text) for text in data["conversation"]]
data[["Risk", "Urgency", "Sentiment"]] = pd.DataFrame(
    labels, columns=["Risk", "Urgency", "Sentiment"], index=data.index
)

# Save the labeled dataset to a new CSV file
data.to_csv("labeled_dataset_with_sentiment1.csv", index=False)

# Display the labeled dataset
print(data.head())




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cpu


                                                data  \
0  This 60-year-old male was hospitalized due to ...   
1  A 39-year-old man was hospitalized due to an i...   
2  One week after a positive COVID-19 result this...   
3  This 69-year-old male was admitted to the ICU ...   
4  This 57-year-old male was admitted to the ICU ...   

                                        conversation extracted_symptoms  \
0  Doctor: Good morning, how are you feeling toda...     [fever, cough]   
1  Doctor: Hello, how are you feeling today?\nPat...     [fever, cough]   
2  Doctor: Hello, how are you feeling today?\nPat...                 []   
3  Doctor: Hello, how are you feeling today?\nPat...            [cough]   
4  Doctor: Good morning, how are you feeling toda...     [fever, cough]   

  extracted_diseases       Risk     Urgency Sentiment  
0                 []  High Risk  Non-Urgent  POSITIVE  
1                 []  High Risk  Non-Urgent  POSITIVE  
2                 []   Low Risk      Urgent 

In [None]:
!pip install pandas plotly




In [None]:
import pandas as pd
# Load the labeled data.
# The file name matches the one the classification cell writes
# ("labeled_dataset_with_sentiment1.csv"); the name without the trailing "1"
# is never produced by this notebook.
data = pd.read_csv("labeled_dataset_with_sentiment1.csv", engine='python', on_bad_lines='skip')
print(data.head())

                                                data  \
0  This 60-year-old male was hospitalized due to ...   
1  A 39-year-old man was hospitalized due to an i...   
2  One week after a positive COVID-19 result this...   
3  This 69-year-old male was admitted to the ICU ...   
4  This 57-year-old male was admitted to the ICU ...   

                                        conversation       Risk     Urgency  \
0  Doctor: Good morning, how are you feeling toda...  High Risk  Non-Urgent   
1  Doctor: Hello, how are you feeling today?\nPat...  High Risk  Non-Urgent   
2  Doctor: Hello, how are you feeling today?\nPat...   Low Risk      Urgent   
3  Doctor: Hello, how are you feeling today?\nPat...  High Risk    Critical   
4  Doctor: Good morning, how are you feeling toda...   Low Risk      Urgent   

  Sentiment  
0  POSITIVE  
1  POSITIVE  
2  POSITIVE  
3  NEGATIVE  
4  NEGATIVE  


**Dashboard for processed dataset**

In [None]:
import dash
from dash import dcc, html
import dash_bootstrap_components as dbc
import pandas as pd
import plotly.express as px

# Load the labeled dataset
data = pd.read_csv("labeled_dataset_with_sentiment1.csv")

# Aggregate before plotting so every bar is a single mark whose height is the
# number of conversations. Passing the raw rows makes each row its own
# unit-height mark, and grouped bars do not add those marks up.
risk_counts = data.groupby("Risk").size().reset_index(name="count")
urgency_counts = data.groupby("Urgency").size().reset_index(name="count")
risk_urgency_counts = data.groupby(["Risk", "Urgency"]).size().reset_index(name="count")

# Generate plots
risk_distribution = px.bar(
    risk_counts,
    x="Risk",
    y="count",
    title="Distribution of Risk Levels",
    labels={"Risk": "Risk Level", "count": "Number of Conversations"},
    color="Risk",
)

urgency_distribution = px.bar(
    urgency_counts,
    x="Urgency",
    y="count",
    title="Distribution of Urgency Levels",
    labels={"Urgency": "Urgency Level", "count": "Number of Conversations"},
    color="Urgency",
)

combined_distribution = px.bar(
    risk_urgency_counts,
    x="Risk",
    y="count",
    color="Urgency",
    barmode="group",
    title="Combined Distribution of Risk and Urgency Levels",
    labels={"count": "Number of Conversations", "Risk": "Risk Level"},
)

# With only `names` given, the pie chart sizes each slice by row count.
sentiment_distribution = px.pie(
    data,
    names="Sentiment",
    title="Sentiment Distribution",
)

# Initialize the Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Layout for the dashboard
# Static page: a title followed by three rows holding the pre-built figures
# (risk and urgency side by side, then the combined chart, then sentiment).
# Style dictionaries use camelCased CSS property names.
app.layout = dbc.Container([
    html.H1("Doctor-Patient Conversations Dashboard",
            style={'textAlign': 'center', 'marginBottom': '30px', 'color': '#2c3e50'}),
    dbc.Row([
        dbc.Col([
            html.H4("Distribution of Risk Levels", style={'textAlign': 'center'}),
            dcc.Graph(figure=risk_distribution),
        ], width=6),
        dbc.Col([
            html.H4("Distribution of Urgency Levels", style={'textAlign': 'center'}),
            dcc.Graph(figure=urgency_distribution),
        ], width=6),
    ], className="mb-4"),
    dbc.Row([
        dbc.Col([
            html.H4("Combined Risk and Urgency Levels", style={'textAlign': 'center'}),
            dcc.Graph(figure=combined_distribution),
        ], width=12),
    ], className="mb-4"),
    dbc.Row([
        dbc.Col([
            html.H4("Sentiment Distribution", style={'textAlign': 'center'}),
            dcc.Graph(figure=sentiment_distribution),
        ], width=12),
    ]),
], fluid=True)

# Run the Dash app
# Dash.run is the supported entry point; run_server is its deprecated alias.
if __name__ == "__main__":
    app.run(debug=True)


<IPython.core.display.Javascript object>