This notebook contains code for inferring whether the disease mentioned in a patient's question is the common cold or something else. The goal is to check, based on the stances, whether there is any gender bias in doctors' responses to common cold queries.

## Load libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.matcher import PhraseMatcher

from spacy.matcher import Matcher
from google.colab import drive

import os


## Load `spaCy`'s language model and define common cold uni+bi-grams

In [None]:
# Small pre-trained English spaCy pipeline; used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")

# Vocabulary of symptoms expected in common-cold questions.
# NOTE: most of these keywords were generated using ChatGPT.

# Single-word symptoms.
common_cold_symptoms = [
    "cold", "cough", "sneeze", "fever",
    "headache", "congestion", "fatigue",
    # Multi-word candidates that are currently disabled:
    #   "Runny or stuffy nose", "sore throat", "common cold", "Mild headache",
    #   "fatigue", "Mild body aches", "Watery eyes", "Mild chest discomfort"
]

# Two-word symptom phrases.
common_cold_symptoms_bigram = [
    "runny nose", "stuffy nose", "sore throat",
    "muscle ache", "body ache", "watery eyes",
]

## Function definition:
`detect_common_cold`: counts how often the predefined common-cold keywords and bigrams occur in a patient's question

In [None]:
# Holds the compiled bigram matcher so it is built once instead of once per row.
_bigram_matcher_cache = {}


def _get_bigram_matcher():
  """Return a PhraseMatcher for `common_cold_symptoms_bigram`, building it on first use.

  The matcher is rebuilt only when the phrase list or the pipeline vocab has
  changed since it was cached (e.g. after editing and re-running the keyword cell).
  """
  phrases = tuple(common_cold_symptoms_bigram)
  cached = _bigram_matcher_cache.get("entry")
  if cached is not None and cached[0] is nlp.vocab and cached[1] == phrases:
    return cached[2]

  # attr="LOWER" makes phrase matching case-insensitive, so a sentence-initial
  # "Sore throat" is counted just like "sore throat".
  matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
  # make_doc only tokenises, which is all a LOWER-based matcher needs.
  matcher.add("BigramMatcher", [nlp.make_doc(phrase) for phrase in phrases])
  _bigram_matcher_cache["entry"] = (nlp.vocab, phrases, matcher)
  return matcher


def detect_common_cold(text):
  """Count common-cold symptom mentions in a patient's question.

  Args:
    text: The question text. Anything that is not a `str` (NaN, None, pd.NA,
      numbers coming from empty or malformed CSV cells) is treated as "no text".

  Returns:
    int: number of tokens whose lower-cased lemma is in `common_cold_symptoms`
    plus the number of case-insensitive phrase matches from
    `common_cold_symptoms_bigram`; 0 for non-string input.
  """
  # An identity check against np.nan misses None / float('nan') / pd.NA, which
  # would then crash inside nlp(); a type check covers every missing-value form.
  if not isinstance(text, str):
    return 0

  doc = nlp(text)

  # Frequency of single-word symptoms, compared on the lower-cased lemma so that
  # inflected ("coughing") and capitalised ("Fever") forms are counted too.
  keywords = set(common_cold_symptoms)
  common_cold_freq = sum(1 for token in doc if token.lemma_.lower() in keywords)

  # Frequency of two-word symptom phrases.
  common_cold_bi_freq = len(_get_bigram_matcher()(doc))

  return common_cold_freq + common_cold_bi_freq


## Run the code on all files in the folder

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
%%time

in_folder_path = '/content/drive/MyDrive/Stance-files-2/'
out_folder_path = '/content/drive/MyDrive/Stance-n-Disease/'


for filename in os.listdir(in_folder_path):
  print(str("Procesing:" + in_folder_path + filename))

  df = pd.read_csv(in_folder_path + filename)

  # Check biased response
  df['Common_cold_freq_pat'] = df['Patient'].apply(detect_common_cold)

  # Save the DataFrame as a .csv file
  df.to_csv(out_folder_path + filename)

  print(str("DONE:" + out_folder_path + filename))


Procesing:/content/drive/MyDrive/Stance-files-2/AA-diagnose_en_dataset-Stance-70000.csv
DONE:/content/drive/MyDrive/Stance-n-Disease/AA-diagnose_en_dataset-Stance-70000.csv
CPU times: user 5min 44s, sys: 775 ms, total: 5min 45s
Wall time: 5min 49s
