In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords, words
import string
import pandas as pd
from string import punctuation
punctuation = set(punctuation)
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import nbimporter
from scipy.stats import skew
from tabulate import tabulate
from collections import Counter
from IPython.display import display, Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

# needed to import country codes generated in the processing file because it is an input in one of our functions


# Load the NLTK `words` corpus into a set once, when this cell runs, so that
# membership tests against the English word list are hash lookups.
# NOTE(review): presumably needs the corpus downloaded beforehand
# (e.g. nltk.download('words')) — confirm for fresh environments.

english_words = set(words.words())

In [2]:
# Domain-specific words dropped by process_text in addition to NLTK's English stopwords.
_PROCESS_TEXT_EXCLUDED = frozenset({
    'department', 'health', 'public', 'food', 'drug', 'administration',
    'release', 'report', 'research', 'methodology', 'approach', 'certain',
    'energy', 'commission', 'ultimately', 'finding', 'investigation', 'also',
    'available', 'center', 'disease', 'control', 'us', 'federal', 'authority',
    'rounding', 'register', 'determine', 'absence', 'presence', 'de', 'use',
    'unless', 'work', 'article', 'editor', 'publication', 'since', 'upon',
    'many', 'meet', 'every', 'one', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'ago', 'name', 'address'
})

# Filled on first use by _process_text_resources(); empty until then.
_PROCESS_TEXT_CACHE = {}


def _process_text_resources():
    """Build (once) and return the lookup tables and lemmatizer used by process_text."""
    if not _PROCESS_TEXT_CACHE:
        # Assemble everything locally first so a failure part-way through
        # (e.g. a missing corpus) never leaves a half-filled cache behind.
        built = {
            'punctuation': frozenset(string.punctuation) | {'‘', '—', '“', '«'},
            'english_words': frozenset(words.words()),
            'stopwords': frozenset(stopwords.words("english")),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PROCESS_TEXT_CACHE.update(built)
    return _PROCESS_TEXT_CACHE


def process_text(text):
    '''Turn a free-text field into a list of lowercase, lemmatized dictionary words.

    A token produced by NLTK's word_tokenize is kept only if ALL of these hold:
      * its lowercase form is not an NLTK English stopword;
      * it contains no punctuation character (ASCII punctuation plus ‘ — “ «);
      * it is not made up solely of digits and is longer than one character;
      * its lowercase form is not in the domain-specific exclusion list;
      * its lemma (WordNetLemmatizer on the lowercase form) is in NLTK's
        `words` corpus.

    The exclusion list is matched against the lowercased token, like the
    stopword test, so capitalised occurrences ("Health", "Department") are
    removed as well.  The word list, stopword list and lemmatizer are built on
    the first call and reused afterwards instead of being rebuilt per call.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Surviving lemmas, in their original order (duplicates are kept).
    '''
    resources = _process_text_resources()
    punctuation = resources['punctuation']
    english_words = resources['english_words']
    sw = resources['stopwords']
    lemmatizer = resources['lemmatizer']

    tokens_in_dictionary = []
    for token in word_tokenize(text):
        lowered = token.lower()

        # Stopwords and anything carrying a punctuation character are dropped
        if lowered in sw or any(char in punctuation for char in token):
            continue
        # Purely numeric tokens and single characters carry no meaning here
        if token.isdigit() or len(token) <= 1:
            continue
        # Exclusion list is checked on the pre-lemmatization, lowercased form
        if lowered in _PROCESS_TEXT_EXCLUDED:
            continue

        lemma = lemmatizer.lemmatize(lowered)
        # Keep only lemmas found in NLTK's English word list
        if lemma in english_words:
            tokens_in_dictionary.append(lemma)

    return tokens_in_dictionary

In [3]:
# Unigram + bigram variant of the text processor (bigrams add local word context)
def process_text_grams(text):
    """Return the lemmatized tokens of `text` followed by their bigrams.

    Tokens come from NLTK's word_tokenize.  A token is kept when its lowercase
    form is not an English stopword and none of its characters is punctuation
    (ASCII punctuation plus ‘); kept tokens are lowercased and lemmatized.
    Each bigram is two consecutive kept tokens joined by a single space.

    Returns
    -------
    list of str
        All unigrams first, then all bigrams.
    """
    marks = set(string.punctuation) | {'‘'}

    raw_tokens = word_tokenize(text)

    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    unigrams = []
    for raw in raw_tokens:
        lowered = raw.lower()
        if lowered in english_stopwords:
            continue
        if any(char in marks for char in raw):
            continue
        unigrams.append(lemmatizer.lemmatize(lowered))

    # Consecutive pairs of the surviving tokens, rendered as "first second"
    pairs = [' '.join(pair) for pair in ngrams(unigrams, 2)]

    return unigrams + pairs

In [2]:
# Text processing for the drug-label fields
def process_label_text(text):
    """Reduce a drug-label string to its unique, lemmatized, lowercase tokens.

    Steps for string input: every ASCII punctuation character becomes a space,
    the text is tokenized with NLTK's word_tokenize, English stopwords are
    dropped, the remaining tokens are lowercased and lemmatized, duplicates
    are removed (first occurrence wins) and the literal token 'nan' is
    discarded.

    Returns
    -------
    list of str, pd.NA, or the input itself
        The token list; pd.NA when no token survives; the unchanged input when
        `text` is not a string.
    """
    # Guard clause: non-string values are handed back untouched
    if not isinstance(text, str):
        return text

    # Swap each punctuation character for a space in one pass
    to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    raw_tokens = word_tokenize(text.translate(to_space))

    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for raw in raw_tokens:
        lowered = raw.lower()
        if lowered not in english_stopwords:
            lemmas.append(lemmatizer.lemmatize(lowered))

    # dict keys keep first-seen order, so this de-duplicates without reordering
    distinct = [lemma for lemma in dict.fromkeys(lemmas) if lemma != 'nan']

    if not distinct:
        return pd.NA
    return distinct

In [1]:
# Text processing for the drug-label fields, with bigrams appended
def process_label_text_grams(text):
    """Return the unique lemmatized tokens of a drug-label string plus their bigrams.

    For string input: ASCII punctuation is replaced by spaces, the text is
    tokenized with NLTK's word_tokenize, English stopwords are dropped, tokens
    are lowercased and lemmatized, duplicates are removed (first occurrence
    wins) and the literal token 'nan' is discarded.  Bigrams are then formed
    from consecutive entries of that de-duplicated list and joined by a space.

    Returns
    -------
    list of str, pd.NA, or the input itself
        Unigrams followed by bigrams; pd.NA when no token survives; the
        unchanged input when `text` is not a string.
    """
    # Guard clause: non-string values are handed back untouched
    if not isinstance(text, str):
        return text

    # Swap each punctuation character for a space in one pass
    to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    raw_tokens = word_tokenize(text.translate(to_space))

    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for raw in raw_tokens:
        lowered = raw.lower()
        if lowered not in english_stopwords:
            lemmas.append(lemmatizer.lemmatize(lowered))

    # dict keys keep first-seen order, so this de-duplicates without reordering
    distinct = [lemma for lemma in dict.fromkeys(lemmas) if lemma != 'nan']

    if not distinct:
        return pd.NA

    # Pairs of neighbouring unique tokens, rendered as "first second"
    pairs = [' '.join(pair) for pair in ngrams(distinct, 2)]

    return distinct + pairs

In [6]:
# Create function to make unique IDs for each table
def add_sequential_index(df, index_col_name):
    """Return a copy of `df` with a 1-based sequential ID as its first column.

    The IDs are always 1, 2, ..., len(df) in row order, whatever the incoming
    index looks like (filtered, named, non-numeric).  The incoming index is
    replaced by a fresh default index; `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to number.
    index_col_name : str
        Name of the ID column to create.

    Returns
    -------
    pandas.DataFrame
        New frame with `index_col_name` inserted at position 0.

    Raises
    ------
    ValueError
        Raised by DataFrame.insert if `index_col_name` already exists in `df`.
    """
    # drop=True: build the IDs from row positions rather than from whatever
    # values (or name) the old index happened to carry
    numbered = df.reset_index(drop=True)

    # Start at 1 instead of 0
    numbered.insert(0, index_col_name, np.arange(1, len(numbered) + 1, dtype='int64'))

    return numbered

In [7]:
# Per-column summary of missing values for a dataframe
def nan_info(df):
    """Summarize how many values are missing in each column of `df`.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df` with:
          * 'column_name'     - the column label
          * 'null_count'      - number of NA values in the column
          * 'null_proportion' - that count as a percentage (0-100) of len(df)
    """
    missing_per_column = df.isna().sum()

    # Expressed as a percentage of the row count
    percent_missing = (missing_per_column / len(df)) * 100

    summary = pd.DataFrame({
        'column_name': missing_per_column.index,
        'null_count': missing_per_column.to_numpy(),
        'null_proportion': percent_missing.to_numpy(),
    })
    return summary

In [8]:
# Turn the textual 'N/A' placeholder into a real null (simplifies type conversions & plotting)
def na_to_null(df, column):
    """Replace the literal string 'N/A' in `df[column]` with NaN.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame object is returned.
    """
    with_nulls = df[column].replace('N/A', np.nan)
    df[column] = with_nulls
    return df

In [9]:
# Function to remove duplicates
def remove_duplicates(tokens):
    """Return the distinct items of `tokens`, keeping first-occurrence order.

    Building the result through dict keys (insertion-ordered) instead of a set
    makes the output order deterministic, so it is the same on every kernel
    restart regardless of string hash randomization.

    Parameters
    ----------
    tokens : iterable of hashable
        Items to de-duplicate.

    Returns
    -------
    list
        Each distinct item once, in the order it first appeared.
    """
    return list(dict.fromkeys(tokens))

In [10]:
# Function to remove duplicates and handle NaNs
def remove_duplicates_nan(tokens):
    """De-duplicate a token list, dropping the literal token 'nan'.

    Duplicates are removed through dict keys (insertion-ordered) rather than a
    set, so the surviving tokens keep their first-occurrence order and the
    result is reproducible across kernel restarts.

    Parameters
    ----------
    tokens : list or any
        Token list to clean.  Anything that is not a list is returned as-is.

    Returns
    -------
    list, pd.NA, or the input itself
        The distinct tokens; pd.NA when nothing is left after removing 'nan';
        the unchanged input when `tokens` is not a list.
    """
    if not isinstance(tokens, list):
        return tokens

    # Remove "nan" tokens if present
    kept = [token for token in tokens if token != 'nan']

    # An empty result is reported as a missing value rather than []
    if not kept:
        return pd.NA

    return list(dict.fromkeys(kept))

In [11]:
# Map a record's product types to a numeric class
def classify_product_type(product_types):
    """Return 2 for 'human otc', 1 for 'human prescription', otherwise 0.

    Membership is tested with `in`, so `product_types` may be a list of labels
    (exact element match) or a string (substring match).  When both labels are
    present, 'human otc' takes precedence.
    """
    # Ordered by priority: the first label found decides the class
    label_codes = (('human otc', 2), ('human prescription', 1))
    for label, code in label_codes:
        if label in product_types:
            return code
    return 0

In [12]:
# Process MedDRA reaction terms
# Replace spaces with periods and lowercase all letters



In [13]:
# Convert an age to years using the unit codes listed at https://open.fda.gov/apis/drug/event/searchable-fields/
def convert_to_years(age, unit):
    """Convert `age`, expressed in the unit identified by `unit`, to years.

    Recognized unit codes: 800 decade, 801 year, 802 month, 803 week,
    804 day, 805 hour.  A year is treated as 12 months, 52 weeks, 365 days
    or 365 * 24 hours.

    Returns
    -------
    number or NaN
        The age in years; NaN when `unit` is missing or not a recognized code.
    """
    # A missing unit (NaN/None) makes the age uninterpretable
    if pd.isna(unit):
        return np.nan

    # (unit code, conversion) pairs, tried in order
    conversions = (
        (800, lambda value: value * 10),          # Decade
        (801, lambda value: value),               # Year
        (802, lambda value: value / 12),          # Month
        (803, lambda value: value / 52),          # Week
        (804, lambda value: value / 365),         # Day
        (805, lambda value: value / (365 * 24)),  # Hour
    )
    for code, to_years in conversions:
        if unit == code:
            return to_years(age)

    # Unknown unit code
    return np.nan

In [14]:
# Return boxplot of character length for object columns, as well as descriptive statistics of character length
def plot_character_length(df, df_name):
    """Plot and summarize the character length of every object-dtype column.

    Each cell is converted with str() before measuring, so missing values are
    counted by the length of their string form (e.g. 'nan').  The function
    shows a horizontal boxplot (one box per column) and prints the describe()
    table of the lengths; it returns nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; only columns selected by
        select_dtypes(include=['object']) are used.
    df_name : str
        Label appended to the plot title.
    """
    # Filter for object columns
    object_cols = df.select_dtypes(include=['object'])

    # Character count per cell.  Series.map is applied column by column
    # instead of DataFrame.applymap, which newer pandas releases deprecate.
    character_counts = object_cols.apply(
        lambda col: col.map(lambda value: len(str(value)))
    )

    # Generate boxplot on an explicit Axes
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.boxplot(data=character_counts, color='hotpink', orient='h', ax=ax)
    ax.set_title(f'Number of Characters in Each Object Column - {df_name}')
    ax.set_ylabel('Column')
    ax.set_xlabel('Number of Characters')
    ax.grid(True)
    plt.show()

    # Statistics table: one row per column
    stats_table = character_counts.describe().transpose()
    print("\nDescriptive Statistics on Character Length:")
    print(stats_table)

In [15]:
# Detect unusually long strings (upper outliers) in a text column
def examine_text_outliers(series):
    """Return the entries of `series` whose string length is an upper outlier.

    An entry is an outlier when its length exceeds
    mean(length) + 2 * std(length), both computed over the whole series.

    Parameters
    ----------
    series : pandas.Series
        Series of strings (accessed through `.str`).

    Returns
    -------
    pandas.Series
        The outlying entries, with their original index labels.
    """
    lengths = series.str.len()

    # Upper bound: two standard deviations above the mean length
    threshold = lengths.mean() + 2 * lengths.std()

    return series[lengths > threshold]


In [6]:
# Reference table of country codes; its 'codes' column lists tokens to drop.
_COMPANY_COUNTRY_CODES_CSV = '../DataLibrary/DynamicReferenceCSVs/country_codes_clean.csv'

# Additional abbreviations removed from company text.
_COMPANY_EXTRA_ABBREVIATIONS = frozenset([
    'nldsp', 'usasp', 'company', 'bax', 'spo',
    'ccaza', 'cinry', 'and', 'cansp', 'oxyc',
    'scpr', 'gbrct', 'gbrsp', 'tjp', 'unk',
    'frasp', 'brasp', 'sol', 'cbst', 'pmco',
    'jpnct', 'frua', 'espct', 'pre',
    'dsu', 'gmbh', 'dse', 'belsp', 'crisp',
    'kdl', 'irlsp', 'mpi', 'avee', 'usani',
    'sun', 'belct', 'itasp', 'hkgsp', 'argsp',
    'aegr',
])

# Public health reporting entity labels collapsed to a single token.
_COMPANY_TOKEN_REPLACEMENTS = {
    "phhy": "pubhosp", "pheh": "pubhosp", "phho": "pubhosp", "phfr": "pubhosp",
}

# Country codes loaded from the CSV on first use; None until then.
_COMPANY_COUNTRY_CODES = None


def _load_company_country_codes():
    """Read the country-code CSV once and return its 'codes' column as a frozenset."""
    global _COMPANY_COUNTRY_CODES
    if _COMPANY_COUNTRY_CODES is None:
        codes = pd.read_csv(_COMPANY_COUNTRY_CODES_CSV)['codes'].tolist()
        _COMPANY_COUNTRY_CODES = frozenset(codes)
    return _COMPANY_COUNTRY_CODES


def process_company_text(text):
    """Tokenize and normalize a company / reporter name.

    For string input: punctuation and digit runs become spaces, the text is
    split on whitespace and lowercased, and tokens are kept only if they have
    three or more characters and are neither a country code (from the
    reference CSV) nor one of the extra abbreviations.  Public health entity
    labels are mapped to 'pubhosp'.  Finally, at most one whole-list
    replacement applies, in this priority order:

      * contains 'ridgefield'                 -> ['bi', 'pharmaceuticals']
      * contains both 'ge' and 'healthcare'   -> ['ge', 'healthcare']
      * contains 'alexion'                    -> ['alexion']

    The 'ge' test is made on the lowercased tokens BEFORE the length filter;
    a two-character token can never survive that filter, so testing it
    afterwards would make the GE Healthcare rule unreachable.

    The country-code CSV is read on the first string input only and cached,
    instead of being re-read for every value.

    Parameters
    ----------
    text : str or any
        Raw company text.

    Returns
    -------
    list of str, pd.NA, or None
        The normalized tokens; pd.NA when no token survives; None when `text`
        is not a string.
    """
    # Non-string values (NaN, None, ...) yield None
    if not isinstance(text, str):
        return None

    country_codes = _load_company_country_codes()

    # Remove punctuation and replace with a space.  The characters are escaped
    # so that ']' , '\' and '^' are treated literally inside the class.
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)

    # Remove all numerical characters and replace with a space
    text = re.sub(r'\d+', ' ', text)

    # Tokenize on whitespace and lowercase
    lowered = [token.lower() for token in text.split()]

    # Must be decided before short tokens such as 'ge' are filtered out
    is_ge_healthcare = 'ge' in lowered and 'healthcare' in lowered

    # Only retain tokens with 3 or more characters that are not country codes
    # or known abbreviations
    tokens = [
        token for token in lowered
        if len(token) > 2
        and token not in country_codes
        and token not in _COMPANY_EXTRA_ABBREVIATIONS
    ]

    # Update public health reporting entity labels
    tokens = [_COMPANY_TOKEN_REPLACEMENTS.get(token, token) for token in tokens]

    # Whole-list replacements for long manufacturer names (first match wins)
    if 'ridgefield' in tokens:
        tokens = ['bi', 'pharmaceuticals']
    elif is_ge_healthcare:
        tokens = ['ge', 'healthcare']
    elif 'alexion' in tokens:
        tokens = ['alexion']

    # Assign pd.NA if the token list is empty
    if not tokens:
        return pd.NA

    return tokens

In [17]:
# Function to clean manufacturer text
def clean_manufacturer_text(text_list):
    """Normalize a list of manufacturer strings into lowercase, dash-joined tokens.

    For every string in `text_list`: the whole words inc, llc, ltd, lp, corp
    and usa are removed (case-insensitive); punctuation other than commas
    becomes a space; runs of commas collapse to one; spaces become dashes and
    runs of dashes collapse to one; the text is split on commas and each piece
    is lowercased.  Empty pieces and pieces made only of dashes are dropped.
    Non-string entries of the list are skipped.

    Parameters
    ----------
    text_list : list or any
        Manufacturer strings to clean.

    Returns
    -------
    list of str or pd.NA
        The cleaned tokens; pd.NA when `text_list` is not a list, is empty, or
        yields no token.
    """
    # The type is checked BEFORE truthiness: `not pd.NA` and `not <array>`
    # raise, so missing values must be caught by isinstance first.
    if not isinstance(text_list, list) or not text_list:
        return pd.NA

    cleaned_tokens = []
    for text in text_list:
        if not isinstance(text, str):
            continue

        # Remove common corporate suffixes, ignoring case
        text = re.sub(r'\b(?:inc|llc|ltd|lp|corp|usa)\b', '', text, flags=re.IGNORECASE)

        # Remove all punctuation except commas and replace with spaces
        text = re.sub(f"[{re.escape(string.punctuation.replace(',', ''))}]", ' ', text)

        # Collapse runs of commas into a single comma
        text = re.sub(r',+', ',', text)

        # Replace spaces with dashes, then collapse runs of dashes
        text = text.replace(' ', '-')
        text = re.sub(r'-+', '-', text)

        # Split on commas and lowercase each piece.
        # NOTE(review): spaces are already dashes at this point, so strip()
        # does not remove leading/trailing dashes (e.g. 'pfizer-') — confirm
        # whether those should be trimmed as well.
        cleaned_tokens.extend(token.lower().strip() for token in text.split(','))

    # Remove any empty strings from the list and tokens made only of dashes
    cleaned_tokens = [token for token in cleaned_tokens if token and not re.fullmatch(r'-+', token)]

    # If the resulting token list is empty, assign pd.NA
    if not cleaned_tokens:
        return pd.NA

    return cleaned_tokens

In [16]:
def descriptive_stats(tokens, top_n=5, verbose=True):
    """Compute summary statistics for a flat collection of tokens.

    Missing entries (floats such as NaN, None and pd.NA) are ignored.  The
    input is NOT flattened: every remaining entry must be a hashable token
    with a length, e.g. a string.

    Parameters
    ----------
    tokens : iterable
        Flat collection of tokens.
    top_n : int, default 5
        How many of the most frequent tokens to report.
    verbose : bool, default True
        When True, print each statistic.

    Returns
    -------
    list
        [total_tokens, num_unique_tokens, lexical_diversity, num_characters,
        avg_token_length, token_length_variance, token_length_std_dev,
        most_common_tokens].  Variance and standard deviation are population
        values (numpy defaults).  With no usable tokens every numeric entry is
        0 and most_common_tokens is [].
    """
    # Drop missing-value placeholders; they are not tokens and have no length
    flat_tokens = [
        token for token in tokens
        if token is not None and token is not pd.NA and not isinstance(token, float)
    ]

    # Calculate the total and unique number of tokens
    total_tokens = len(flat_tokens)
    num_unique_tokens = len(set(flat_tokens))

    # Calculate lexical diversity
    lexical_diversity = num_unique_tokens / total_tokens if total_tokens > 0 else 0

    # Calculate the number of characters and the average token length
    token_lengths = [len(token) for token in flat_tokens]
    num_characters = sum(token_lengths)
    avg_token_length = num_characters / total_tokens if total_tokens > 0 else 0

    # Spread of token lengths.  numpy returns NaN (with a RuntimeWarning) for
    # an empty list, so the empty case is reported as 0 like the ratios above.
    if token_lengths:
        token_length_variance = np.var(token_lengths)
        token_length_std_dev = np.std(token_lengths)
    else:
        token_length_variance = 0.0
        token_length_std_dev = 0.0

    # Find the most common tokens
    most_common_tokens = Counter(flat_tokens).most_common(top_n)

    if verbose:
        print(f"There are {total_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The average token length is {avg_token_length:.3f} in the data.")
        print(f"The variance of token lengths is {token_length_variance:.3f} in the data.")
        print(f"The standard deviation of token lengths is {token_length_std_dev:.3f} in the data.")
        print(f"The {top_n} most common tokens are {most_common_tokens} in the data.")

    return [
        total_tokens,
        num_unique_tokens,
        lexical_diversity,
        num_characters,
        avg_token_length,
        token_length_variance,
        token_length_std_dev,
        most_common_tokens
    ]

In [19]:
def calculate_descriptives(column):
    """Return (mean, standard deviation, variance, skewness) of a numeric Series.

    Mean, standard deviation and variance use the pandas defaults, which skip
    missing values.  scipy's `skew` would instead return NaN as soon as one
    value is missing, so missing values are dropped before it is called to
    keep all four statistics based on the same observations.

    NOTE: a later cell of this notebook defines another function with this
    same name (printing a table instead of returning a tuple); whichever cell
    ran last is the one in effect.

    Parameters
    ----------
    column : pandas.Series
        Numeric data.

    Returns
    -------
    tuple
        (mean_value, std_deviation, variance_value, skewness_value)
    """
    # Calculate mean
    mean_value = column.mean()

    # Calculate standard deviation
    std_deviation = column.std()

    # Calculate variance
    variance_value = column.var()

    # Calculate skewness on the non-missing values only
    skewness_value = skew(column.dropna())

    return mean_value, std_deviation, variance_value, skewness_value

# Example usage:
# Create a small, evenly spaced sample DataFrame (module-level names `data` and `df`)
data = {
    'numbers': [2, 4, 6, 8, 10]
}
df = pd.DataFrame(data)

# Calculate descriptives for the 'numbers' column.
# The call result is unpacked into four values, so this relies on the
# tuple-returning calculate_descriptives defined directly above.
mean_val, std_dev, var, skewness = calculate_descriptives(df['numbers'])

# Print the results
print(f"Mean: {mean_val}")
print(f"Standard Deviation: {std_dev}")
print(f"Variance: {var}")
print(f"Skewness: {skewness}")


Mean: 6.0
Standard Deviation: 3.1622776601683795
Variance: 10.0
Skewness: 0.0


In [20]:
def calculate_descriptives(column):
    """Print a grid table of descriptive statistics for a numeric Series.

    The table lists median, mean, standard deviation, variance, skewness and
    the first and third quartiles.  The pandas statistics skip missing values
    by default; scipy's `skew` would return NaN if any value were missing, so
    missing values are dropped before it is called to keep every row based on
    the same observations.

    NOTE: this definition reuses the name of a function defined in an earlier
    cell of this notebook (which returns a tuple); running this cell replaces
    it.

    Parameters
    ----------
    column : pandas.Series
        Numeric data.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    median_value = column.median()

    mean_value = column.mean()

    std_deviation = column.std()

    variance_value = column.var()

    # Skewness on the non-missing values only
    skewness_value = skew(column.dropna())

    q1 = column.quantile(0.25)

    q3 = column.quantile(0.75)

    # Prepare data for tabulation
    headers = ['Statistic', 'Value']
    data = [
        ['Median (Md)', median_value],
        ['Mean (x-bar)', mean_value],
        ['Standard Deviation (s)', std_deviation],
        ['Variance (s2)', variance_value],
        ['Skewness', skewness_value],
        ['First Quartile (Q1)', q1],
        ['Third Quartile (Q3)', q3]
    ]

    # Print the table
    print(tabulate(data, headers=headers, tablefmt='grid'))

In [4]:
# Drug values arrive as lists; flatten them to a single string to work with
def clean_data(x):
    """Join a list into one comma-separated string; return anything else unchanged.

    List items are joined with ',' (no space).  Strings and all other types
    pass through as-is.
    """
    return ','.join(x) if isinstance(x, list) else x

In [5]:
# Find which reference values occur inside a text
def contains_unique_value(text, unique_values_lower):
    """Return every entry of `unique_values_lower` that is a substring of `text`.

    `text` is lowercased before matching; the reference values are used as
    given (the parameter name suggests the caller supplies them lowercased).
    Matches are returned in the order of `unique_values_lower`.
    """
    haystack = text.lower()
    return [candidate for candidate in unique_values_lower if candidate in haystack]

In [23]:
# Render a list as one comma-and-space separated string
def list_to_string(lst):
    """Join a list with ', '; return any non-list value unchanged."""
    if not isinstance(lst, list):
        # Non-list values (strings, NaN, ...) pass straight through
        return lst
    return ', '.join(lst)

In [7]:
def process_and_normalize_drugs(df_chunk, df2_synonyms, df2_names):
    """Add a 'drug_name_norm' column to `df_chunk` by fuzzy-matching drug names.

    For each row, the distinct pieces of row['med_product'] are joined into a
    query and matched against the pool of synonyms with `process.extractOne`.
    When the best match scores above 90, the first entry of `df2_synonyms`
    containing the matched synonym is located and the value at the same
    position in `df2_names` is returned; otherwise the row gets None.

    NOTE(review): `process` is not imported anywhere in the visible notebook;
    it presumably comes from a fuzzy-matching package (thefuzz / fuzzywuzzy /
    rapidfuzz) and must be in scope before this function is called — confirm.

    NOTE(review): the separators are the literal strings ' \\, ' and ' \\| '
    (space, backslash, comma/pipe, space), exactly as the original non-raw
    literals evaluated.  str.split is not a regex, so if the data is really
    delimited by ' , ' / ' | ' these never match — confirm against the data.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to normalize; must have a 'med_product' column.  Modified in place.
    df2_synonyms : pandas.Series of str
        Synonym strings, positionally aligned with `df2_names`.
    df2_names : pandas.Series
        Canonical drug names.

    Returns
    -------
    pandas.DataFrame
        `df_chunk`, with the 'drug_name_norm' column added.
    """
    # Loop-invariant: the synonym pool is identical for every row, so it is
    # built once per chunk.  dict.fromkeys de-duplicates in a stable order.
    synonyms_list = list(dict.fromkeys(' '.join(df2_synonyms).split(' \\| ')))

    def drug_name_norms(row):
        # Distinct product names, in first-seen order
        med_product_list = list(dict.fromkeys(str(row['med_product']).split(' \\, ')))
        match = process.extractOne(' '.join(med_product_list), synonyms_list)
        if match and match[1] > 90:
            hits = df2_synonyms.str.contains(match[0], regex=False, na=False)
            # Work with positions, not index labels, because the name is
            # fetched with .iloc; labels would be wrong for a non-default index.
            positions = np.flatnonzero(hits.to_numpy())
            if positions.size:
                return df2_names.iloc[positions[0]]
        return None

    df_chunk['drug_name_norm'] = df_chunk.apply(drug_name_norms, axis=1)
    return df_chunk

def parallel_worker(chunk, func, args):
    """Call `func` on one chunk, passing `args` as additional positional arguments."""
    return func(chunk, *args)


def apply_parallel(df, func, args, n_workers=None):
    """Split `df` into row chunks, run `func` on each and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to process.
    func : callable
        Called as func(chunk, *args); must return a pandas object that
        pd.concat accepts.  For the multi-process path it presumably has to be
        picklable (importable at module level) — confirm for your platform.
    args : tuple
        Extra positional arguments for `func`.
    n_workers : int, optional
        Number of chunks / worker processes.  Defaults to the CPU count.
        With a single worker (or a single non-empty chunk) the work runs in
        the current process and no pool is created.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The concatenated per-chunk results, in row order.
    """
    # Local import: the notebook's import cell does not bring `mp` into scope
    import multiprocessing as mp

    if n_workers is None:
        n_workers = mp.cpu_count()
    n_workers = max(1, int(n_workers))

    # Split row POSITIONS rather than the DataFrame itself, then slice with
    # .iloc; empty slices (more workers than rows) are skipped.
    position_groups = np.array_split(np.arange(len(df)), n_workers)
    chunks = [df.iloc[positions] for positions in position_groups if len(positions)]
    if not chunks:
        # Empty input: still hand it to func once so the output has its shape
        chunks = [df]

    jobs = [(chunk, func, args) for chunk in chunks]

    if n_workers == 1 or len(jobs) == 1:
        results = [parallel_worker(*job) for job in jobs]
    else:
        with mp.Pool(min(n_workers, len(jobs))) as pool:
            results = pool.starmap(parallel_worker, jobs)

    return pd.concat(results)

#if __name__ == '__main__':

#    start_time = time.time()

    # Apply the parallel processing
#    result_table = apply_parallel(drugs_test, process_and_normalize_drugs, (adrecs_drugs['DRUG_SYNONYMS'], adrecs_drugs['DRUG_NAME']))

    # Combine the result into a single column in the original table
#    drugs_test['drug_name_norm'] = result_table['drug_name_norm']

#    end_time = time.time()
#    execution_time = end_time - start_time
#    print(f"Execution time: {execution_time} seconds")

In [10]:
def find_matches(list1, list2):
    """Return the distinct elements that occur in both `list1` and `list2`.

    Inner lists are converted to tuples so they can be hashed; such matches
    are therefore returned as tuples.  The result follows the order in which
    the elements first appear in `list1`, which keeps the output reproducible
    across kernel restarts (a plain set intersection would order strings by
    their randomized hash).

    Parameters
    ----------
    list1, list2 : iterable
        Collections of hashable items and/or lists of hashable items.

    Returns
    -------
    list
        Each common element once, in `list1` order.
    """
    def _hashable(item):
        # Lists cannot be set members or dict keys; tuples can
        return tuple(item) if isinstance(item, list) else item

    candidates = set(_hashable(item) for item in list2)

    # dict keys de-duplicate list1 while keeping first-seen order
    ordered_unique = dict.fromkeys(_hashable(item) for item in list1)

    return [item for item in ordered_unique if item in candidates]

#list1 = manus_table['set_id'].tolist()
#list2 = labels_test['set_id'].tolist()

#matches = find_matches(list1, list2)
#matches

In [9]:
# Time it
#start_time = time.time()

# Create reverse mapping from set_id to ndc
#reverse_mapping = {}
#for ndc, set_ids in manu_ndc_set_dict.items():
#    if isinstance(set_ids, list):
#        for set_id in set_ids:
#            reverse_mapping[set_id] = ndc

# Fill a missing 'ndc' from a set_id -> ndc lookup
def fillna_ndc_with_setid(row, reverse_dict):
    """Return the row's 'ndc', falling back to `reverse_dict[row['set_id']]`.

    The lookup is used only when row['ndc'] is missing (per pd.isna) and the
    row's 'set_id' is a key of `reverse_dict`; in every other case the
    existing row['ndc'] value is returned unchanged.
    """
    current_ndc = row['ndc']

    # A present value always wins over the lookup
    if not pd.isna(current_ndc):
        return current_ndc

    set_id = row['set_id']
    if set_id in reverse_dict:
        return reverse_dict[set_id]
    return current_ndc

# Apply the function to fill NaN values in 'ndc'
#labels_test['ndc'] = labels_test.apply(lambda row: fillna_ndc_with_setid(row, reverse_mapping), axis = 1)

In [14]:
# Specific values removed by clean_output_list.  Kept as a module-level
# frozenset so it is built once and membership tests are hash lookups rather
# than a linear scan of the whole list for every item.
_OUTPUT_VALUES_TO_REMOVE = frozenset([
    "mg", "hr", "xxx", "aabc", "aacg", "aafi", "aand", "aaty", "aazg",
    "acne", "acre", "acid", "acting", "action", "activated", "adult",
    "advanced", "adventure", "aged", "agent", "aging", "american",
    "anal", "anti", "area", "armpit", "asian", "assorted", "athlete",
    "baby", 'first', 'treatment', 'infection', 'normal','based',
    'natural', 'heart', 'kidney', 'liver', 'control', 'effective',
    'cation', 'scent', 'sugar', 'solution', 'control', 'delay',
    'clean', 'white', 'cough', 'effect', 'normal', 'fresh',
    'health', 'human', 'sweet', 'clear', 'effect', 'improve',
    'powder', 'daily', 'release', 'quick', 'horse', 'muscle',
    'sensitive', 'supplement', 'diabetic', 'diabetes', 'throat',
    'blood', 'major', 'inc', 'support', 'level', 'whole',
    'michigan', 'clotting', 'severe', 'protection', 'capsule',
    'weight', 'height', 'professional', 'management', 'trigger',
    'system', 'nerve', 'cattle', 'medicated', 'joint', 'complete',
    'foundation', 'special', 'constipation', 'sting', 'influenza',
    'virus', 'guard', 'strength', 'tendon', 'injection', 'thick',
    'correct', 'deliver', 'pediatric', 'product', 'serious',
    'pressure', 'relief', 'black', 'essential', 'ultimate',
    'laboratory', 'safety', 'break', 'brain', 'official',
    'refill', 'light', 'protect', 'medication', 'deliver',
    'recovery', 'headache', 'immune', 'since', 'protect',
    'model', 'allergy', 'smoking', 'leader', 'donor', 'direct',
    'stomach', 'intestine', 'dietary', 'growth', 'order',
    'supplemen', 'medical', 'protects', 'treat', 'clinical', 'prevent', 'common', 'table', 'symptom', 'treated', 'therapy', 'inspec',
    'multi', 'active', 'child', 'ether', 'menta', 'ml', 'viola', 'small', 'equate', 'function', 'plant', 'tissue', 'cover',
    'fever', 'perio', 'total', 'protein', 'ester', 'young', 'infected', 'factor', 'tablet', 'california', 'enhance', 'labeled',
    'discus', 'water', 'surgical', 'house', 'inflammation', 'higher', 'salmon', 'breath', 'microbial', 'sanitary', 'broad',
    'breast', 'infant', 'inhibitor', 'family', 'mouth', 'chest', 'salmonella', 'bacterial', 'lymph', 'antibiotic', 'complex',
    'arthritis', 'right', 'formula', 'antimicrobial', 'fluid', 'education', 'elder', 'surface', 'hormone', 'pulmo', 'gluco',
    'cheese', 'removal', 'urinary', 'super', 'exposed', 'routine', 'cholesterol', 'equivalent', 'double', 'marrow', 'compound',
    'purpose', 'forming', 'listeria', 'ribes', 'physical', 'original', 'sterile', 'series', 'dressing', 'shell', 'unique',
    'gland', 'breathing', 'vessel', 'value', 'extended', 'extra', 'liquid', 'monocytogenes', 'borate', 'sleep', 'injectable',
    'toxin', 'finish', 'receptor', 'chloride', 'filled', 'external', 'regio', 'moving', 'female', 'arteria', 'energy', 'motion',
    'sodium', 'balance', 'artery', 'sport', 'frozen', 'fighting', 'reproductive', 'ultra', 'rheum', 'candida', 'fruit', 'rheuma',
    'indian', 'comfort', 'flush', 'restore', 'bladder', 'cutaneous', 'xtreme', 'travel', 'little', 'witch', 'minor', 'hepatitis',
    'renes', 'collection', 'alcohol', 'aries', 'colon', 'prostate', 'paste', 'resistance', 'vitamin', 'continuous', 'relieve',
    'hydrochloride', 'analgesic', 'european', 'fungal', 'linum', 'botulinum', 'fluor', 'coccus', 'english', 'silver', 'frida',
    'pancreas', 'extreme', 'night', 'rheumatoid', 'nasal', 'calcium', 'dental', 'photo', 'delayed', 'genital', 'green',
    'clostridium', 'elaps', 'damaged', 'johnson', 'solid', 'tuber', 'stick', 'element', 'inhalation', 'cream', 'oxygen',
    'poison', 'chill', 'derived', 'xygen', 'abies', 'woman', 'defense', 'antibacterial', 'hydration', 'spf', 'sinus', 'repair',
    'vaginal', 'orally', 'fatty', 'sweat', 'spring', 'sulfate', 'juice', 'transparent', 'stool', 'sulfat', 'rectal', 'touch',
    'stress', 'german', 'vegetable', 'topica', 'ethyl', 'companion', 'smooth', 'topical', 'serum', 'maximum', 'blast', 'spray',
    'digestive', 'recombinant', 'carbo', 'coagulation', 'northern', 'beta', 'artificial', 'tarte', 'lesion', 'staphylococcus',
    'aureus', 'crushed', 'thyroid', 'grain', 'preparation', 'potassium', 'mexican', 'releasing', 'paris', 'organic', 'portable',
    'bayer', 'mineral', 'lymphocyte', 'china', 'dissolve', 'ntric', 'oregon', 'chamber', 'acetate', 'waste', 'wound', 'methyl',
    'extract', 'apple', 'ophth', 'protective', 'chicken', 'enhancement', 'yellow', 'oyster', 'antigen', 'herbal', 'herba',
    'santa', 'refrigerated', 'synthetic', 'compressed', 'forest', 'dermal', 'silicon', 'nitrate', 'spleen', 'cleaning',
    'superior', 'mixture', 'stimulation', 'interferon', 'flavor', 'pepper', 'simple', 'piece', 'fiber', 'pellet', 'strengthening',
    'circulatory', 'ocean', 'esophagus', 'antifungal', 'vulva', 'butter', 'rescue', 'standardized', 'violet', 'semen', 'giant',
    'morning', 'streptococcus', 'enlarged', 'stimulating', 'healing', 'shield', 'phosphate', 'omega', 'platinum', 'powered',
    'coffee', 'strip', 'alpha', 'collagen', 'fresco', 'prime', 'mountain', 'berry', 'peanut', 'ketone', 'burning', 'pectin',
    'victoria', 'suspension', 'donna', 'neuropathy', 'thirty', 'medulla', 'lactic', 'rectum', 'simply', 'pacific', 'intensive',
    'spore', 'tinct', 'diarrheal', 'gastric', 'membrane', 'primer', 'mucosa', 'purified', 'remedy', 'yeast', 'campylobacter',
    'sticker', 'kentucky', 'uterus', 'pseudo', 'summer', 'clarifying', 'candy', 'patch', 'longa', 'firming', 'hydrate',
    'osteoarthritis', 'tooth', 'stimulant', 'everyday', 'lymphatic', 'steam', 'defining', 'choline', 'peptide', 'reliever',
    'magnesium', 'proof', 'clearing', 'sanitizer', 'montana', 'holly', 'concentrate', 'esium', 'spectrum', 'creme', 'papaya',
    'flavored', 'alfalfa', 'escherichia', 'carbohydrate', 'lingual', 'antiseptic', 'adrenal', 'iodine', 'balanced', 'ophthalmic',
    'herpes', 'tartrate', 'immunomodulator', 'ginger', 'pneumoniae', 'relieving', 'brown', 'tanning', 'plantar', 'carbon',
    'ozone', 'clover', 'dermatitis', 'clove', 'brucella', 'soluble', 'solub', 'regener', 'beauty', 'slimming', 'bruise', 'coated',
    'oxide', 'hydrogel', 'tropical', 'caffeine', 'applicator', 'honey', 'pollen', 'aeruginosa', 'pseudomonas', 'patent',
    'immature', 'bovine', 'protease', 'instant', 'roasted', 'aerosol', 'soybean', 'twist', 'orange', 'gonadotropin', 'depleted',
    'scalp', 'stuffy', 'papilloma', 'succinate', 'barrier', 'cultured', 'cysteine', 'mycobacterium', 'mucus', 'sanitizing',
    'aspart', 'coagulant', 'polymer', 'cerebral', 'glucagon', 'radish', 'ovine', 'bella', 'cardiaca', 'balancing', 'cloth',
    'coconut', 'sheep', 'shade', 'homeopathic', 'bacillus', 'ammonia', 'fumarate', 'posterior', 'gluten', 'acetyl', 'umbilical',
    'packet', 'enterococcus', 'facial', 'citrate', 'sunscreen', 'prenatal', 'wheat', 'hepar', 'liner', 'prostatic', 'preservative',
    'antacid', 'daytime', 'marine', 'staining', 'autologous', 'sponge', 'hepatica', 'dimethyl', 'difficile', 'lupus', 'salix',
    'electrolyte', 'culta', 'guinea', 'grape', 'typhi', 'salve', 'yersinia', 'sterilized', 'drainage', 'wellness', 'phenol',
    'chocolate', 'rabbit', 'serotonin', 'hemorrhoid', 'eczema', 'nitri', 'magic', 'sirolimus', 'bitartrate', 'teeth', 'pineapple',
    'shingle', 'aspartate', 'retinal', 'medicinal', 'psoriasis', 'square', 'menstrual', 'congestion', 'medium', 'ovaria',
    'cartilage', 'almond', 'pertussis', 'lotion', 'allogeneic', 'interleukin', 'convenience', 'syrup', 'cypress', 'peach', 'vivus',
    'amara', 'lipase', 'narcotic', 'triple', 'sunshine', 'gluconate', 'plane', 'timothy', 'drowsy', 'parvovirus', 'cortisol',
    'combo', 'sputum', 'rinse', 'liquida', 'repository', 'monkey', 'lemon', 'glycol', 'hemorrhoidal', 'angiotensin', 'porcine',
    'povidon', 'povidone', 'banana', 'faecalis', 'haemolyticus', 'fluoride', 'sulfur', 'musca', 'bubble', 'nickel', 'cobalt',
    'mexicana', 'chromium', 'canti', 'olismo', 'mesylate', 'probiotic', 'hydrogen', 'hydrogenated', 'detergent', 'technetium',
    'tilmanocept', 'shigella', 'crystal', 'mango', 'sacred', 'varicella', 'zoster', 'shampoo', 'outdoor', 'breakout', 'anticoagulant',
    'sickness', 'bordetella', 'caring', 'brazil', 'barley', 'tomato', 'generator', 'deodorant', 'vitis', 'americana', 'dalteparin',
    'fifty', 'urethra', 'cytomegalovirus', 'eardrop', 'glandula', 'caribbean', 'dioxide', 'glandular', 'tonsil', 'estradiol',
    'genuine', 'anesthetic', 'fungus', 'chorionic', 'rubber', 'shaping', 'envelope', 'bandage', 'creamy', 'inactivated', 'emulsion',
    'haemophilus', 'enhancer', 'broom', 'pyogenes', 'benefiting', 'chewable', 'gallbladder', 'cashew', 'klebsiella', 'clidinium',
    'influenzae', 'hispana', 'handwashing', 'collagenase', 'handwash', 'leafy', 'erection', 'solucion', 'disinfecting', 'basil',
    'hazel', 'olive', 'titanium', 'detox', 'renew', 'placenta', 'cleanse', 'bethlehem', 'globulin', 'secretion', 'hurricane',
    'phoenix', 'animale', 'hydrolyzed', 'jelly', 'nighttime', 'tincture', 'burst', 'traveler', 'transdermal', 'skeleton', 'diethyl',
    'stigma', 'smoothie', 'disinfectant', 'glycero', 'glycerol', 'lubricant', 'helicobacter', 'shake', 'saccharate', 'feminine',
    'redwood', 'butyrate', 'phenyl', 'exoskeleton', 'blueberry', 'eyewash', 'granule', 'finasteride', 'moisturizer', 'camphor',
    'nitrogen', 'resin', 'disodium', 'alginate', 'bilberry', 'diminishing', 'gallium', 'meniscus', 'chestnut', 'disintegrating',
    'minus', 'wrinkle', 'mover', 'remover', 'toner', 'shark', 'premium', 'fibrocartilage', 'allium', 'tauri', 'taurine', 'bromide',
    'colloid', 'alliu', 'phosphorous', 'venom', 'swimmer', 'furoate', 'lemonade', 'bubblegum', 'raspberry', 'fibroblast',
    'keratinocytes', 'tazobactam', 'plumb', 'alder', 'concentrated', 'coronavirus', 'mycoplasma', 'stranded', 'nhn', 'indole',
    'canis', 'cobra', 'icatibant', 'brava', 'cereus', 'propyl', 'propylene', 'hyaluronic', 'palladium', 'anorectal', 'epstein',
    'cholera', 'polidocanol', 'tacrolimus', 'wiesbaden', 'varicose', 'anticavity', 'xanthine', 'blanco', 'mouthwash', 'urethral',
    'vernal', 'belly', 'orchard', 'grass', 'burnetii', 'coxiella', 'cacao', 'comfrey', 'chickweed', 'simplex', 'corros',
    'linezolid', 'maple', 'whitening', 'aspen', 'epidermis', 'follicle', 'mucor', 'eliglustat', 'calcitonin', 'phosphorus',
    'chlamydia', 'bisulfate', 'tetanus', 'booster', 'tropicalis', 'krusei', 'parapsilosis', 'albicans', 'glabra', 'mirabegron',
    'pegaspargase', 'chrysanthemi', 'erwinia', 'asparaginase', 'asparagine', 'croton', 'grapefruit', 'adrenalin', 'boosting',
    'cassia', 'angustifolia', 'gondii', 'toxoplasma', 'fermented', 'sulph', 'sulphur', 'lantern', 'pomegranate', 'antioxidant',
    'alanine', 'melatonin', 'maleate', 'polyvinyl', 'glargine', 'glutamine', 'helium', 'derivative', 'reducer', 'tcm', 'cooling',
    'cedar', 'chlorine', 'bleach', 'miracle', 'citrus', 'enteriditis', 'asparagus', 'cyanide', 'nitrite', 'paint', 'ovary',
    'ribose', 'rickettsia', 'benzyl', 'spinosad', 'spinosa', 'folate', 'folic', 'vulgaris', 'embryo', 'immunoglobulin', 'pituitary',
    'propionate', 'saffron', 'garlic', 'mushroom', 'cucumber', 'chlorothiazide', 'malvin', 'quinine', 'passion', 'chelate',
    'detoxifying', 'massage', 'classic', 'vardenafil', 'tattoo', 'carboplatin', 'apricot', 'cinnamon', 'polysaccharide', 'foaming',
    'protectant', 'calfactant', 'morgan', 'palmitate', 'fingolimod', 'tigan', 'ticagrelor', 'copper', 'octyl', 'sipuleucel',
    'borage', 'nitric', 'geranium', 'strawberry', 'melon', 'epithelium', 'bimatoprost', 'premier', 'curve', 'murine', 'glucosamine',
    'teething', 'belladonna', 'cortex', 'atropine', 'toothpaste', 'sturgeon', 'resurfacing', 'cellulose', 'gentamicin', 'gamma',
    'oleum', 'petroleum', 'earwax', 'picosulfate', 'bisacodyl', 'citric', 'malus', 'midodrine', 'duodenum', 'albumin', 'album',
    'meglumine', 'rosewood', 'hawthorn', 'perfect', 'overnight', 'cryptosporidium', 'giardia', 'speciosa', 'refresh', 'ointment',
    'mulberry', 'coleus', 'bitter', 'lotus', 'marum', 'foenum', 'trigonella', 'hexafluoride', 'microspheres', 'radium', 'ibandronate',
    'scripta', 'walnut', 'dandelion', 'artemisiifolia', 'ambrosia', 'roflumilast', 'butal'
])


def clean_output_list(output_list):
    """Strip digits from each item and drop items that are generic or empty.

    For every string in `output_list`, all digit runs are removed.  The
    digit-free string is kept unless it is one of the specific values in
    _OUTPUT_VALUES_TO_REMOVE (exact, case-sensitive match) or is empty.

    Items that consisted only of digits are dropped rather than returned as
    '': an empty string is a substring of every text, so it would match
    everything in a substring-based lookup such as find_matching_drugs.

    Parameters
    ----------
    output_list : iterable of str
        Candidate strings to clean.

    Returns
    -------
    list of str
        The digit-free strings that survived, in input order.
    """
    cleaned_list = []
    for item in output_list:
        # Remove all numbers
        item_no_numbers = re.sub(r'\d+', '', item)
        # Skip empty leftovers and the specific values to remove
        if item_no_numbers and item_no_numbers not in _OUTPUT_VALUES_TO_REMOVE:
            cleaned_list.append(item_no_numbers)

    return cleaned_list

In [2]:
def find_matching_drugs(text, unique_drug_names):
    """Return the drug names that occur in ``text``, ignoring case.

    Matching is a plain substring test, so a name that is embedded in a
    longer word also counts as a match.

    Parameters
    ----------
    text : str
        Free text to search. Any non-string value (for example ``None`` or a
        missing-value float) is treated as containing no drug names.
    unique_drug_names : iterable of str
        Candidate drug names.

    Returns
    -------
    list of str
        The matching names with their original casing, in the order they
        appear in ``unique_drug_names``.
    """
    if not isinstance(text, str):
        return []

    # Lower-case the text once instead of once per candidate name.
    text_lower = text.lower()
    return [drug for drug in unique_drug_names if drug.lower() in text_lower]

In [3]:
# Function to check for matches between text and unique_reactions
def check_reactions(text, unique_reactions):
    """Flag, for every reaction term, whether it appears in ``text``.

    The comparison is a case-insensitive substring test.

    Parameters
    ----------
    text : str
        Free text to search.
    unique_reactions : iterable of str
        Reaction terms to look for.

    Returns
    -------
    list of int
        One entry per term, in the same order as ``unique_reactions``:
        ``1`` when the term occurs in ``text``, ``0`` otherwise.
    """
    haystack = text.lower()
    flags = []
    for term in unique_reactions:
        flags.append(int(term.lower() in haystack))
    return flags

In [6]:
#import pandas as pd
#import json
#import re
#import logging

# Set up basic configuration for logging
#logging.basicConfig(level=logging.ERROR)

#def clean_json_string(json_str):
#    if json_str is None:
#        return None
#    # Replace single quotes with double quotes
#    json_str = json_str.replace("'", '"')
#    
#    # Remove trailing commas before closing brackets
#    json_str = re.sub(r',(\s*[\]}])', r'\1', json_str)
#    
#    # Remove extra commas within lists
#    json_str = re.sub(r',(\s*[\]])', r'\1', json_str)
#    
#    # Ensure that escaped quotes are correctly formatted
#    json_str = re.sub(r'\\\"', '"', json_str)
#    
#    return json_str

#def safely_parse_json(json_str):
#    if json_str is None:
#        return None
#    try:
#        json_str = clean_json_string(json_str)
#        return json.loads(json_str)
#    except json.JSONDecodeError as e:
#        logging.error(f"Error parsing JSON: {e}")
#        snippet = json_str[:1000]
#        logging.error(f"Problematic JSON snippet: {snippet}...")
#        return None

#def flatten_dict(d, parent_key='', sep='_'):
#    items = []
#    for k, v in d.items():
#        new_key = f"{parent_key}{k}" if parent_key == '' else f"{parent_key}{sep}{k}"
#        if isinstance(v, dict):
#            items.extend(flatten_dict(v, new_key, sep=sep).items())
#        elif isinstance(v, list):
#            for i, item in enumerate(v):
#                if isinstance(item, dict):
#                    items.extend(flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
#                else:
#                    items.append((f"{new_key}{sep}{i}", item))
#        else:
#            items.append((new_key, v))
#    return dict(items)

#def extract_and_flatten_drug(row):
#    event_id = row['event_id']
#    drug_entries = row['drug']
#    
#    if not isinstance(drug_entries, list):
#        return []
    
#    expanded_entries = []
#    for drug_entry in drug_entries:
#        if isinstance(drug_entry, dict):
#            flattened_entry = flatten_dict(drug_entry)
#            flattened_entry['event_id'] = event_id
#            expanded_entries.append(flattened_entry)
    
#    return expanded_entries

# Apply the JSON cleaning and parsing to the 'drug' column
#events_table_subset['drug'] = events_table_subset['drug'].apply(
#    lambda x: safely_parse_json(x) if isinstance(x, str) else x
#)

# Extract and flatten all columns from the 'drug' dictionary, keeping 'event_id'
#expanded_data = []
#for _, row in events_table_subset.iterrows():
#    entries = extract_and_flatten_drug(row)
#    expanded_data.extend(entries)

# Convert the list of expanded entries to a DataFrame
#patient_drugs_df = pd.DataFrame(expanded_data)

# Reorder columns to ensure 'event_id' is the first column
#if not patient_drugs_df.empty and 'event_id' in patient_drugs_df.columns:
    # Reorder columns with 'event_id' as the first column
#    columns = ['event_id'] + [col for col in patient_drugs_df.columns if col != 'event_id']
#    patient_drugs_df = patient_drugs_df[columns]

# Display the DataFrame
#print(patient_drugs_df.head())

In [7]:
#import pandas as pd
#import json
#import re
#import logging

# Set up basic configuration for logging
#logging.basicConfig(level=logging.ERROR)

#def clean_json_string(json_str):
#    if json_str is None:
#        return None
#    # Replace single quotes with double quotes
#    json_str = json_str.replace("'", '"')
    
    # Remove trailing commas before closing brackets
#    json_str = re.sub(r',(\s*[\]}])', r'\1', json_str)
    
    # Remove extra commas within lists
#    json_str = re.sub(r',(\s*[\]])', r'\1', json_str)
    
    # Ensure that escaped quotes are correctly formatted
#    json_str = re.sub(r'\\\"', '"', json_str)
    
#    return json_str

#def safely_parse_json(json_str):
#    if json_str is None:
#        return None
#    try:
#        json_str = clean_json_string(json_str)
#        return json.loads(json_str)
#    except json.JSONDecodeError as e:
#        logging.error(f"Error parsing JSON: {e}")
#        snippet = json_str[:1000]
#        logging.error(f"Problematic JSON snippet: {snippet}...")
#        return None

#def flatten_dict(d, parent_key='', sep='_'):
#    items = []
#    for k, v in d.items():
#        new_key = f"{parent_key}{k}" if parent_key == '' else f"{parent_key}{sep}{k}"
#        if isinstance(v, dict):
#            items.extend(flatten_dict(v, new_key, sep=sep).items())
#        elif isinstance(v, list):
#            for i, item in enumerate(v):
#                if isinstance(item, dict):
#                    items.extend(flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items())
#                else:
#                    items.append((f"{new_key}{sep}{i}", item))
#        else:
#            items.append((new_key, v))
#    return dict(items)

#def extract_and_flatten_reaction(row):
#    event_id = row['event_id']
#    reaction_entries = row['reaction']
    
#    if not isinstance(reaction_entries, list):
#        return []
    
#    expanded_entries = []
#    for reaction_entry in reaction_entries:
#        if isinstance(reaction_entry, dict):
#            flattened_entry = flatten_dict(reaction_entry)
#            flattened_entry['event_id'] = event_id
#            expanded_entries.append(flattened_entry)
    
#    return expanded_entries

# Apply the JSON cleaning and parsing to the 'reaction' column
#events_table_subset['reaction'] = events_table_subset['reaction'].apply(
#    lambda x: safely_parse_json(x) if isinstance(x, str) else x
#)

# Extract and flatten all columns from the 'reaction' dictionary, keeping 'event_id'
#expanded_data = []
#for _, row in events_table_subset.iterrows():
#    entries = extract_and_flatten_reaction(row)
#    expanded_data.extend(entries)

# Convert the list of expanded entries to a DataFrame
#patient_reactions_df = pd.DataFrame(expanded_data)

# Reorder columns to ensure 'event_id' is the first column
#if not patient_reactions_df.empty and 'event_id' in patient_reactions_df.columns:
    # Reorder columns with 'event_id' as the first column
#    columns = ['event_id'] + [col for col in patient_reactions_df.columns if col != 'event_id']
#    patient_reactions_df = patient_reactions_df[columns]

# Display the DataFrame
#print(patient_reactions_df.head())


In [2]:
def evaluate_model(model, X_test, y_test, labels, model_name, axis_labels=None):
    """Score a fitted classifier on test data and save its confusion-matrix heatmap.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X_test)``.
    X_test : array-like
        Test features, passed unchanged to ``model.predict``.
    y_test : array-like
        True class labels for ``X_test``.
    labels : sequence
        Class labels, in the order used for the per-class metrics and for the
        rows/columns of the confusion matrix.
    model_name : str
        Stored in the ``model`` column and embedded in the image file name
        and in the plot title.
    axis_labels : sequence of str, optional
        Heatmap tick labels, one per entry of ``labels``. When omitted,
        ``['Not Serious', 'Serious', 'Death']`` is used if there are exactly
        three classes; otherwise the string form of each label is used.

    Returns
    -------
    metrics_df : pandas.DataFrame
        One row per class with columns ``model``, ``class``, ``accuracy``
        (the overall accuracy, repeated on every row), ``precision``,
        ``recall``, ``f1_score`` and ``specificity``, rounded to three
        decimals.
    img_path : str
        Path of the saved heatmap PNG inside
        ``../ImageLibrary/ConfusionMatrices``.
    y_pred : array-like
        Predictions returned by ``model.predict(X_test)``.

    Raises
    ------
    ValueError
        If ``axis_labels`` is given but its length differs from ``labels``.
    """
    if axis_labels is None:
        if len(labels) == 3:
            axis_labels = ['Not Serious', 'Serious', 'Death']
        else:
            axis_labels = [str(label) for label in labels]
    elif len(axis_labels) != len(labels):
        raise ValueError(
            f"axis_labels has {len(axis_labels)} entries but labels has {len(labels)}"
        )

    # Predict on the test data
    y_pred = model.predict(X_test)

    # Overall accuracy plus per-class precision / recall / F1
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, labels=labels)
    recall = recall_score(y_test, y_pred, average=None, labels=labels)
    f1 = f1_score(y_test, y_pred, average=None, labels=labels)

    # Confusion matrix: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Per-class specificity = TN / (TN + FP).
    # The negatives for class i are all samples whose TRUE label is not i,
    # i.e. everything outside row i, so the denominator is total - row sum.
    total = cm.sum()
    actual_positive = cm.sum(axis=1)      # TP + FN (row sums)
    predicted_positive = cm.sum(axis=0)   # TP + FP (column sums)
    true_negative = total - actual_positive - predicted_positive + np.diag(cm)
    actual_negative = total - actual_positive  # TN + FP
    # Specificity is undefined (NaN) for a class when there are no negatives.
    specificity = np.full(len(labels), np.nan)
    np.divide(true_negative, actual_negative, out=specificity, where=actual_negative != 0)

    # Round metrics to three decimal places
    accuracy = round(accuracy, 3)
    precision = np.round(precision, 3)
    recall = np.round(recall, 3)
    f1 = np.round(f1, 3)
    specificity = np.round(specificity, 3)

    # One output row per class
    metrics_df = pd.DataFrame([
        {
            'model': model_name,
            'class': label,
            'accuracy': accuracy,
            'precision': precision[i],
            'recall': recall[i],
            'f1_score': f1[i],
            'specificity': specificity[i],
        }
        for i, label in enumerate(labels)
    ])

    # Make sure the output directory exists (no error if it already does)
    folder_path = '../ImageLibrary/ConfusionMatrices'
    os.makedirs(folder_path, exist_ok=True)
    img_path = os.path.join(folder_path, f'confusion_matrix_{model_name}.png')

    # Draw the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=axis_labels, yticklabels=axis_labels,
                    annot_kws={"size": 11}, ax=ax)

        # Axis labels and title with larger font sizes
        ax.set_xlabel('Predicted Label', fontsize=14)
        ax.set_ylabel('True Label', fontsize=14)
        ax.set_title(f'Classification Matrix Heatmap for {model_name}', fontsize=16)

        # Adjust the size of the tick labels
        ax.set_xticklabels(ax.get_xticklabels(), fontsize=12)
        ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)

        fig.tight_layout()
        fig.savefig(img_path)
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)

    return metrics_df, img_path, y_pred

In [4]:
def get_model_coefficients_and_odds(data, model_name, top_n=5):
    """Tabulate the intercept and largest-magnitude coefficients per outcome.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``columns`` supply the feature names; presumably the
        training features, with one column per coefficient (confirm against
        the caller).
    model_name : object
        Despite the name, this is the fitted model object: it must expose
        ``intercept_`` and ``coef_`` indexable by class position 0, 1, 2.
    top_n : int, default 5
        Number of coefficient rows to keep after the intercept row.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Keys ``'Non-Serious'``, ``'Serious'`` and ``'Death'`` (class positions
        0, 1, 2). Each frame has columns ``Feature``, ``LogOdds`` and ``Odds``
        (``exp(LogOdds)``): the intercept first, then the coefficients ordered
        by decreasing absolute value, truncated to ``top_n + 1`` rows.
    """
    tables = {}

    for class_idx, outcome in enumerate(('Non-Serious', 'Serious', 'Death')):
        # Single-row frame for the intercept of this class
        intercept_row = pd.DataFrame({
            'Feature': ['Intercept'],
            'LogOdds': [model_name.intercept_[class_idx]],
        })
        intercept_row['Odds'] = np.exp(intercept_row['LogOdds'])

        # Feature coefficients, strongest effect (by magnitude) first
        coefs = pd.DataFrame({
            'Feature': data.columns,
            'LogOdds': model_name.coef_[class_idx],
        })
        by_magnitude = coefs['LogOdds'].abs().sort_values(ascending=False).index
        coefs = coefs.reindex(by_magnitude)
        coefs['Odds'] = np.exp(coefs['LogOdds'])

        # Intercept stays on top; keep it plus the top_n coefficients
        tables[outcome] = pd.concat([intercept_row, coefs]).head(top_n + 1)

    return tables

In [2]:
# Product codes compared against the 'ndc9' column, and the display name for
# each one. The two lists are positional: entry i of one describes entry i of
# the other.
input_ndc_list = [
    2405911,    # Dupixent
    502420150,  # Ocrevus
    595720402,  # Revlimid
    684620226,  # Ezetimibe
]
drug_names = [
    'Dupixent',
    'Ocrevus',
    'Revlimid',
    'Ezetimibe',
]

# Numeric columns summarized by mean and standard deviation.
nums = [
    'age',
    'weight',
    'unit_price',
]

# Categorical columns summarized by normalized value counts.
cats = [
    'sex',
    'report_source',
    'serious_outcome',
]

def top_drug_comparison(input_ndc_list, names=None, data=None, num_cols=None, cat_cols=None):
    """Summarize numeric and categorical columns for a list of drugs.

    Parameters
    ----------
    input_ndc_list : sequence
        Values matched against the ``'ndc9'`` column of ``data``; one summary
        is produced per entry.
    names : sequence of str, optional
        Display name for each entry of ``input_ndc_list`` (same order, same
        length). Defaults to the notebook-level ``drug_names``.
    data : pandas.DataFrame, optional
        Frame holding an ``'ndc9'`` column plus the summarized columns.
        Defaults to the notebook-level ``master_query_df``.
    num_cols : list of str, optional
        Columns summarized by mean and standard deviation. Defaults to the
        notebook-level ``nums``.
    cat_cols : list of str, optional
        Columns summarized by normalized value counts (rounded to two
        decimals). Defaults to the notebook-level ``cats``.

    Returns
    -------
    num_desc : pandas.DataFrame
        Indexed by the entries of ``input_ndc_list``; two-level columns
        ``('Mean', col)`` and ``('StdDev', col)`` for every numeric column,
        plus a trailing ``'DrugNames'`` column.
    counts_df : pandas.DataFrame
        One row per (drug, category value). Columns are ``'DrugName'``,
        ``'level_0'`` (the drug code), ``'level_1'`` (the category value) and
        one share column per categorical column.

    Raises
    ------
    ValueError
        If ``names`` and ``input_ndc_list`` differ in length.
    """
    # Fall back to the notebook-level defaults so existing one-argument calls
    # keep working unchanged.
    if names is None:
        names = drug_names
    if data is None:
        data = master_query_df
    if num_cols is None:
        num_cols = nums
    if cat_cols is None:
        cat_cols = cats

    input_ndc_list = list(input_ndc_list)
    names = list(names)
    if len(names) != len(input_ndc_list):
        raise ValueError(
            f"Got {len(input_ndc_list)} drug codes but {len(names)} drug names; "
            "they must line up one-to-one."
        )

    means = []
    sds = []
    counts = []

    for ndc in input_ndc_list:
        drug_df = data.loc[data['ndc9'] == ndc]

        # Mean and standard deviation of each numeric column for this drug
        means.append(drug_df[num_cols].mean())
        sds.append(drug_df[num_cols].std())

        # Share of each category value, per categorical column
        count_values = {}
        for col in cat_cols:
            shares = drug_df[col].value_counts(normalize=True).round(decimals=2)
            # Sort by share first, and by category value in case of ties
            count_values[col] = dict(sorted(shares.items(), key=lambda item: (item[1], item[0])))
        # Transpose so category values become rows and columns become the categorical fields
        counts.append(pd.DataFrame.from_dict(count_values, orient='index').T)

    # Numeric summary: one row per drug, Mean/StdDev column groups
    means_df = pd.DataFrame(means, index=input_ndc_list)
    sds_df = pd.DataFrame(sds, index=input_ndc_list)
    num_desc = pd.concat([means_df, sds_df], axis=1, keys=['Mean', 'StdDev'])
    num_desc['DrugNames'] = names

    # Categorical summary: stack the per-drug tables, keyed by drug code
    counts_df = pd.concat(counts, keys=input_ndc_list)

    # Label each row with the drug name that belongs to its code
    name_by_ndc = dict(zip(input_ndc_list, names))
    counts_df['DrugName'] = counts_df.index.get_level_values(0).map(name_by_ndc)

    # Turn the (drug code, category value) index into ordinary columns
    counts_df = counts_df.reset_index()

    # Reorder columns so 'DrugName' comes first
    cols = ['DrugName'] + [col for col in counts_df.columns if col != 'DrugName']
    counts_df = counts_df[cols]

    return num_desc, counts_df

In [6]:
def display_model_metrics_table(metrics_dfs):
    """Stack per-model metric frames, print them as one text table, and return the result.

    Parameters
    ----------
    metrics_dfs : list of pandas.DataFrame
        Metric frames to stack row-wise.

    Returns
    -------
    pandas.DataFrame
        All input frames concatenated with a fresh 0..n-1 index.
    """
    # Stack every frame into a single table with a clean index
    combined_df = pd.concat(metrics_dfs, ignore_index=True)

    # tabulate takes plain row lists plus the header names
    body_rows = combined_df.values.tolist()
    column_names = combined_df.columns.tolist()
    print(tabulate(body_rows, column_names, tablefmt='pretty'))

    return combined_df

#model_metrics = display_model_metrics_table([baseline_metrics_df,log_l1_metrics_df, log_l2_metrics_df, elastic_net_metrics_df,
#                           tree1_metrics_df, rf_metrics_df, knn_metrics_df, grboost_metrics_df])