In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
import pandas as pd
from string import punctuation
punctuation = set(punctuation)
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



def process_text(text):
    '''Turn a free-text string into a list of cleaned, lemmatized tokens.

    The text is tokenized with NLTK's word_tokenize. A token is kept only if:
      * its lowercase form is not an English stopword,
      * it contains no punctuation character (ASCII punctuation plus a few
        typographic marks),
      * it is not made up solely of digits, and
      * it is longer than one character.
    Kept tokens are lowercased and lemmatized with WordNetLemmatizer.

    Returns a list of strings, in the order the tokens appeared.'''
    # Characters that disqualify any token containing them
    excluded_chars = set(string.punctuation) | {'‘', '—', '“', '«'}

    raw_tokens = word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        # Reject the token if any of its characters is punctuation
        if not excluded_chars.isdisjoint(raw_token):
            continue
        # Purely numeric tokens are dropped
        if raw_token.isdigit():
            continue
        # Single-character tokens are dropped
        if len(raw_token) <= 1:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))

    return cleaned

In [2]:
# process text with 2-token n-grams for language context
def process_text_grams(text):
    """Tokenize text and return its cleaned unigrams followed by their bigrams.

    Tokens are produced by NLTK's word_tokenize. A token is kept when its
    lowercase form is not an English stopword and none of its characters is
    punctuation (ASCII punctuation plus the left single quote). Kept tokens
    are lowercased and lemmatized.

    Returns a single list: all kept tokens first, then every pair of
    neighbouring kept tokens joined by one space.
    """
    # Characters that disqualify any token containing them
    excluded_chars = set(string.punctuation) | {'‘'}

    raw_tokens = word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    unigrams = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        if lowered not in stop_words and excluded_chars.isdisjoint(raw_token):
            unigrams.append(lemmatizer.lemmatize(lowered))

    # Pair each kept token with the one that follows it
    bigram_strings = [
        ' '.join((first, second))
        for first, second in zip(unigrams, unigrams[1:])
    ]

    return unigrams + bigram_strings

In [3]:
# Define the text processing function for the drug labels specifically
def process_label_text(text):
    """Clean one drug-label text value into a list of unique lemmatized tokens.

    Non-string inputs are returned unchanged. For strings, every ASCII
    punctuation character is replaced by a space, the result is tokenized
    with NLTK's word_tokenize, English stopwords are removed, and the
    remaining tokens are lowercased and lemmatized. Repeated tokens are
    collapsed to their first occurrence and the literal token 'nan' is
    discarded.

    Returns the list of tokens, or pd.NA when no token survives.
    """
    # Anything that is not a string is handed back as-is
    if not isinstance(text, str):
        return text

    # Map every ASCII punctuation character to a single space
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    spaced_text = text.translate(punctuation_to_space)

    raw_tokens = word_tokenize(spaced_text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word.lower())
        for word in raw_tokens
        if word.lower() not in stop_words
    ]

    # dict keys keep first-seen order, which de-duplicates without reordering;
    # the 'nan' token is dropped in the same pass
    distinct = [lemma for lemma in dict.fromkeys(lemmas) if lemma != 'nan']

    return distinct if distinct else pd.NA

In [4]:
# Process drug label text with two-token n-grams for context
def process_label_text_grams(text):
    """Clean one drug-label text value into unique tokens plus their bigrams.

    Non-string inputs are returned unchanged. For strings, every ASCII
    punctuation character is replaced by a space, the result is tokenized
    with NLTK's word_tokenize, English stopwords are removed, and the
    remaining tokens are lowercased and lemmatized. Repeated tokens are
    collapsed to their first occurrence.

    Returns one list: the unique tokens, followed by each pair of
    neighbouring unique tokens joined by a space. Because pairs are formed
    after de-duplication, a bigram can join two tokens that were not adjacent
    in the input text.
    """
    # Anything that is not a string is handed back as-is
    if not isinstance(text, str):
        return text

    # Map every ASCII punctuation character to a single space
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    spaced_text = text.translate(punctuation_to_space)

    raw_tokens = word_tokenize(spaced_text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word.lower())
        for word in raw_tokens
        if word.lower() not in stop_words
    ]

    # dict keys keep first-seen order, which de-duplicates without reordering
    distinct = list(dict.fromkeys(lemmas))

    # Two-token n-grams over the de-duplicated sequence
    pair_strings = [' '.join(pair) for pair in ngrams(distinct, 2)]

    return distinct + pair_strings

In [5]:
# Create function to make unique IDs for each table
def add_sequential_index(df, index_col_name):
    """Return a copy of ``df`` with a 1-based sequential ID column in first position.

    IDs are assigned by row position (1, 2, ..., len(df)), not derived from
    the existing index. Deriving them from the index gives gaps after
    filtering, repeated IDs after concatenation, and a KeyError when the
    index has a name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to number. It is not modified.
    index_col_name : str
        Name of the ID column to create.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 row index and ``index_col_name`` as the
        first column. A named index is kept as an ordinary column so its data
        is not lost; an unnamed index is discarded.

    Raises
    ------
    ValueError
        If ``index_col_name`` already exists as a column.
    """
    # Keep a named index as data; an unnamed one carries nothing worth saving
    has_named_index = any(name is not None for name in df.index.names)
    numbered = df.reset_index(drop=not has_named_index)

    # Number rows by position, starting from 1 instead of 0
    numbered.insert(0, index_col_name, range(1, len(numbered) + 1))

    return numbered

In [6]:
# Function to return count of NaN and proportion of NaN in each column for a dataframe
def nan_info(df):
    """Summarize missing values per column of a DataFrame.

    Returns a DataFrame with one row per column of ``df`` and three columns:
    'column_name', 'null_count' (number of missing entries) and
    'null_proportion'. Despite its name, 'null_proportion' is expressed as a
    percentage (0-100) of the row count.
    """
    # Number of missing entries in every column
    missing_per_column = df.isna().sum()

    # Share of rows that are missing, scaled to a percentage
    total_rows = len(df)
    percent_missing = (missing_per_column / total_rows) * 100

    summary = pd.DataFrame(
        {
            'column_name': missing_per_column.index,
            'null_count': missing_per_column.values,
            'null_proportion': percent_missing.values,
        }
    )
    return summary

In [7]:
# Missing values to null for now (simplifies type conversions & plotting)
def na_to_null(df, column):
    """Replace the literal string 'N/A' in one column with NaN.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    without_placeholder = df[column].replace(to_replace='N/A', value=np.nan)
    df[column] = without_placeholder
    return df

In [8]:
# Function to remove duplicates
def remove_duplicates(tokens):
    """Return the distinct items of ``tokens`` as a list, in first-seen order.

    Going through a plain ``set`` would give an arbitrary order that can
    differ between kernel sessions for strings, making downstream results
    irreproducible. Dict keys preserve insertion order, so the output is
    deterministic.

    Items must be hashable.
    """
    return list(dict.fromkeys(tokens))

In [9]:
# Function to remove duplicates and handle NaNs
def remove_duplicates_nan(tokens):
    """De-duplicate a token list, dropping the literal token 'nan'.

    Non-list inputs (for example a missing value) are returned unchanged.
    For lists, every 'nan' token is removed first. If nothing is left the
    function returns pd.NA; otherwise it returns the distinct remaining
    tokens in first-seen order.

    Order is preserved through dict keys rather than a ``set``, whose
    iteration order for strings can change between kernel sessions.
    """
    # Anything that is not a list is handed back as-is
    if not isinstance(tokens, list):
        return tokens

    # Remove "nan" tokens if present
    kept = [token for token in tokens if token != 'nan']

    # Signal "no tokens" with pd.NA rather than an empty list
    if not kept:
        return pd.NA

    return list(dict.fromkeys(kept))

In [10]:
# Function to classify the product type
def classify_product_type(product_types):
    """Map a product-type value to a numeric class.

    Returns
    -------
    int
        2 if 'human otc' is contained in ``product_types``,
        1 if 'human prescription' is contained in it (and 'human otc' is not),
        0 otherwise, including when ``product_types`` is a missing value.

    Containment uses Python's ``in`` operator, so for a list it is an exact
    element match and for a string it is a substring match. Matching is
    case-sensitive.
    """
    # None, NaN and pd.NA are not containers and would make ``in`` raise
    # TypeError; treat a missing value as "no known product type"
    if pd.api.types.is_scalar(product_types) and pd.isna(product_types):
        return 0

    # OTC takes precedence when both types are present
    if 'human otc' in product_types:
        return 2
    if 'human prescription' in product_types:
        return 1
    return 0

In [11]:
# Process MedDRA reaction terms
# Replace each space with a period and lowercase all letters



In [12]:
# Map age units to years, based on code specified here: https://open.fda.gov/apis/drug/event/searchable-fields/
def convert_to_years(age, unit):
    """Convert an age expressed in a coded unit to years.

    Unit codes handled: 800 decade, 801 year, 802 month, 803 week, 804 day,
    805 hour.

    Parameters
    ----------
    age : number or numeric string
        Age value in the given unit.
    unit : number or numeric string
        Unit code. Numeric strings such as "801" are accepted, so values that
        arrive as text no longer fall through to NaN.

    Returns
    -------
    number
        Age in years. NaN when the unit is missing or unrecognized, when the
        age is None, or when a string argument cannot be parsed as a number.
        When both arguments are already numeric and the unit is 801, ``age``
        is returned unchanged.
    """
    if age is None or pd.isna(unit):  # Missing age or unit
        return np.nan

    # Text arithmetic is wrong ("45" * 10 repeats the string), and a string
    # unit never equals an integer code, so coerce text to numbers
    if isinstance(age, str) or isinstance(unit, str):
        try:
            age, unit = float(age), float(unit)
        except (TypeError, ValueError):
            return np.nan

    if unit == 800:  # Decade
        return age * 10
    if unit == 801:  # Year
        return age

    # Units smaller than a year: number of units per year
    units_per_year = {
        802: 12,        # Month
        803: 52,        # Week
        804: 365,       # Day
        805: 365 * 24,  # Hour
    }
    divisor = units_per_year.get(unit)
    if divisor is None:
        return np.nan  # Return NaN for unknown units
    return age / divisor

In [16]:
# Return boxplot of character length for object columns, as well as descriptive statistics of character length
def plot_character_length(df, df_name):
    """Plot and summarize the character length of values in object columns.

    Every value in an object-dtype column is converted with ``str`` and
    measured with ``len``, so a missing value counts as the length of its
    string form rather than as zero. A horizontal boxplot of those lengths is
    shown, one box per column, followed by a printed table of descriptive
    statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    df_name : str
        Label used in the plot title.

    Returns
    -------
    None
        If ``df`` has no object columns, a short message is printed and
        nothing is plotted.
    """
    # Filter for object columns
    object_cols = df.select_dtypes(include=['object'])

    # Plotting and describe() both fail on a frame with no columns
    if object_cols.shape[1] == 0:
        print(f"No object columns to plot for {df_name}.")
        return

    # DataFrame.applymap is deprecated in favour of DataFrame.map; use
    # whichever this pandas version provides
    if hasattr(pd.DataFrame, 'map'):
        character_counts = object_cols.map(lambda x: len(str(x)))
    else:
        character_counts = object_cols.applymap(lambda x: len(str(x)))

    # Generate boxplot
    plt.figure(figsize=(10, 8))
    sns.boxplot(data=character_counts, color='hotpink', orient='h')
    plt.title(f'Number of Characters in Each Object Column - {df_name}')
    plt.ylabel('Column')
    plt.xlabel('Number of Characters')
    plt.grid(True)
    plt.show()

    # Statistics Table
    stats_table = character_counts.describe().transpose()
    print("\nDescriptive Statistics on Character Length:")
    print(stats_table)

In [14]:
#function for detecting upper outliers
def examine_text_outliers(series, n_std=2):
    """Return the entries of a string Series that are unusually long.

    An entry is an upper outlier when its character length exceeds
    ``mean + n_std * std`` of all lengths in the Series, using pandas'
    default ``Series.std``.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; it must support the ``.str`` accessor.
    n_std : float, default 2
        Number of standard deviations above the mean that defines the cutoff.

    Returns
    -------
    pandas.Series
        The subset of ``series`` whose length is above the cutoff, with the
        original index labels.
    """
    # Measure every string once and reuse the result
    lengths = series.str.len()

    # Calculate upper bound for outlier detection
    upper_bound = lengths.mean() + n_std * lengths.std()

    # Identify rows with string lengths above the upper bound
    return series[lengths > upper_bound]
