In [14]:
import string
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

punctuation = set(punctuation)



def process_text(text):
    '''Turn a free-text string into a list of cleaned, lowercase, lemmatized tokens.

    Steps: tokenize with NLTK's word_tokenize, discard English stopwords
    (compared in lowercase), discard every token that contains at least one
    punctuation character, then lowercase and lemmatize what is left.

    Parameters:
        text: the string to tokenize.

    Returns:
        list of str: surviving tokens in their original order (duplicates kept).
    '''
    # ASCII punctuation plus the left single curly quote.
    excluded_chars = set(string.punctuation) | {'‘'}

    raw_tokens = word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    wnl = WordNetLemmatizer()

    cleaned = []
    for raw in raw_tokens:
        lowered = raw.lower()
        if lowered in stop_words:
            continue
        # A token is rejected as a whole if any of its characters is punctuation.
        if not excluded_chars.isdisjoint(raw):
            continue
        cleaned.append(wnl.lemmatize(lowered))

    return cleaned

In [15]:
# process text with 2-token n-grams for language context
def process_text_grams(text):
    """Return the cleaned tokens of ``text`` followed by their bigrams.

    The single tokens come from ``process_text`` (tokenize, drop stopwords and
    punctuation-bearing tokens, lowercase, lemmatize). Each pair of consecutive
    cleaned tokens is then joined with a single space and appended after the
    single tokens.

    Parameters:
        text: the string to tokenize.

    Returns:
        list of str: all unigrams first, then all space-joined bigrams.
    """
    # Reuse the shared cleaning pipeline instead of duplicating it here.
    tokens = process_text(text)

    # Pairs are formed over the cleaned list, i.e. after stopword/punctuation
    # removal, so a bigram may span words that were not adjacent in the raw text.
    bigram_strings = [' '.join(pair) for pair in ngrams(tokens, 2)]

    return tokens + bigram_strings

In [16]:
# Define the text processing function for the drug labels specifically
def process_label_text(text):
    """Clean a drug-label text field into an ordered list of unique tokens.

    For string input: every ASCII punctuation character is replaced by a space,
    the result is tokenized with NLTK's word_tokenize, English stopwords are
    dropped (compared in lowercase), remaining tokens are lowercased and
    lemmatized, and repeated tokens are removed keeping the first occurrence.

    Parameters:
        text: the value to process; anything that is not a ``str`` is passed
            through untouched.

    Returns:
        list of str for string input, otherwise the original ``text`` value.
    """
    # Anything that is not a string is handed back unchanged.
    if not isinstance(text, str):
        return text

    # Map each punctuation character to a space in a single pass.
    blank_out = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    words = word_tokenize(text.translate(blank_out))

    stop_words = set(stopwords.words("english"))
    wnl = WordNetLemmatizer()
    lemmas = [
        wnl.lemmatize(word.lower())
        for word in words
        if word.lower() not in stop_words
    ]

    # dict keys keep insertion order, so this de-duplicates while preserving
    # the position of each token's first appearance.
    return list(dict.fromkeys(lemmas))


In [17]:
# Process drug label text with two-token n-grams for context
def process_label_text_grams(text):
    """Clean a drug-label text field into unique tokens plus their bigrams.

    For string input the unique, lemmatized tokens are obtained from
    ``process_label_text``; consecutive pairs of that list are then joined with
    a single space and appended after the single tokens.

    Parameters:
        text: the value to process; anything that is not a ``str`` is passed
            through untouched.

    Returns:
        list of str (unigrams first, then bigrams) for string input, otherwise
        the original ``text`` value.
    """
    # Anything that is not a string is handed back unchanged.
    if not isinstance(text, str):
        return text

    # Reuse the shared label-cleaning pipeline instead of duplicating it here.
    unique_tokens = process_label_text(text)

    # NOTE: pairs are built from the de-duplicated, stopword-free list, so a
    # bigram can join two tokens that were not adjacent in the original text.
    bigram_strings = [' '.join(pair) for pair in ngrams(unique_tokens, 2)]

    return unique_tokens + bigram_strings

In [18]:
def add_sequential_index(df, index_col_name):
    """
    Return a copy of ``df`` with a 1-based sequential index column placed first.

    The existing row index is discarded and replaced by a default 0..n-1 index,
    and a new leading column named ``index_col_name`` is filled with 1..n in
    the current row order. Because the numbering is generated from the row
    count rather than copied from the old index, it is gap-free even when
    ``df`` has been filtered, sorted, or carries a named / non-numeric index.

    Parameters:
        df: the pandas DataFrame to number; it is not modified.
        index_col_name: name of the new column.

    Returns:
        pandas.DataFrame: a new frame with the extra leading column.

    Raises:
        ValueError: if ``index_col_name`` is already a column of ``df``.
    """
    # reset_index returns a new frame, so the caller's frame stays untouched;
    # drop=True avoids materialising the old index as a column.
    df = df.reset_index(drop=True)

    # Number rows starting from 1 instead of 0.
    df.insert(0, index_col_name, range(1, len(df) + 1))

    return df

In [19]:
# Missing values to null for now (simplifies type conversions & plotting)
def na_to_null(df, column, na_value='N/A'):
    """Replace a textual missing-value placeholder in one column with NaN.

    Note that the column is reassigned on the frame that was passed in, so the
    caller's DataFrame is modified; the same object is also returned for
    convenience.

    Parameters:
        df: the pandas DataFrame to update (modified in place).
        column: name of the column to clean.
        na_value: the placeholder to turn into NaN; defaults to 'N/A'.

    Returns:
        pandas.DataFrame: the same ``df`` object that was passed in.
    """
    df[column] = df[column].replace(na_value, np.nan)
    return df

In [20]:
# Function to remove duplicates
def remove_duplicates(tokens):
    """Return the distinct items of ``tokens`` in first-occurrence order.

    Going through a plain ``set`` would yield an arbitrary order that can
    differ between kernel sessions for strings; using dict keys keeps the
    result deterministic and consistent with the order-preserving
    de-duplication done by the label text processors.

    Parameters:
        tokens: an iterable of hashable items.

    Returns:
        list: each distinct item once, ordered by first appearance.
    """
    return list(dict.fromkeys(tokens))

In [21]:
# Function to classify the product type
def classify_product_type(product_types):
    """Map a product-type value to a numeric class code.

    The check uses Python's ``in`` operator, so for a list of tokens it is an
    exact element match and for a plain string it is a substring match.
    'human otc' takes precedence when both labels are present.

    Parameters:
        product_types: a container of tokens (or a string) to inspect. Values
            that do not support ``in`` at all (e.g. None or a float NaN left
            behind for missing text) are treated as unclassified.

    Returns:
        int: 2 if 'human otc' is present, 1 if 'human prescription' is
        present, 0 otherwise.
    """
    try:
        if 'human otc' in product_types:
            return 2
        if 'human prescription' in product_types:
            return 1
    except TypeError:
        # Not a container (missing value): fall through to the default class
        # instead of aborting a whole column-wise apply.
        pass
    return 0