In [3]:
import json
import logging
import io
import boto3
import pandas as pd
import openpyxl
import spacy
import sklearn
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def s3_event_info(event):
    '''
    Extracts the S3 bucket name and file key from a json created by an SNS trigger in response to a file being uploaded.
    
    Parameters: 
        event (json): a json file created from an SNS trigger. This event is created when a file is uploaded to the bucket to which this lambda will be applied. 
    
    Returns: 
        bucket_name (str): The s3 bucket name to which the file is uploaded
        file_key_attribute (str): The file key directing to the location of the file within the s3 bucket.
    '''
    
    # Get the details of the uploaded file from the event
    messagejson = json.loads(event['Records'][0]['Sns']['Message'])
    logging.info("Event has been loaded in.")
    
    # Extract the bucket key
    bucket_name = messagejson['Records'][0]['s3']['bucket']['name']
    if not isinstance(bucket_name, str):
        raise TypeError("The bucket_name grabbed was not a string")
    logging.info(f"The bucket name is: {bucket_name}")
    
    file_key_attribute = messagejson['Records'][0]['s3']['object']['key']
    if not isinstance(file_key_attribute, str):
        raise TypeError("The file_key grabbed was not a string")
    logging.info(f"The file key is: {file_key_attribute}")
    
    return [bucket_name, file_key_attribute]

def file_downloader(bucket_name, file_key_attribute):
    """
    Download a file from an S3 bucket.

    Parameters:
    bucket_name (str): The name of the S3 bucket.
    file_key_attribute (str): The key of the file in the S3 bucket.

    Returns:
    body (bytes): The content of the file.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket_name, Key=file_key_attribute)
    body = obj['Body'].read()
    return body

def sheet_name_grabber(iofile):
    """
    Extract sheet names from an Excel file.

    Parameters:
    iofile (bytes): The content of the Excel file.

    Returns:
    attribute_ex (openpyxl.workbook.workbook.Workbook): The Workbook object.
    sheet_names_attribute (list): A list of sheet names.
    """
    attribute_ex = openpyxl.load_workbook(io.BytesIO(iofile), data_only=True)
    sheet_names_attribute = attribute_ex.sheetnames
    return attribute_ex, sheet_names_attribute

def df_maker(workbook, sheetnamelist):
    """
    Convert each sheet in a Workbook into a DataFrame.

    Parameters:
    workbook (openpyxl.workbook.workbook.Workbook): The Workbook object.
    sheetnamelist (list): A list of sheet names.

    Returns:
    dflist (list): A list of DataFrames, each DataFrame corresponding to a sheet in the Workbook.
    """
    dflist = []
    for sheet_name in sheetnamelist:
        if 'Table' in sheet_name:
            ws = workbook[sheet_name]
            sheet_name = pd.DataFrame(ws.values)
            dflist.append(sheet_name)
    return dflist

def title_extracter(workbook, sheetnamelist):
    """
    Extract the title from each sheet in a Workbook.

    Parameters:
    workbook (openpyxl.workbook.workbook.Workbook): The Workbook object.
    sheetnamelist (list): A list of sheet names.

    Returns:
    titles (list): A list of titles.
    """
    titles = []
    for sheet_name in sheetnamelist:
        if 'Table' in sheet_name:
            ws = workbook[sheet_name]
            df = pd.DataFrame(ws.values)
            titles.append(df.iloc[0, 2])
    return titles

def df_header_fixer(df):
    """
    Fix the header of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame to fix.

    Returns:
    df (pandas.DataFrame): The DataFrame with fixed header.
    """
    df = df.rename(columns=df.iloc[2]).loc[3:]
    return df

def df_trunkater(df):
    """
    Remove empty rows in a DataFrame and reset its index.

    Parameters:
    df (pandas.DataFrame): The DataFrame to truncate.

    Returns:
    df (pandas.DataFrame): The DataFrame without empty rows and with reset index.
    """
    df = df[df['Attribute_Name'].notna()]
    df.index = np.arange(len(df))
    return df

def desc_filler(df):
    """
    Fill empty description cells in a DataFrame with corresponding attribute names.

    Parameters:
    df (pandas.DataFrame): The DataFrame to fill.

    Returns:
    df (pandas.DataFrame): The DataFrame with filled description cells.
    """
    df = df[df['Attribute_Name'].notna()]
    df['Description'].fillna(df['Attribute_Name'], inplace=True)
    return df


In [4]:
'''The following functions are used for the clusterer'''


def km_clusterer(X, cluster_count):
    '''
    Peform K-means clustering on the input data and finds the optimal number of clusters using silhouette scores.
    
    Parameters:
        X (vector): The input vector
        cluster_count (int): The initialized value for the optimal K.
        
    Returns:
        best_k (int): The optimal K value to maximize the silhouette score for K-means on the vector.
    '''
    # Define a range of k values to try
    k_values = range(2, 11)

    # Initialize lists to store silhouette scores and sample silhouette values
    silhouette_scores = []
    sample_silhouette_values = []

    # Iterate over different values of k
    for k in k_values:
        # Fit k-means clustering model
        kmeans = KMeans(n_clusters=k, n_init = 'auto', random_state=42)
        labels = kmeans.fit_predict(X)

        # Compute the silhouette score for this clustering
        silhouette_avg = silhouette_score(X, labels)
        silhouette_scores.append(silhouette_avg)

        # Compute the silhouette values for each sample
        sample_silhouette_values.append(silhouette_samples(X, labels))

    # Find the best k based on the maximum silhouette score
    best_k = k_values[np.argmax(silhouette_scores)]

    return best_k


def kmeansfunc(df, vec, cluster_count):
    '''
    Apply K-means clustering to the input dataframe and assign cluster label.
    
    Parameters:
        df (DataFrame): The input dataframe.
        vec (array-like): The input vector.
    
    Returns:
        DataFrame: The dataframe with cluster labels added.
    '''
    k = cluster_count
    kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)
    y_pred = kmeans.fit_predict(vec)
    df['y'] = y_pred

    return df


def tsne_transformer(vector):
    '''
    Perform t-SNE transformation on a vector
    
    Parameters:
        vector (array-like): The input vector.
    
    Returns:
        array-like: The transformed vector.
    '''
    tsne = TSNE(perplexity=20) #verbose=1, 
    vector = tsne.fit_transform(vector)

    return vector

def kmeans_plotter(df, vec):
    '''
    Create a scatterplot of the t-SNE'd data with labels
    
    Parameters:
        df (DataFrame): The input dataframe.
        vec (array-like): The input vector.
    '''
    X_embedded = vec
    y_pred = df['y']
    m = max(df['y'])+1
    # sns settings
    sns.set(rc={'figure.figsize': (13, 9)})

    # colors
    palette = sns.hls_palette(m, l=.4, s=.9)

    # plot
    sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_pred, legend='full', palette=palette,)  
    plt.title('t-SNE with Kmeans Labels')
    plt.savefig("improved_cluster_tsne.png")
    plt.show()
    
def vectorizer_lister(df, cluster_count):
    '''
    Create a list of vectorizers for a dataframe, one for each cluster
    
    Parameters:
        df (DataFrame): The input dataframe.
        cluster_count (int): The number of clusters.
    
    Returns:
        list: The list of vectorizers.
    '''
    vectorizers = []
    for ii in range(0, cluster_count):
        vectorizers.append(TfidfVectorizer(lowercase=True, stop_words='english', ngram_range = (1, 4)))

    return vectorizers

def data_vectorizer(df, vectorizers):

    '''
    Vectorize the data in each dataframe
    
    Parameters:
        df (DataFrame): The input dataframe.
        vectorizers (list): The list of vectorizers.
    
    Returns:
        list: The list of vectorized data.
    '''
    vectorized_data = []
    for current_cluster, cvec in enumerate(vectorizers):
        vectorized_data.append(cvec.fit_transform(df.loc[df['y'] == current_cluster, 'Cleaned_Text']))

    return vectorized_data

def ld_allocater(df, cluster_count):
    '''
    Create a list of latent Dirichlet allocation models, one for each cluster
    
    Parameters:
        df (DataFrame): The input dataframe.
        cluster_count (int): The number of clusters.
    
    Returns:
        list: The list of latent Dirichlet allocation models.
    '''

    NUM_TOPICS_PER_CLUSTER = 1
    lda_models = []
        
    for ii in range(0, cluster_count):
        lda = LatentDirichletAllocation(n_components=NUM_TOPICS_PER_CLUSTER, max_iter=10, learning_method='online', verbose=False, random_state=42)
        lda_models.append(lda)
    return lda_models

def lda_clusterer(lda_models, vectorized_data):
    '''
    Apply LDA clustering to vectorized data.

    Parameters:
        lda_models (list): The list of Latent Dirichlet Allocation models.
        vectorized_data (list): The list of vectorized data.

    Returns:
        list: The list of LDA-clustered data.
    '''
    clusters_lda_data = []

    for current_cluster, lda in enumerate(lda_models):
        if vectorized_data[current_cluster] is not None:
            clusters_lda_data.append(lda.fit_transform(vectorized_data[current_cluster]))
    return clusters_lda_data

def lda_scorer(lda_models, vectorized_data):
    '''
    Calculates the scores for LDA models based on the vectorized data.

    Parameters:
        lda_models (list): List of LDA models.
        vectorized_data (list): List of vectorized data.

    Returns:
        list: List of scores for each LDA model.

    '''
    scores_lda_data = []

    for current_score, lda in enumerate(lda_models):
        if vectorized_data[current_score] is not None:
            scores_lda_data.append(lda.score(vectorized_data[current_score]))
    
    return scores_lda_data



def selected_topics(model, vectorizer, top_n=50):
    '''
    Function for printing keywords for each topic.

    Parameters:        
        model: The LDA model.
        vectorizer: The vectorizer.
        top_n (int): The number of top keywords to return for each topic.

    Returns:
        list: The list of top keywords for each topic.
    '''
    current_words = []
    keywords = []
    scores = []

    for idx, topic in enumerate(model.components_):
        words = [(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]] #returns the words corresponding to the top_n scores for each topic
            
        for word in words:
            if word[0] not in current_words:  #Unpacks the vector to extract the most relevant tags
                keywords.append(word)
                current_words.append(word[0]) #avoid adding repeat terms by keeping a second list with only the words

    keywords.sort(key=lambda x: x[1])
    keywords.reverse()
    return_values = []
    for ii in keywords:
        return_values.append(ii)

    return return_values


def keyword_lister(lda_models, vectorized_data, vectorizers):
    '''
    Compile keywords for each table.

    Parameters:
        lda_models (list): The list of LDA models.
        vectorized_data (list): The list of vectorized data.
        vectorizers (list): The list of vectorizers.

    Returns:
        list: The list of keywords for each table.
    '''
    all_keywords = []
    for current_vectorizer, lda in enumerate(lda_models):
        if vectorized_data[current_vectorizer] is not None:
            all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
    termslist = []
    scoreslist = []
    
    return all_keywords

def unpack_tags(df):
    '''
    Unpacks the tags in a dataframe by creating a new dataframe with each tag as a separate row.

    Parameters:
        df (DataFrame): The input dataframe containing 'Table_Number' and 'Tags' columns.

    Returns:
        DataFrame: The unpacked dataframe with 'Table_Number' and 'Tags' columns.

    '''
    # Create empty lists to store the unpacked tags and table numbers
    unpacked_tags = []
    table_numbers = []

    # Iterate through each row in the dataframe
    for _, row in df.iterrows():
        tags = row['Tags']
        table_number = row['Table_Number']

        # Iterate through each sublist of tags
        for sublist in tags:
            # Extend the unpacked_tags list with each tag from the sublist
            unpacked_tags.extend(sublist)

            # Extend the table_numbers list with the corresponding table number
            table_numbers.extend([table_number] * len(sublist))
    
    # Create a new dataframe with columns 'Table_Number' and 'Tags' containing the unpacked tags
    df_unpacked = pd.DataFrame({'Table_Number': table_numbers, 'Tags': unpacked_tags})
    return df_unpacked


def minmaxscaler(df):
    '''
    Scales the scores in the dataframe using min-max scaling.

    Parameters:
        df (DataFrame): The input dataframe with a 'Score' column.

    Returns:
        DataFrame: The dataframe with the scaled 'Score' column.

    '''
    if 'Scores' in df.columns:
        df['Score'] = df['Scores']
        df = df.drop('Scores', axis=1)
    
    df['Score'] = df['Score'] / max(df['Score'])
    return df


def dfcombiner(df1, df2):
    '''
    Combines two dataframes based on the 'Tags' column and calculates a final score.

    Parameters:
        df1 (DataFrame): The first input dataframe.
        df2 (DataFrame): The second input dataframe.

    Returns:
        DataFrame: The combined dataframe with a calculated 'Final_Score' column.

    '''
    df = pd.merge(df1, df2, on='Tags', how='outer')

    min_score_y = df['Score_y'].min()
    min_score_x = df['Score_x'].min()

    df['Score_y'].fillna(min_score_y, inplace=True)
    df['Score_x'].fillna(min_score_x, inplace=True)

    df['Final_Score'] = (df['Score_y'] + df['Score_x']) / 2

    df.drop('Score_x', axis=1, inplace=True)
    df.drop('Score_y', axis=1, inplace=True)

    df['Table_Number'] = radcom_df['Table_Number_y'].combine_first(df['Table_Number_x'])
    df.drop('Table_Number_x', axis=1, inplace=True)
    df.drop('Table_Number_y', axis=1, inplace=True)

    df = df.sort_values(by='Final_Score', ascending=False)
    
    return df


In [5]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.base import TransformerMixin
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer


'''These functions are used to create word vectors for the data, including tf-idf vectorization'''

def reference_grabber():
    """Retrieve and process data from CSV files based on file keys.

    Reads file keys from 'file_keys_refs.json' to download and process CSV files
    from a specified bucket. The CSV files contain abbreviation and definition data.
    
    Returns:
        tuple: A tuple containing two pandas DataFrames.
            - The first DataFrame (abbdf) contains the abbreviation data with columns 'Abbreviation' and 'Term'.
            - The second DataFrame (defsdf) contains the definition data with columns 'Term' and 'Definition'.
    """

    # Load file keys from 'file_keys_refs.json'
    file_name = 'file_keys_refs.json'
    with open(file_name, 'r') as f:
        file_keys = json.load(f)

    # Extract bucket name and file keys for abbreviation and definition data
    bucket_name = file_keys['bucket_name']
    abb_key = file_keys['file_key_abb']
    defs_key = file_keys['file_key_defs']

    # Download and process abbreviation data
    abbdf = file_downloader(bucket_name, abb_key)
    abbdf = pd.read_csv(io.BytesIO(abbdf))
    abbdf = abbdf.drop('Unnamed: 0', axis=1)

    # Download and process definition data
    defsdf = file_downloader(bucket_name, defs_key)
    defsdf = pd.read_csv(io.BytesIO(defsdf))
    defsdf = defsdf.drop('Unnamed: 0', axis=1)

    # Convert abbreviation and term data to lowercase
    abbdf['Abbreviation'] = abbdf['Abbreviation'].str.lower()
    defsdf['Term'] = defsdf['Term'].str.lower()

    return abbdf, defsdf

# Create our list of stopwords
nlp = spacy.load("en_core_web_md")

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Basic function to clean the text
def clean_text(text):
    '''
    Remove spaces and convert text into lowercase
    
    Parameters:
        text (str): The input text to be cleaned.
    
    Returns:
        str: The cleaned text.
    '''
    return text.strip().lower()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    '''
    Tokenize a sentence using spaCy, performing lemmatization, lowercase conversion, and removing stop words.
    
    Parameters:
        sentence (str): The input sentence to be tokenized.
    
    Returns:
        list: The list of tokens.
    '''
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)
    
    # Lemmatizing each token and converting each token into lowercase
    for word in mytokens:
        if word.lemma_ != "-PRON-":
            word.lemma_.lower().strip()
        else:
            word = word.lower_
    
    # Removing stop words and punctuation
    mytokens = [word for word in mytokens if word not in STOP_WORDS and not word.is_punct]

    # Return preprocessed list of tokens
    return mytokens

# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        '''
        Transform the input X by cleaning the text.
        
        Parameters:
            X (list): The input list of texts to be transformed.
        
        Returns:
            list: The transformed list of cleaned texts.
        '''
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        '''
        Fit the transformer.
        
        Parameters:
            X (list): The input list of texts.
            y: Ignored parameter.
        
        Returns:
            self: The fitted transformer object.
        '''
        return self

    def get_params(self, deep=True):
        '''
        Get parameters for the transformer.
        
        Parameters:
            deep: Ignored parameter.
        
        Returns:
            dict: The parameters of the transformer.
        '''
        return {}

def cleaner(df):
    '''
    Clean the 'Description' column in the dataframe by applying the clean_text function.
    
    Parameters:
        df (DataFrame): The input dataframe.
    
    Returns:
        DataFrame: The dataframe with the 'Cleaned_Text' column added.
    '''
    df['Cleaned_Text'] = df['Description'].apply(clean_text)
    return df

tfidf_vector = sklearn.feature_extraction.text.TfidfVectorizer(max_df = 0.9, ngram_range = (1,4), stop_words = 'english')

def vectorizer(df):
    '''
    Vectorize the 'Cleaned_Text' column of the dataframe using TF-IDF vectorization.
    
    Parameters:
        df (DataFrame): The input dataframe.
    
    Returns:
        array-like: The vectorized data.
    '''
    X = df['Cleaned_Text']
    X = tfidf_vector.fit_transform(X)
    feature_names = tfidf_vector.get_feature_names_out()
    return X, feature_names

def chunkvectorizer(df):
    '''
    Vectorize the 'Chunks' column of the dataframe using TF-IDF vectorization.
    
    Parameters:
        df (DataFrame): The input dataframe.
    
    Returns:
        array-like: The vectorized data.
    '''
    doctext = df['Cleaned_Text'].apply(nlp)
    noun_phrases = []
    for i in doctext:
        current_list = []
        for chunk in i.noun_chunks:
            current_list.append(chunk)
        noun_phrases.append(current_list)
    df['Chunks'] = noun_phrases
    tsl = []
    noun_phrases = df['Chunks']
    for i in noun_phrases:
        token_strings = [token.text for token in i]
        tsl.append(token_strings)
    chunklist = []
    
    for lisst in tsl:
        for chunk in lisst:
            chunklist.append(chunk)
    print(chunklist)
    vectorizertest = TfidfVectorizer()
    X = vectorizertest.fit_transform(chunklist)    
    return X

def pcareducer(X):
    '''
    Reduce the dimensionality of the input data using PCA.
    
    Parameters:
        X (array-like): The input data to be reduced.
    
    Returns:
        array-like: The reduced data.
    '''
    pca = PCA(n_components=0.95, random_state=42)
    X_reduced = pca.fit_transform(X.toarray())
    return X_reduced

def vectortodf(X, feature_names):
    '''
    Find the words with TF-IDF values >= threshold from the vectorized data.
    
    Parameters:
        X (array-like): The vectorized data.
    
    Returns:
        DataFrame: The dataframe containing the words and their TF-IDF scores.
    '''
    goodwords = []
    goodvals = []
    
    for row in range(X.shape[0]):  # Iterate over each row in the matrix
        for col in X[row, :].nonzero()[1]:  # Get the indices of non-zero elements in the row
            word = feature_names[col]
            
            if word not in goodwords:
                goodwords.append(word)
                goodvals.append(X[row, col])
                  
    tagsdf = pd.DataFrame(data={'Tags': goodwords, 'Score': goodvals})
  
    return tagsdf

def cosvectortodf(X, feature_names):
    '''
    Converts a cosine similarity matrix and feature names to a DataFrame.

    Parameters:
        X (numpy.ndarray): The cosine similarity matrix.
        feature_names (list): List of feature names.

    Returns:
        DataFrame: The DataFrame with 'Tags' and 'Score' columns.

    '''
    X = X.transpose()
    df = pd.DataFrame({'Tags': feature_names})

    similarity_scores = cosine_similarity(X)

    scores = similarity_scores.sum(axis=1)

    df = pd.DataFrame({'Tags': feature_names, 'Score': scores})

    df = df.sort_values(by='Score', ascending=False)

    df = df.reset_index(drop=True)
    return df

def dflistconcat(dflist):
    '''
    Concatenates a list of DataFrames into a single DataFrame.

    Parameters:
        dflist (list): List of DataFrames.

    Returns:
        DataFrame: The concatenated DataFrame.

    '''
    Tablenum = 1
    for i in dflist:
        i['Table_Number'] = Tablenum
        Tablenum += 1
    df = pd.concat(dflist)
    cols = list(df.columns)
    cols.reverse()
    df = df[cols]
    return df

def eliminate_contained_words(df):
    '''
    Eliminates rows containing tags that are subsets of other tags within each group.

    Parameters:
        df (DataFrame): The input DataFrame with 'Tags' column.

    Returns:
        DataFrame: The filtered DataFrame without contained words.

    '''
    def eliminate_within_group(group):
        tags = group['Tags'].tolist()
        indices_to_remove = []

        for i, tag in enumerate(tags):
            if any(tag in other_tag for j, other_tag in enumerate(tags) if j != i):
                indices_to_remove.append(i)

        return group.drop(group.index[indices_to_remove])

    if 'Table_Number' in df.columns:
        df_filtered = df.groupby('Table_Number').apply(eliminate_within_group).reset_index(drop=True)
    if 'Table_Name' in df.columns:
        df_filtered = df.groupby('Table_Name').apply(eliminate_within_group).reset_index(drop=True)

    return df_filtered

def termchecker(df, abbdic, termdic):
    '''
    Adjusts the scores in the DataFrame based on matching tags with abbreviation and term dictionaries.

    Parameters:
        df (DataFrame): The input DataFrame with 'Tags' and 'Score' columns.
        abbdic (DataFrame): The abbreviation dictionary with 'Abbreviation' column.
        termdic (DataFrame): The term dictionary with 'Term' column.

    Returns:
        DataFrame: The DataFrame with adjusted scores.

    '''
    multiplier = max(df['Score'] / df['Score'].mean())
    df['Score'] = df.apply(lambda row: row['Score'] * multiplier if row['Tags'] in abbdic['Abbreviation'].values else row['Score'], axis=1)
    df['Score'] = df.apply(lambda row: row['Score'] * multiplier if row['Tags'] in termdic['Term'].values else row['Score'], axis=1)

    return df

def topscorefilter(df, numberoftags):
    '''
    Filters the DataFrame to retain the top tags based on the specified number of tags.

    Parameters:
        df (DataFrame): The input DataFrame with 'Score' column.
        numberoftags (int): The number of top tags to retain.

    Returns:
        DataFrame: The filtered DataFrame with the top tags.

    '''
    sort_column = 'Ensemble_Score' if 'Ensemble_Score' in df.columns else 'Score'
    group_column = 'Table_Name' if 'Table_Name' in df.columns else 'Table_Number'

    def filter_within_group(group):
        return group.nlargest(numberoftags, sort_column)

    df_filtered = df.groupby(group_column).apply(filter_within_group)

    df_filtered = df_filtered.reset_index(drop=True)

    return df_filtered

def replace_table_number_with_title(df, titles_list):
    '''
    Replace the values in the "Table_Number" column of the DataFrame with the corresponding titles from the titles list.

    Parameters:
        df (DataFrame): The DataFrame containing the "Table_Number" column.
        titles_list (list): The list of titles corresponding to the table numbers.

    Returns:
        DataFrame: The modified DataFrame with the "Table_Number" column replaced by titles.
    '''
    df['Table_Name'] = df['Table_Number'].map(lambda x: titles_list[int(x) - 1])
    df.drop('Table_Number', axis = 1, inplace=True)
    return df

def quanttrans(df):
    '''
    Apply quantile transformation to the 'Score' column of the input DataFrame.

    This function uses QuantileTransformer from scikit-learn to perform quantile transformation,
    which maps the data to a normal distribution. The number of quantiles used is determined by
    the length of the DataFrame divided by 4.

    Note: This function modifies the original DataFrame by replacing the 'Score' column.

    Parameters:
        df (pandas.DataFrame): The input DataFrame containing the 'Score' column to be transformed.

    Returns:
        pandas.DataFrame: The DataFrame with the 'Score' column quantile-transformed.
    '''
    qt = QuantileTransformer(n_quantiles = int(len(df)/4), random_state=42, output_distribution='normal')
    X = df['Score'].values.reshape(-1, 1)  # Reshape to a 2D array
    X = qt.fit_transform(X)
    df['Score'] = X
    return df

def get_characters_before_slash(input_string):
    """
    Get all the characters in a string before the first '/' character.
    This is used to find the folder name that a file was downloaded from so that it can be catalogged in the database
    
    Parameters:
        input_string (str): The input string from which characters will be extracted.

    Returns:
        str: The substring of the input string from the beginning up to, but not including,
             the first occurrence of the '/' character. If the '/' character is not present,
             the entire input string is returned.
    """
    index = input_string.find('/')
    if index != -1:
        return input_string[:index]
    else:
        return input_string

In [6]:
def import_pipeline(bucket, file_key):
    '''
    Pipeline the import process, beginning with an S3 file location and eventually outputting a list of dataframes, one for each table in the spreadsheet.
    
    Parameters:
        bucket (str): The S3 bucket name.
        file_key (str): The file key directing to the location of the file within the S3 bucket.
    
    Returns:
        list: The list of dataframes, one for each table in the spreadsheet.
    '''
    body = file_downloader(bucket, file_key)
    workbook, sheet_names = sheet_name_grabber(body)
    dflist = df_maker(workbook, sheet_names)
    titles = title_extracter(workbook, sheet_names)
    return dflist, titles

def import_pipeline_local(local_file_location):
    '''
    Pipeline the import process, beginning with an S3 file location and eventually outputting a list of dataframes, one for each table in the spreadsheet.
    
    Parameters:
        bucket (str): The S3 bucket name.
        file_key (str): The file key directing to the location of the file within the S3 bucket.
    
    Returns:
        list: The list of dataframes, one for each table in the spreadsheet.
    '''
    body = local_file_location
    workbook, sheet_names = sheet_name_grabber(body)
    dflist = df_maker(workbook, sheet_names)
    titles = title_extracter(workbook, sheet_names)
    return dflist, titles

def pipeline(dataframelist, functions):
    '''
    Apply a list of functions to each dataframe in the input list.
    
    Parameters:
        dataframelist (list): The list of dataframes.
        functions (list): The list of functions to be applied to each dataframe.
    
    Returns:
        list: The list of processed dataframes.
    '''
    results = []
    for df in dataframelist:
        for func in functions:
            df = func(df)  # Apply each function in sequence
        results.append(df)  # Add each processed dataframe to the list of processed dataframes
    return results  # Return the list of processed dataframes

def cluster_pipeline(dflist, vectorlist, cluster_count):

    '''
    Run the pipeline to perform clustering and obtain tags for each dataframe.
    
    Parameters:
        dflist (list): The list of dataframes.
        vectorlist (list): The list of vectorized data.
        cluster_count (int): The initialized number of clusters.
    
    Returns:
        DataFrame: The dataframe with tags for each table.
    '''

    #Find the optimal number of clusters for the model, effectively creating our number of topics
    clustervectorlist = vectorlist.copy()
    
    for i in range(len(vectorlist)):
        clustervectorlist[i] = pcareducer(clustervectorlist[i])
    
    cluster_count = []
    for i in range(len(vectorlist)):
        cluster_count.append(km_clusterer(clustervectorlist[i], cluster_count))

    # Perform K-Means clustering on each dataframe
    for i in range(len(dflist)):
        dflist[i] = kmeansfunc(dflist[i], clustervectorlist[i], cluster_count[i])

    # Perform t-SNE transformation on each vector
    for i in range(len(clustervectorlist)):
        clustervectorlist[i] = tsne_transformer(clustervectorlist[i])
  
    # Create lists of vectorizers
    vectorizers = []
    for i in range(len(dflist)):
        vectorizers.append(vectorizer_lister(dflist[i], cluster_count[i]))

    # Vectorize the data
    vectorized_data = []
    for i in range(len(dflist)):
        vectorized_data.append(data_vectorizer(dflist[i], vectorizers[i]))

    # Make LDA Models
    lda_models = []
    for i in range(len(dflist)):
        lda_models.append(ld_allocater(dflist[i], cluster_count[i]))

    # Make the LDA Clusters
    lda_clusters = []
    for i in range(len(lda_models)):
        lda_clusters.append(lda_clusterer(lda_models[i], vectorized_data[i]))
    
    #Test the scorer
    lda_scores = []    
    for i in range(len(lda_models)):
        lda_scores.append(lda_scorer(lda_models[i], vectorized_data[i])) 

    # Get the keywords
    keywords = []
    scorelist = []
    for i in range(len(lda_models)):
        keywords.append(keyword_lister(lda_models[i], vectorized_data[i], vectorizers[i]))
    tagslist = keywords

    # Make the Dataframes
    tablenums = []
    for i in range(len(dflist)):

        tablenums.append(i + 1)
  
    df = pd.DataFrame({'Table_Number': tablenums, 'Tags': keywords})
    
    # eliminate the duplicate keywords
    df = unpack_tags(df)
    df['Score'] = df['Tags'].apply(lambda x: x[1])
    df['Tags'] = df['Tags'].apply(lambda x: x[0])
    df = eliminate_contained_words(df)
    df = minmaxscaler(df)
    df = quanttrans(df)
 
    return df

def cos_pipeline(veclist, feature_names_list, abbdic, termdic):
    '''
    Performs the cosine similarity pipeline to create a DataFrame with adjusted scores.

    Parameters:
        veclist (list): List of cosine similarity matrices.
        feature_names_list (list): List of feature names for each cosine similarity matrix.
        abbdic (DataFrame): The abbreviation dictionary with 'Abbreviation' column.
        termdic (DataFrame): The term dictionary with 'Term' column.

    Returns:
        DataFrame: The processed DataFrame with adjusted scores.

    '''
    dflist = []
    for i in range(len(veclist)):
        dflist.append(cosvectortodf(veclist[i], feature_names_list[i]))
    df = dflistconcat(dflist) 
    df = termchecker(df, abbdic, termdic)
    df = eliminate_contained_words(df)
    df = minmaxscaler(df)
    df = quanttrans(df)

    return df

def tfidf_pipeline(veclist, feature_names_list, abbdic, termdic):
    '''
    Performs the TF-IDF pipeline to create a DataFrame with adjusted scores.

    Parameters:
        veclist (list): List of TF-IDF matrices.
        feature_names_list (list): List of feature names for each TF-IDF matrix.
        abbdic (DataFrame): The abbreviation dictionary with 'Abbreviation' column.
        termdic (DataFrame): The term dictionary with 'Term' column.

    Returns:
        DataFrame: The processed DataFrame with adjusted scores.

    '''
    dflist = []
    for i in range(len(veclist)):
        dflist.append(vectortodf(veclist[i], feature_names_list[i]))
    df = dflistconcat(dflist)
    df = termchecker(df, abbdic, termdic)
    df = eliminate_contained_words(df)
    df = minmaxscaler(df)
    df = quanttrans(df)
    
    return df

def generate_tagsdf(clustertagsdf, costagsdf, tfidftagsdf, titleslist, numberoftags): #keep this one
    """
    Generate the 'tagsdf' DataFrame by merging the three input DataFrames and applying filtering based on the ensemble score.

    Parameters:
        clustertagsdf (DataFrame): DataFrame containing cluster-based tags.
        costagsdf (DataFrame): DataFrame containing CO-based tags.
        tfidftagsdf (DataFrame): DataFrame containing TF-IDF-based tags.
        titleslist (list): List of titles corresponding to the table numbers.
        numberoftags (int): Number of top tags per table according to the ensemble score.

    Returns:
        DataFrame: The resulting 'tagsdf' DataFrame after merging and filtering.
    """
    # Merge the three dataframes based on 'Tags'
    merged_df = clustertagsdf.merge(costagsdf, on='Tags').merge(tfidftagsdf, on='Tags')

    # Create the ensemble score as the sum of the three scores
    merged_df['Ensemble_Score'] = merged_df['Score_x'] + merged_df['Score_y'] + merged_df['Score']
    
    # Replace 'Table_Number' with titles
    tagsdf = replace_table_number_with_title(merged_df, titleslist)

    # Drop unnecessary columns
    tagsdf = tagsdf.drop(merged_df.columns.difference(['Ensemble_Score', 'Table_Name', 'Tags']), axis=1)

    # Filter out contained words
    tagsdf = eliminate_contained_words(tagsdf)
    
    # Filter the top tags for each table according to the ensemble score
    tagsdf = topscorefilter(tagsdf, numberoftags)

    return tagsdf

def ensemble_pipeline(bucket, file_key, functions, vectorfuncs, cluster_count, numberoftags):

    '''
    Run the final pipeline to import, process, cluster, and obtain tags for each table in the spreadsheet.
    
    Parameters:
        bucket (str): The S3 bucket name.
        file_key (str): The file key directing to the location of the file within the S3 bucket.
        functions (list): The list of functions to be applied during the import process.
        vectorfuncs (list): The list of functions to be applied during the vectorization process.
        cluster_count (int): The number of clusters.
    
    Returns:
        DataFrame: The dataframe with tags for each table.
    '''
    data, titleslist = import_pipeline(bucket, file_key)
    dflist = pipeline(data, functions)
    abbdf, defsdf = reference_grabber()

    vectorizer_results = pipeline(dflist, vectorfuncs)

    # Extract the first values (vectorized_texts_list) and second values (feature_names_list)
    vectorized_texts_list = [x[0] for x in vectorizer_results]
    feature_names_list = [x[1] for x in vectorizer_results]
    del vectorizer_results
    
    tfidftagsdf = tfidf_pipeline(vectorized_texts_list, feature_names_list, abbdf, defsdf)
    clustertagsdf = cluster_pipeline(dflist, vectorized_texts_list, cluster_count)
    costagsdf = cos_pipeline(vectorized_texts_list, feature_names_list, abbdf, defsdf)
    
    # Create the final dataframe
    print('Ensemble Dataframe')
    tagsdf = generate_tagsdf(clustertagsdf, costagsdf, tfidftagsdf, titleslist, numberoftags)
    location = get_characters_before_slash(file_key)
    tagsdf['Location'] = location
    tagsdf['Bucket'] = bucket
    display(tagsdf)
    
    return tagsdf


In [7]:
'''Getting file keys and bucket name for S3 import from local json. These will serve as different example use cases.  In practice, these file keys will be extracted from the json created by an sns trigger. These were examples used during development.'''
file_name = 'file_keys.json'

with open(file_name, 'r') as f:
    file_keys = json.load(f)
    
bucket_name = file_keys['bucket_name']
file_key_radcom = file_keys['file_key_radcom']
file_key_boost = file_keys['file_key_boost']
file_key_eks = file_keys['file_key_eks']
file_key_prometheus = file_keys['file_key_prometheus']

In [8]:
'''Execute the example cases by running the final ensemble pipeline.  Ensemble pipeline takes in the parameters, and outputs a dataframe that, in practice, will be uploaded to an AuroraDB to store the information for future querying.'''
%%time
#Compile the list of functions for the different steps
functions = [df_header_fixer, df_trunkater, desc_filler, cleaner] 
vectorfuncs= [vectorizer]

print('Radcom')
radcom_df = ensemble_pipeline(bucket_name, file_key_radcom, functions, vectorfuncs, 1, 3)
print('Boost')
boost_df = ensemble_pipeline(bucket_name, file_key_boost, functions, vectorfuncs, 1, 3)
print('EKS')
eks_df = ensemble_pipeline(bucket_name, file_key_eks, functions, vectorfuncs, 1, 3)
print('Prometheus')
prometheus_df = ensemble_pipeline(bucket_name, file_key_prometheus, functions, vectorfuncs, 1, 3)


Radcom
Ensemble Dataframe


Unnamed: 0,Tags,Ensemble_Score,Table_Name,Location,Bucket
0,cscf,5.451571,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
1,impu,5.374294,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
2,origin realm,2.320212,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
3,traffic based ne configuration,1.363397,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
4,plane traffic based ne,1.340876,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
5,user plane traffic based,1.32203,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
6,tool tip,5.218656,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
7,stream id,1.053991,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
8,service operation,0.729475,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata
9,amf based ne configuration,0.885609,dish-5G.enterprise-core.probes.subscriber.cdr-...,radcom,dq-metadata


Boost
Ensemble Dataframe


Unnamed: 0,Tags,Ensemble_Score,Table_Name,Location,Bucket
0,activity transfer,1.574646,dish_retail_dl.att.att_data_cdr,BOOST,dq-metadata
1,session type data,1.200856,dish_retail_dl.att.att_data_cdr,BOOST,dq-metadata
2,indicates home space,1.004838,dish_retail_dl.att.att_data_cdr,BOOST,dq-metadata
3,field 00,4.314727,dish_retail_dl.att.att_voice_cdr,BOOST,dq-metadata
4,number billing systems provide,1.378006,dish_retail_dl.att.att_voice_cdr,BOOST,dq-metadata
5,phone number billing systems,1.336692,dish_retail_dl.att.att_voice_cdr,BOOST,dq-metadata


EKS
Ensemble Dataframe


Unnamed: 0,Tags,Ensemble_Score,Table_Name,Location,Bucket
0,container_memory_utilization,0.222829,dish_wireless.dp.container_insights,EKS,dq-metadata
1,node_diskio_io_serviced_async,-1.052498,dish_wireless.dp.container_insights,EKS,dq-metadata
2,json container features,-1.174894,dish_wireless.dp.container_insights,EKS,dq-metadata


Prometheus
Ensemble Dataframe


Unnamed: 0,Tags,Ensemble_Score,Table_Name,Location,Bucket
0,annotations converted prometheus labels,0.81366,dish-5G.wireless.prometheus-metrics,Metrics,dq-metadata
1,kubernetes annotations converted prometheus,0.802712,dish-5G.wireless.prometheus-metrics,Metrics,dq-metadata
2,kubernetes labels converted prometheus,0.798182,dish-5G.wireless.prometheus-metrics,Metrics,dq-metadata


CPU times: user 35.9 s, sys: 1.21 s, total: 37.1 s
Wall time: 30.3 s
