In [1]:
import os
import re
import gensim
import string
import nltk
import pickle
import numpy as np
import pandas as pd
from numpy import percentile
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import preprocessing_module

In [3]:
def _load_pickle(path):
    """Load and return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point this at artifacts produced by a trusted training run.
    with open(path, "rb") as f:
        return pickle.load(f)


def _read_reports(directory):
    """Read every regular file located directly inside ``directory``.

    Returns two parallel lists ``(file_names, contents)`` ordered by file name.
    """
    file_names = []
    contents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # "Document N" numbering of the final report reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as text.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as file:
            contents.append(file.read())
        file_names.append(filename)
    return file_names, contents


def _document_embedding(model, doc_tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one document."""
    vectors = [model.wv[token] for token in doc_tokens if token in model.wv]
    if not vectors:
        # np.mean([]) would give a scalar NaN, making the embedding list ragged
        # and crashing cosine_similarity. A zero vector keeps the shape valid;
        # scikit-learn scores an all-zero row as similarity 0, so the document
        # is reported as drift for any non-negative threshold.
        return np.zeros(model.wv.vector_size)
    return np.mean(vectors, axis=0)


def predict_data_drift(unseen_report, tfidf=False):
    """Report, per document, whether an unseen batch drifts from the training data.

    Every regular file in the ``unseen_report`` directory is read, preprocessed
    with ``preprocessing_module.preprocess_text`` and compared (cosine
    similarity) against the pickled training representation. A document is
    flagged as drift when its mean similarity to the training rows is less than
    or equal to the pickled threshold.

    Parameters
    ----------
    unseen_report : str
        Path of the directory holding the unseen documents.
    tfidf : bool, default False
        When equal to ``1``/``True`` the TF-IDF artifacts are used
        (``vectorizer.pkl``, ``train(tfidf).pkl``, ``threshold(tfidf).pkl``);
        otherwise the Word2Vec artifacts are used (``model.pkl``,
        ``train_embeddings(w2v).pkl``, ``threshold(w2v).pkl``). All artifact
        paths are relative to the current working directory. The previous
        default was the ``bool`` type object, which compares unequal to 1, so
        omitting the argument still selects Word2Vec.

    Returns
    -------
    list of str
        One formatted message per document, in file-name order. Empty when the
        directory contains no regular files.
    """
    file_names, content = _read_reports(unseen_report)
    if not content:
        # Nothing to score; cosine_similarity rejects zero-sample input.
        return []

    df_unseen = pd.DataFrame({'Filename': file_names, 'Content': content})

    # Apply the shared preprocessing to the 'Content' column.
    df_unseen = preprocessing_module.preprocess_text(df_unseen, 'Content')

    if tfidf == 1:
        print("This is TF-IDF Calculation")

        # Reuse the fitted vectorizer so the unseen documents share the
        # training vocabulary.
        vectorizer = _load_pickle("vectorizer.pkl")
        test = vectorizer.transform(df_unseen["Content"])

        train = _load_pickle("train(tfidf).pkl")
        threshold = _load_pickle("threshold(tfidf).pkl")

        similarity_matrix = cosine_similarity(train, test)
    else:
        print("This is Word2Vec Calculation")

        train_embeddings = _load_pickle("train_embeddings(w2v).pkl")
        threshold = _load_pickle("threshold(w2v).pkl")
        model = _load_pickle("model.pkl")

        # NOTE(review): this iterates each 'Content' cell token by token, so it
        # assumes preprocess_text yields a sequence of tokens per document; a
        # plain string would be iterated character by character - confirm.
        unseen_embeddings = np.vstack(
            [_document_embedding(model, doc_tokens) for doc_tokens in df_unseen["Content"]]
        )

        similarity_matrix = cosine_similarity(train_embeddings, unseen_embeddings)

    # Rows are training samples and columns are unseen documents, so averaging
    # over axis 0 gives one score per unseen document.
    mean_similarity = np.mean(similarity_matrix, axis=0)

    # Determine data drift and build the per-document messages.
    messages = []
    for doc_idx, similarity_score in enumerate(mean_similarity):
        is_drift = similarity_score <= threshold
        drift_status = "𝗗𝗮𝘁𝗮 𝗗𝗿𝗶𝗳𝘁" if is_drift else "No Data Drift"
        similarity_str = "{:}".format(similarity_score)
        messages.append(
            f"Document {doc_idx + 1:<4}: Mean Similarity = {similarity_str:<20} {drift_status}"
        )

    return messages