## PRELIMINARIES

In [None]:
import os
import string
import re
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.io as pio
pio.renderers.default='iframe'
import hdbscan
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Runtime configuration for the Hugging Face stack, passed through environment variables.
# NOTE(review): huggingface_hub documents HF_HUB_DOWNLOAD_TIMEOUT / HF_HUB_ETAG_TIMEOUT for
# network timeouts -- confirm that the installed version actually reads HUGGINGFACE_CO_TIMEOUT.
# NOTE(review): both variables are set after sentence_transformers has been imported above;
# any library that reads its settings at import time will not see them -- verify, and move
# these lines above the imports if needed.
os.environ["HUGGINGFACE_CO_TIMEOUT"] = "600"  # Increase timeout duration to 600 seconds
# Turn off the tokenizers library's internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Create Models

### Define a function that cleans the text column of each CSV file, so every document is normalized the same way before modeling.

In [None]:
# Characters stripped from every document: inverted ¡ ¿, guillemets « »,
# hyphens, and straight single/double quote marks.
_CLEAN_TEXT_DELETE_TABLE = str.maketrans("", "", "¡¿«»-'\"")


# Function to uniformly clean the text field of each csv file
def clean_text(text):
    """Normalize a single document before it is embedded and topic-modeled.

    The following characters are removed: ``¡ ¿ « » - ' "``. Every run of
    whitespace (spaces, tabs, newlines, carriage returns) is then collapsed
    into one space. Case and all other punctuation are left untouched, and
    the result is not stripped, so a leading/trailing whitespace run becomes
    a single space.

    Parameters
    ----------
    text : str or None
        Raw cell value. ``None`` and float NaN (what pandas produces for an
        empty CSV cell) are treated as an empty document instead of raising
        ``AttributeError``; any other non-string value is converted with
        ``str()`` first.

    Returns
    -------
    str
        The cleaned text (``""`` for missing input).
    """
    # `text != text` is only true for NaN, so this needs no numpy/pandas import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # One pass over the string instead of a chain of .replace() calls.
    text = text.translate(_CLEAN_TEXT_DELETE_TABLE)

    # Replace multiple whitespace characters (incl. line breaks) with a single space.
    return re.sub(r"\s+", " ", text)

### Define a function that fits and saves a topic model for a single CSV file (the loop over all files in a directory comes later)

In [None]:
def process_csv_file(
    csv_file_path,
    output_dir='./danish_regular_models/',
    embedding_model_path="./dfm-sentence-encoder-medium-2",
    stopwords_path='./danish_stopwords.txt',
):
    """Fit a BERTopic model on the ``text`` column of one CSV file and save it.

    Steps: read the CSV, clean each document with ``clean_text``, embed the
    documents with the sentence-transformer at ``embedding_model_path``, fit
    BERTopic (UMAP + HDBSCAN + BM25-weighted c-TF-IDF), refresh the topic
    representations with a stop-word-aware n-gram ``CountVectorizer``, then
    write the pickled model and the raw embeddings to ``output_dir``.

    The whole function lives in a single cell on purpose: a code cell that
    starts with indented lines cannot be executed on its own.

    Parameters
    ----------
    csv_file_path : str
        Path to a CSV file that contains a ``text`` column.
    output_dir : str, optional
        Directory that receives ``<name>_topic_model`` and
        ``<name>_embeddings.npy``; created if it does not exist.
    embedding_model_path : str, optional
        Path or name handed to ``SentenceTransformer``.
    stopwords_path : str, optional
        UTF-8 text file with one stop word per line.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the CSV file has no ``text`` column.
    """
    df = pd.read_csv(csv_file_path)

    if "text" not in df.columns:
        raise KeyError(f"'text' column not found in {csv_file_path}")

    # Empty CSV cells arrive as NaN; drop them, clean the rest, and discard
    # documents that are blank after cleaning so they are not embedded.
    df = df.dropna(subset=["text"])
    df = df.assign(text=df["text"].astype(str).apply(clean_text))
    df = df[df["text"].str.strip() != ""].reset_index(drop=True)
    print(df.columns)
    print(df.head(10))

    # BERTopic and SentenceTransformer are documented to take a list of strings.
    docs = df["text"].tolist()

    # Nothing left to model after cleaning.
    if not docs:
        print(f"No usable documents in {csv_file_path}; skipping.")
        return

    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
    hdbscan_model = HDBSCAN(min_cluster_size=7, prediction_data=True)
    umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    sentence_model = SentenceTransformer(embedding_model_path)

    # Embeddings are computed once here and passed to BERTopic so they can
    # also be saved alongside the model.
    embeddings = sentence_model.encode(docs, show_progress_bar=True)

    # NOTE: per the BERTopic documentation, `language` is not used when an
    # explicit `embedding_model` is given, and `min_topic_size` only
    # configures the default HDBSCAN -- with the custom `hdbscan_model` above,
    # `min_cluster_size=7` is the setting that takes effect.
    topic_model = BERTopic(
        language='danish',
        calculate_probabilities=True,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=sentence_model,
        top_n_words=10,
        min_topic_size=10,
        ctfidf_model=ctfidf_model,
        verbose=True
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)

    # The following lines can be uncommented or commented depending on what
    # one wants to achieve -- create non-stopword n-grams, apply an outlier
    # reduction step using thresholded c-TF-IDF or probabilities, etc.
    new_topics = topics

    # Use the "c-TF-IDF" strategy with a threshold
    # new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.1)

    # Reduce all outliers that are left with the "probabilities" strategy
    # new_topics = topic_model.reduce_outliers(docs, new_topics, strategy="probabilities", probabilities=probs)

    # Apply count vectorizer to remove stopwords and create n-grams in the model.
    # Explicit UTF-8 so Danish letters are read correctly regardless of the
    # platform's default encoding.
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        danish_stop_words = [line.strip() for line in f if line.strip()]
    vectorizer_model = CountVectorizer(stop_words=danish_stop_words, ngram_range=(1, 4), min_df=0.05)

    # Update topics with the new vectorizer_model (and any outlier reduction enabled above)
    topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)

    # Get the filename without the extension
    csv_filename = os.path.splitext(os.path.basename(csv_file_path))[0]

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Print first ten topics (a bare expression inside a function displays nothing)
    print(topic_model.get_topic_info().head(10))

    # Number of rows in the topic-frequency table
    num_topics = len(topic_model.get_topic_freq())

    # Save the model and embeddings
    print('The model has:', num_topics, "topics.")
    print(f'Saving topic model and embeddings as: {csv_filename}_topic_model and {csv_filename}_embeddings.npy in {output_dir} using pickle')
    # SECURITY: pickle files execute arbitrary code when loaded; only load
    # models from sources you trust. Safetensors alternative:
    # topic_model.save(os.path.join(output_dir, f'{csv_filename}_topic_model'), serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_path)
    topic_model.save(os.path.join(output_dir, f'{csv_filename}_topic_model'), serialization="pickle")
    np.save(os.path.join(output_dir, f'{csv_filename}_embeddings.npy'), embeddings)

    # Get the representative docs from the topic model and save as a dataframe -- use for safetensors or pytorch
    # rep_docs_df = topic_model.get_representative_docs()
    # rep_docs_df = pd.DataFrame(rep_docs_df)
    # rep_docs_df.to_csv(os.path.join(output_dir, f'{csv_filename}_repdocs.csv'), index=False)

### Iterate through the files for processing

In [None]:
# Directory containing the CSV files to model -- change this to point at your data
input_directory = './small'

# os.listdir returns entries in arbitrary order; sort them so every run
# processes the files in the same sequence, and skip anything that is not a
# regular file (e.g. a directory whose name happens to end in ".csv").
csv_files = sorted(
    file for file in os.listdir(input_directory)
    if file.endswith('.csv') and os.path.isfile(os.path.join(input_directory, file))
)

if not csv_files:
    print(f"No .csv files found in {input_directory}")

for csv_file in csv_files:
    csv_file_path = os.path.join(input_directory, csv_file)
    print(f"Now modeling: {csv_file}")
    process_csv_file(csv_file_path)