In [6]:
#standard library imports
from collections import Counter
import copy
import itertools
from operator import itemgetter
import os
import os.path
from pathlib import Path
from typing import NamedTuple
import glob
from os import access, R_OK
import re
from operator import attrgetter
import csv

#third party imports
from bertopic import BERTopic
import gensim
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import numpy as np
import pandas as pd
import sklearn.base
from sklearn.model_selection import ParameterGrid
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
import joblib
from zipfile import ZipFile

In [15]:
import google

In [None]:
# Phase identifier; interpolated into the Drive folder paths and the saved model file names.
PHASE_NUMBER = 1
# Short label for this run; used as the model sub-folder name and as a suffix in saved model file names.
PHASE_TITLE = 'congress'

def write_csv(df, csv_path: str):
  """Write `df` to `csv_path` without the index column.

  Args:
    df: Object exposing a pandas-style `to_csv(path, index=...)` method.
    csv_path: Destination path (str or os.PathLike); must end in ".csv".

  Raises:
    ValueError: If `csv_path` does not end with ".csv".
  """
  # os.fspath lets callers hand in pathlib.Path objects as well as plain strings.
  if not os.fspath(csv_path).endswith(".csv"):
    raise ValueError("Please provide valid csv name for topic csv")
  # The write must sit outside the guard; nested under the `raise` it would never run.
  df.to_csv(csv_path, index=False)

def given_a_topic_list_original_statements(dataframe ,topic_number: int, cluster_column_name: str = None):
    """Return the rows of `dataframe` whose cluster column equals `topic_number`.

    The cluster column is `cluster_column_name` when given, otherwise 'clusters'.
    """
    cluster_column = 'clusters' if cluster_column_name is None else cluster_column_name
    belongs_to_topic = dataframe[cluster_column] == topic_number
    return dataframe[belongs_to_topic]

def get_docs_from_df(dataframe, columns: list=None):
  """Flatten the given columns into one list of lower-cased, stripped strings.

  Columns are visited in the order given and their values appended in row
  order. When `columns` is None, only "combined_responses" is read.
  """
  selected_columns = ["combined_responses"] if columns is None else columns
  return [
    entry.lower().strip()
    for column_name in selected_columns
    for entry in dataframe[column_name].tolist()
  ]

def get_text_for_cluster(list_of_sentences):
    """Concatenate the sentences into one string, each preceded by a single space.

    The result therefore starts with a space whenever the input is non-empty,
    and is "" for an empty input.
    """
    return "".join(" " + sentence for sentence in list_of_sentences)

def given_cluster_filter_dataset(df, cluster_id: int):
    """Return only the rows of `df` whose 'clusters' value equals `cluster_id`."""
    in_cluster = df['clusters'] == cluster_id
    return df[in_cluster]

def _get_model_path_2(folder_path: str, file_name:str):
    return os.path.join(folder_path, "{}_Phase{}_{}.zip".format(file_name, PHASE_NUMBER, PHASE_TITLE))

def _get_csv_path(folder_path: str, file_name:str):
    return os.path.join(folder_path, "{}.csv".format(file_name))

In [None]:

#This path should be to a folder that will hold all of your models and csvs
PHASE_DIR = Path("/content/drive/MyDrive/tradoc/Phase_{}".format(PHASE_NUMBER))
assert PHASE_DIR.is_dir(), "Phase folder not found: {}".format(PHASE_DIR)
#This path is the folder inside of your PHASE_DIR for dataset inputs and outputs.
PHASE_DATASET_DIR = Path("/content/drive/MyDrive/tradoc/Phase_{}/Datasets".format(PHASE_NUMBER))
assert PHASE_DATASET_DIR.exists(), "Dataset folder not found: {}".format(PHASE_DATASET_DIR)
#These paths are the folder inside of your PHASE_DIR for saving and loading model files.
MODEL_SAVE_PATH = Path('/content/drive/MyDrive/tradoc/Phase_{}/Models/{}'.format(PHASE_NUMBER, PHASE_TITLE))
MODEL_LOAD_PATH = Path('/content/drive/MyDrive/tradoc/Phase_{}/Models/{}'.format(PHASE_NUMBER, PHASE_TITLE))

assert MODEL_SAVE_PATH.is_dir(), "Model save folder not found: {}".format(MODEL_SAVE_PATH)
assert MODEL_LOAD_PATH.is_dir(), "Model load folder not found: {}".format(MODEL_LOAD_PATH)
#These paths are the folder inside of your PHASE_DIR for saving and loading topic outputs;
#they currently resolve to the same folder as PHASE_DATASET_DIR.
TOPIC_SAVE_PATH = Path('/content/drive/MyDrive/tradoc/Phase_{}/Datasets'.format(PHASE_NUMBER))
TOPIC_LOAD_PATH = Path('/content/drive/MyDrive/tradoc/Phase_{}/Datasets/'.format(PHASE_NUMBER))
# Validate the topic folders themselves so a missing folder fails here, not at write time.
assert TOPIC_SAVE_PATH.is_dir(), "Topic save folder not found: {}".format(TOPIC_SAVE_PATH)
assert TOPIC_LOAD_PATH.is_dir(), "Topic load folder not found: {}".format(TOPIC_LOAD_PATH)

#DO NOT SET THIS TO FALSE
COMPUTE_PROBABILITIES=True

# Configure pandas display options for interactive inspection of DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 0)


In [None]:

#This csv holds data from InsightAI. If you are pulling from the DB instead, you can parse it with this notebook: https://drive.google.com/file/d/1nguZ3vEBPzrMvvrDkYVRGxeXSakT33LM/view?usp=sharing
# NOTE(review): this path is hard-coded to the Phase_7 folder while PHASE_NUMBER is 1 — confirm the input is intentionally taken from another phase.
PHASE_DATA_PATH = '/content/drive/MyDrive/tradoc/Phase_7/Datasets/tradoc-78-detailed-data-from-Sanjay-with-company.csv'


#Intended path for the csv holding the original statements and their associated topics (inside PHASE_DATASET_DIR).
# NOTE(review): no cell visible here writes to this path — confirm where it is used.
PHASE_DATA_WITH_CLUSTERS_PATH = str(PHASE_DATASET_DIR)  + "/" +'phase_{}_data_with_clusters_blockers.csv'.format(PHASE_NUMBER)

In [None]:

# Read the parsed responses with NA detection turned off, then blank out any
# NaN that still remains so every cell can be handled as text downstream.
phase_df = pd.read_csv(PHASE_DATA_PATH, na_filter=False).replace(np.nan, "")

In [None]:
# Columns whose text is joined into a single `combined_responses` column for one round of modeling.
# For example, to model `problems` and `background` together, list both here and their
# values will be concatenated row by row.
list_of_columns_to_model_individually = ['basic_need']

In [None]:
# Build one text field per row: cast the selected columns to str and join them
# with single spaces into `combined_responses`.
phase_df['combined_responses'] = (
    phase_df[list_of_columns_to_model_individually]
    .apply(lambda record: ' '.join(record.values.astype(str)), axis=1)
)


In [None]:
# Keep only the combined text column; this Series is the document collection for modeling.
X = phase_df['combined_responses']

In [None]:
# Standalone BERTopic instance: nr_topics='auto', n-gram range (1, 5), probabilities enabled.
# NOTE(review): the training function defined in this notebook builds its own models and
# does not use this instance — confirm whether a later cell relies on it.
topic_model = BERTopic(nr_topics='auto', n_gram_range=(1,5), calculate_probabilities=True)

In [None]:
def create_and_save_models_and_topics(
    data_df: pd.DataFrame,
    model_save_path: str,
    topic_save_path: str,
    col_name: str = "speech",
    calculate_probabilities: bool = True
):
  """Fit BERTopic models on one text column and persist each model and its topics.

  For every model name `topic_model_for<N>_speeches_orig_spell` with N in
  43..111, a fresh BERTopic model is fitted on `data_df[col_name]`. The model
  is saved under `model_save_path`, and a csv with the documents and their
  assigned topic (column 'clusters') is written under `topic_save_path`.

  Args:
    data_df: DataFrame holding the documents.
    model_save_path: Folder that receives the saved model files.
    topic_save_path: Folder that receives the per-model topic csv files.
    col_name: Name of the text column in `data_df` to model.
    calculate_probabilities: Forwarded to BERTopic's `calculate_probabilities`.

  Raises:
    ValueError: If `col_name` is not a column of `data_df`.

  A failure while building one model is reported and does not stop the
  remaining models (best-effort batch).
  """
  if col_name not in data_df.columns:
     raise ValueError("Please give a valid column name and/or dataframe")

  # Model the column the caller asked for, not whatever notebook global
  # happens to be defined when the function is called.
  docs = data_df[col_name].astype(str).tolist()

  model_names = [
    f"topic_model_for{congress_num}_speeches_orig_spell"
    for congress_num in range(43, 112)
  ]

  # NOTE(review): every iteration fits the same documents; the congress number
  # only changes the output file names. Presumably left over from per-congress
  # data — confirm whether a single model is what is actually wanted.
  for model_name in model_names:

    try:
      bt_model = BERTopic(
        nr_topics='auto',
        n_gram_range=(1,5),
        calculate_probabilities=calculate_probabilities,
      )
      # fit_transform returns the training assignments directly, avoiding a
      # second pass over the same documents via transform().
      topics, _probabilities = bt_model.fit_transform(docs)

      # Save into the requested folder rather than the current working directory.
      model_path = _get_model_path_2(model_save_path, model_name)
      bt_model.save(model_path)
      print('Saved model to {}'.format(model_path))

      # Persist each document with its topic id so results outlive the session.
      topics_path = _get_csv_path(topic_save_path, model_name)
      data_df[[col_name]].assign(clusters=list(topics)).to_csv(topics_path, index=False)
      print('Saved topics to {}'.format(topics_path))
    except Exception as e:
      # Deliberate best-effort: report the failing model and keep going.
      print('Failed to build {}: {!r}'.format(model_name, e))

In [None]:
# col_name is passed explicitly: the text prepared in this notebook lives in
# 'combined_responses', whereas the function's default column is 'speech',
# which no cell here creates.
create_and_save_models_and_topics(
    data_df=phase_df,
    model_save_path=MODEL_SAVE_PATH,
    topic_save_path=TOPIC_SAVE_PATH,
    col_name='combined_responses',
    calculate_probabilities=COMPUTE_PROBABILITIES,
)