In [None]:
import pandas as pd
import re
# Notebook-wide pandas display settings, applied in one pass so wide frames
# and long text cells are shown in full.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('max_colwidth', None),  # None lifts the per-cell character limit
):
    pd.set_option(_option, _value)

from google.cloud import bigquery

In [None]:
# Create a BigQuery client. No arguments are passed here, so the project and
# credentials are presumably resolved from the environment — confirm for your setup.
client = bigquery.Client()

# Select every column of the April 2020 GKG sample table.
query="""
SELECT * FROM goldenfleece.sentiment.gkg_apr_2020_sample 
"""
# Run the query and load the complete result set into a pandas DataFrame.
df = client.query(query).to_dataframe()

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

In [None]:
def pre_process_themes(df):
    """Turn the ``V2Themes`` column into a list of theme-name lists.

    Rows whose ``V2Themes`` value is missing are left out entirely, so the
    returned list can be shorter than ``df`` and its positions do not
    necessarily line up with the frame's index. The caller's frame is not
    modified.

    Args:
        df: DataFrame with a ``V2Themes`` column of ';'-separated entries.

    Returns:
        One list per remaining row, holding every non-empty entry with the
        text from its first ',' onward removed (per the regex ``,.*``).
    """
    themes_column = df.dropna(subset=["V2Themes"])["V2Themes"]

    corpus = []
    for raw_themes in themes_column.tolist():
        names = []
        for entry in raw_themes.split(';'):
            if not entry:
                # a trailing or doubled ';' yields empty pieces; skip them
                continue
            names.append(re.sub(r',.*', '', entry))
        corpus.append(names)
    return corpus

In [None]:
from gensim.corpora import Dictionary

# Tokenise the V2Themes column: one list of theme names per row that has themes
# (rows with a missing V2Themes value are dropped by pre_process_themes).
processed_corpus = pre_process_themes(df)

# Load the dictionary saved under the relative path "dictionary".
# NOTE(review): presumably the same dictionary the LDA model was trained with, so
# token ids line up with the model — confirm. gensim's load() unpickles the file,
# so only load artifacts from a trusted source.
dictionary = Dictionary.load("dictionary")

# Quick size check of the vocabulary and of the corpus being labelled.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(processed_corpus))

In [None]:
# Encode every tokenised document with the loaded dictionary; doc2bow yields
# the sparse bag-of-words form of each document.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

# Sanity check: show the encoded form of the documents at positions 0 and 4.
print(f'Bag-of-words representation of the first document: {bow_corpus[0]}')
print(f'Bag-of-words representation of the fifth document: {bow_corpus[4]}')

In [None]:
from gensim.models import LdaMulticore

# Load the pre-trained LDA model saved under the relative path "lda_model".
# NOTE(review): gensim's load() unpickles the file, so only load artifacts from a trusted source.
model = LdaMulticore.load("lda_model")

In [None]:
def predict(model, bow_corpus):
    """Label every document with its most probable topic.

    Args:
        model: Object indexable by a corpus (``model[bow_corpus]``) that yields,
            per document, a sequence of ``(topic_id, probability)`` pairs.
        bow_corpus: Corpus passed to ``model`` as-is.

    Returns:
        A ``pd.Series`` named ``"topic"`` keyed by document position
        (0..n-1). Each value is the highest-probability topic id plus one;
        ties keep the pair the model listed first.
    """
    def _dominant_label(topic_probs):
        # Rank by probability, highest first; the sort is stable, so equal
        # probabilities stay in the order the model produced them.
        ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)
        return ranked[0][0] + 1

    labels = {
        position: _dominant_label(topic_probs)
        for position, topic_probs in enumerate(model[bow_corpus])
    }
    return pd.Series(labels, name="topic")

# `bow_corpus` comes from pre_process_themes(df), which drops rows with a missing
# V2Themes value, and predict() keys its labels by document position (0..n-1).
# Rebuild that same filtered view with a fresh positional index so every topic is
# attached to the row it was computed from. Merging onto the raw `df` index would
# pair topics with the wrong rows as soon as any V2Themes value is null, and the
# length check below would still pass.
themed_df = df.dropna(subset=["V2Themes"]).reset_index(drop=True)

topic = predict(model, bow_corpus)
assert len(topic) == len(themed_df), "expected exactly one predicted topic per document with themes"

topic_df = themed_df.merge(topic, left_index=True, right_index=True)
assert len(topic) == len(topic_df)

In [None]:
# Summarise the labelled frame; memory_usage="deep" asks pandas to measure the
# actual size of object columns instead of estimating it.
topic_df.info(memory_usage="deep")
# Disabled export step: parse DATE as datetime and write the labelled sample to CSV.
# topic_df["DATE"] = pd.to_datetime(topic_df["DATE"])
# topic_df.to_csv("../data/gkg_apr_2020_sample_topic.csv", index=False)