In [1]:
import pandas as pd
import re
# Widen pandas' display limits so large/wide frames are shown without clipping.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# None lifts the per-cell character limit (long theme strings stay readable).
pd.options.display.max_colwidth = None

from google.cloud import bigquery

In [2]:
# BigQuery client; no explicit project or credentials are passed here.
client = bigquery.Client()

# Select every column of the goldenfleece.sentiment.gkg_apr_2020_sample table.
query="""
SELECT * FROM goldenfleece.sentiment.gkg_apr_2020_sample 
"""
# Run the query and load the whole result set into a pandas DataFrame.
df = client.query(query).to_dataframe()

In [3]:
# Summarise the loaded frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924653 entries, 0 to 924652
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   GKGRECORDID                 924653 non-null  object
 1   DATE                        924653 non-null  object
 2   V2Themes                    924653 non-null  object
 3   TONE                        924653 non-null  object
 4   POSITIVE_SCORE              924653 non-null  object
 5   NEGATIVE_SCORE              924653 non-null  object
 6   POLARITY                    924653 non-null  object
 7   ACTIVITY_REFERENCE_DENSITY  924653 non-null  object
 8   SELF_REFERENCE_DENSITY      924653 non-null  object
 9   WORD_COUNT                  924653 non-null  object
dtypes: object(10)
memory usage: 70.5+ MB


In [4]:
def pre_process_themes(df):
    """Tokenise the ``V2Themes`` column into one list of theme names per row.

    Rows whose ``V2Themes`` value is missing are discarded first, so the
    returned list can be shorter than ``df`` and its positions then no longer
    match ``df``'s row positions. The input frame itself is not modified.

    Each remaining value is split on ``';'``; empty pieces are skipped, and in
    every kept piece the first ``','`` and whatever follows it are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``V2Themes`` column of ``';'``-separated strings.

    Returns
    -------
    list of list of str
        One token list per row that has a non-missing ``V2Themes`` value.
    """
    after_comma = re.compile(r',.*')
    with_themes = df.dropna(subset=["V2Themes"])

    corpus = []
    for raw_doc in with_themes["V2Themes"].tolist():
        tokens = []
        for piece in raw_doc.split(';'):
            if piece:
                tokens.append(after_comma.sub('', piece))
        corpus.append(tokens)
    return corpus

In [5]:
from gensim.corpora import Dictionary

# Tokenise V2Themes into one list of theme names per row (rows with a missing
# V2Themes value are dropped by pre_process_themes).
processed_corpus = pre_process_themes(df)

# Load the saved training dictionary from the file "dictionary"; the path is
# relative to the notebook's working directory.
# NOTE(review): presumably this is the dictionary the LDA model was trained with — confirm.
dictionary = Dictionary.load("dictionary")

# Report vocabulary size and number of tokenised documents.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(processed_corpus))

Number of unique tokens: 13376
Number of documents: 924653


In [6]:
# Encode every tokenised document as a bag-of-words via the loaded dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

# Spot-check the encoding on the documents at positions 0 and 4.
print(f'Bag-of-words representation of the first document: {bow_corpus[0]}')
print(f'Bag-of-words representation of the fifth document: {bow_corpus[4]}')

Bag-of-words representation of the first document: [(711, 2)]
Bag-of-words representation of the fifth document: [(57, 1), (86, 1), (87, 2), (115, 1), (120, 1), (127, 1), (135, 8), (137, 8), (139, 3), (178, 3), (202, 1), (282, 2), (284, 2), (319, 1), (321, 2), (350, 1), (357, 1), (434, 1), (438, 1), (447, 1), (569, 1), (676, 2), (713, 1), (933, 1), (1000, 3), (1134, 1), (1135, 1), (1136, 1), (1137, 1), (1263, 3), (1458, 3), (1772, 3), (1878, 1), (1925, 1), (3351, 1), (3368, 1), (3369, 1)]


In [7]:
from gensim.models import LdaMulticore

# Load the pre-trained LDA model from the file "lda_model" (path relative to
# the notebook's working directory) rather than training one here.
model = LdaMulticore.load("lda_model")

In [8]:
def predict(model, bow_corpus):
    """Label every document with its single most probable topic.

    Parameters
    ----------
    model
        Object for which ``model[bow_corpus]`` yields, per document, an
        iterable of ``(topic_id, probability)`` pairs.
    bow_corpus
        The documents in bag-of-words form, in the order they are to be
        labelled.

    Returns
    -------
    pandas.Series
        Series named ``"topic"``, indexed by document position (0, 1, 2, ...),
        holding ``topic_id + 1`` of the highest-probability pair.  When
        several pairs share the highest probability, the one listed first by
        the model wins.

    Raises
    ------
    IndexError
        If the model returns no ``(topic_id, probability)`` pair at all for a
        document; the message names the document's position.
    """
    result = {}
    for i, row in enumerate(model[bow_corpus]):
        # Only the top pair is needed, so take the maximum instead of sorting
        # the whole row. max() keeps the first maximal pair, which is the pair
        # a stable descending sort would place first.
        best = max(row, key=lambda pair: pair[1], default=None)
        if best is None:
            raise IndexError(
                f"model returned no topics for the document at position {i}"
            )
        result[i] = best[0] + 1
    return pd.Series(result, name="topic")

topic = predict(model, bow_corpus)

# predict() labels documents by their position in bow_corpus, while the merge
# below joins on df's index labels. If pre_process_themes had dropped rows
# (missing V2Themes) or df's index were not 0..n-1, the inner join would still
# return len(topic) rows but attach topics to the wrong records, so the
# alignment is checked explicitly before merging.
assert topic.index.equals(df.index), (
    "topic positions do not line up with df's index; "
    "rows were dropped or df is not indexed 0..n-1"
)
topic_df = df.merge(topic, left_index=True, right_index=True)
assert len(topic) == len(topic_df)

In [14]:
# The raw theme strings are not needed in the export. errors="ignore" keeps
# this cell re-runnable: topic_df is rebound here, so on a second run the
# column is already gone and a plain drop would raise KeyError.
topic_df = topic_df.drop(columns=["V2Themes"], errors="ignore")
topic_df.info(memory_usage="deep")
# Write the labelled records to CSV without the positional index column.
topic_df.to_csv("../data/gkg_apr_2020_sample_topic.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924653 entries, 0 to 924652
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   GKGRECORDID                 924653 non-null  object
 1   DATE                        924653 non-null  object
 2   TONE                        924653 non-null  object
 3   POSITIVE_SCORE              924653 non-null  object
 4   NEGATIVE_SCORE              924653 non-null  object
 5   POLARITY                    924653 non-null  object
 6   ACTIVITY_REFERENCE_DENSITY  924653 non-null  object
 7   SELF_REFERENCE_DENSITY      924653 non-null  object
 8   WORD_COUNT                  924653 non-null  object
 9   topic                       924653 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 800.7 MB
