Prototype finished.

# Topic Prototyping

Prototype code for topic modeling of posts. The objective is to assign each post the topics produced by the most promising model.

In [1]:
import os, sys
sys.path.append("..")
from config import credentials
import dropbox

import numpy as np
import pandas as pd

from joblib import dump, load
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Fixed seed for reproducibility; not referenced by any other cell visible in this notebook.
random_state = 23

## Loading

In [3]:
# Remote Dropbox folder used to build the download and upload paths below
data_path = "/Data/CSVData"
# Local folder (relative to this notebook) from which the joblib artifacts are loaded
model_path = "../models"

In [4]:
# Build a team client, root it at the team namespace, then act on behalf of
# a single team member for all file operations.
team_dbx = dropbox.DropboxTeam(credentials.dropbox_team_access_token)
namespace_root = dropbox.common.PathRoot.namespace_id(
    credentials.dropbox_team_namespace_id)
team_root = team_dbx.with_path_root(namespace_root)
user_dbx = team_root.as_user(credentials.dropbox_team_member_id)

# Dropbox API paths always use "/" regardless of the local OS, so the path is
# built explicitly instead of with os.path.join (which emits "\" on Windows).
posts_fpath = f"{data_path}/PolPosts.csv"

# files_download returns (metadata, response). The response keeps an HTTP
# connection open, so it is closed explicitly once the CSV has been parsed.
_, res = user_dbx.files_download(posts_fpath)
try:
    raw_posts = pd.read_csv(res.raw)
finally:
    res.close()
print("Posts", raw_posts.shape)

Posts (27204, 23)


## Preprocessing

Dataset cleansing

In [5]:
# Keep only the text and its identifier. Selecting the columns by name leaves
# raw_posts untouched (so this cell can be re-run safely) and cannot mislabel
# the columns if their order in the CSV differs from ["text", "textID"].
corpus = raw_posts[["text", "textID"]].copy()

# Remove empty texts: missing values, empty strings and whitespace-only strings.
# The column-level `replace(..., inplace=True)` is avoided on purpose: it is a
# chained in-place assignment, which does not update the parent frame when
# pandas Copy-on-Write is active.
corpus = corpus.dropna(subset=["text"])
is_blank = corpus["text"].eq("") | corpus["text"].str.isspace()
corpus = corpus[~is_blank]

# Remove duplicated texts, keeping the first occurrence
corpus = corpus.drop_duplicates(subset=["text"], keep="first")

corpus.shape

(22534, 2)

## Topic Assignment

TFIDF + LDA

In [6]:
# Work on an independent copy so the topic columns added later do not alter `corpus`
data = corpus.copy()

### Vectorizer

In [7]:
# Both vectorizer artifacts are stored in the same folder.
# joblib.load unpickles its input, so only trusted files should be loaded here.
vectorizer_dir = os.path.join(model_path, "topic_vectorizer")
tfidf = load(os.path.join(vectorizer_dir, "tfidf.joblib"))
tfidf_v = load(os.path.join(vectorizer_dir, "tfidf_v.joblib"))

In [8]:
# Display the number of entries in the loaded vectorizer's vocabulary
len(tfidf.vocabulary_)

40000

### Model

In [None]:
# Load the persisted topic model (presumably the 40-topic LDA, judging by the file name).
lda_file = os.path.join(model_path, "topic_lda", "lda_40.joblib")
model = load(lda_file)

# Compute the document-topic distribution from the vectors loaded from disk.
# NOTE(review): tfidf_v is loaded rather than computed from `data` in this notebook;
# this assumes it has one row per row of `data`, in the same order — confirm.
doc_topic_distr = model.transform(tfidf_v)

### Assign Topics

In [None]:
# Attach the full per-document topic distribution and the index of its
# highest-scoring topic; values are matched to rows by position.
data = data.assign(
    topic_distribution=doc_topic_distr.tolist(),
    topic=doc_topic_distr.argmax(axis=1),
)

### Persist Table

In [None]:
# Dropbox API paths always use "/" regardless of the local OS, so the target is
# built explicitly instead of with os.path.join (which emits "\" on Windows).
persist_fpath = f"{data_path}/topics/topics_n40.csv"

# Serialise the table to CSV in memory and overwrite any existing remote file.
csv_payload = data.to_csv(index=False).encode("utf-8")
user_dbx.files_upload(csv_payload, persist_fpath,
                      mode=dropbox.files.WriteMode.overwrite)

# Conclusion

The topics of the trained model have been successfully assigned to the posts. The prototype can now be turned into a script.