[Tutorial](https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py)
* Purpose: Good for getting a sense of the topics within a corpus without reading it
* TFIDF is usually better for shorter texts, whereas LDA is better for longer texts (e.g. > 100 words), which can contain more topics
* TFIDF is also computationally cheaper

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
import json
import glob
import re

In [5]:
def load_data(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the parsed object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

def write_data(file, data):
    """Serialize ``data`` as 4-space-indented JSON into ``file`` (UTF-8), replacing any existing content."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

# Remove the stopwords
def remove_stops(text, stops):
    """Return ``text`` with reference codes, stop words, punctuation and digits removed.

    Steps, in order:
      1. Drop codes of the form ``AC/<1-4 digits>/<1-4 digits>``.
      2. Drop every whitespace-separated word that is in ``stops``, either
         as written or once surrounding punctuation is stripped (so
         "April," is removed when "April" is a stop word). Matching is
         case-sensitive.
      3. Delete all ASCII punctuation characters and all digit characters.
      4. Collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: The document to clean.
        stops: Iterable of stop-word strings.

    Returns:
        The cleaned string (possibly empty).
    """
    text = re.sub(r"AC\/\d{1,4}\/\d{1,4}", "", text)

    # Set membership is O(1); the original tested every word against a list.
    stop_set = set(stops)
    punctuation = string.punctuation

    # Check the punctuation-stripped form too: tokens are split on whitespace
    # before punctuation is removed, so "the," would otherwise survive.
    kept = [
        word
        for word in text.split()
        if word not in stop_set and word.strip(punctuation) not in stop_set
    ]

    cleaned = " ".join(kept)
    cleaned = cleaned.translate(str.maketrans("", "", punctuation))  # Removes punctuation from the text.
    cleaned = "".join(ch for ch in cleaned if not ch.isdigit())  # Removes all numbers from text

    # Removing digit-only tokens leaves gaps and stray edge spaces; normalise them.
    return " ".join(cleaned.split())

# Cleans all the documents (removes stop words), and also dates
def clean_docs(docs):
    """Clean every document in ``docs`` with ``remove_stops``.

    The stop list is NLTK's English stop words plus the capitalised English
    month names, so month names in dates are dropped along with ordinary
    stop words.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list of cleaned strings, in the same order as ``docs``.
    """
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
    # Build the stop collection once as a set: remove_stops tests every word
    # of every document against it, and list membership is linear.
    stops = set(stopwords.words("english")) | set(months)
    return [remove_stops(doc, stops) for doc in docs]

In [6]:
# NOTE(review): `all_files` is not referenced by any code visible here; kept in case a later cell uses it.
all_files = ["coinbase.csv", "binance.csv", "ftx.csv", "kraken.csv", "robinhood.csv", "webull.csv", "etoro.csv"]


# Parse the JSON file once and take both fields from the same object
# (previously the file was opened and parsed twice).
trc_data = load_data("trc_dn.json")
descriptions = trc_data["descriptions"]
names = trc_data["names"]
# print (descriptions[0])

cleaned_docs = clean_docs(descriptions)
# print (cleaned_docs[0])

In [7]:
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=100,
    max_df=0.8,  # Ignore terms that appear in more than 80% of the documents
    min_df=5,  # Ignore terms that appear in fewer than 5 documents (integer = absolute document count)
    ngram_range=(1, 3),
    stop_words="english",
)

vectors = vectorizer.fit_transform(cleaned_docs)  # Returns weighted document word matrix

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); .tolist() turns the returned array into a
# plain list of str. Fall back to the old method on older installations.
try:
    feature_names = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    feature_names = vectorizer.get_feature_names()



In [8]:
# NB: Not needed for the clustering below; it only lets us inspect which
# vocabulary terms were found in each description.
dense = vectors.todense()
denselist = dense.tolist()

# For every document row, keep the feature names whose weight is positive,
# i.e. the terms that occur in that document.
all_keywords = [
    [term for term, weight in zip(feature_names, weights) if weight > 0]
    for weights in denselist
]
print (descriptions[0])
print (all_keywords[0])


An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['anc', 'anc supporters', 'house', 'injured', 'member', 'members', 'police', 'sap', 'severely', 'shot', 'supporters']


In [9]:
true_k = 20  # Number of clusters to fit

# NOTE(review): no random_state is passed, so cluster assignments may differ between runs.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)

model.fit(vectors)

# For each cluster (row), feature indices ordered from highest to lowest centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); fall back to the old method on older installations.
try:
    terms = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = vectorizer.get_feature_names()



In [10]:
# Write the ten highest-weighted terms of every cluster to a text report:
# a "Cluster N" header, one term per line (space-prefixed), then two blank lines.
with open("trc_results.txt", "w", encoding="utf-8") as report:
    for cluster_id in range(true_k):
        top_terms = [terms[idx] for idx in order_centroids[cluster_id, :10]]
        report.write(f"Cluster {cluster_id}" + "\n")
        report.writelines(" %s" % term + "\n" for term in top_terms)
        report.write("\n" + "\n")