In [None]:
# Tutorial by: Shashank Kapadia
# Topic Modeling in Python: Latent Dirichlet Allocation (LDA)
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
import pandas as pd
import os

# Read RedditData.csv (resolved relative to the working directory) into `df`
df = pd.read_csv(filepath_or_buffer='RedditData.csv')

In [None]:
import re

# Clean the titles: strip commas, periods, exclamation marks and question
# marks, then convert the result to lowercase.
# - The pattern is a raw string: the previous non-raw '\.' was an invalid
#   string escape (newer Python versions warn about it), and inside a
#   character class a bare '.' already matches a literal period.
# - fillna('')/astype(str) guard against missing or non-string titles
#   (read_csv parses blank cells and values such as "NA" as NaN), which
#   previously raised TypeError inside re.sub.
df['titles_processed'] = (
    df['title']
    .fillna('')
    .astype(str)
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

In [None]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK's English stop words, followed by extra words that were added by hand
# after viewing earlier results.
stop_words = [
    *stopwords.words('english'),
    'help', 'get', 'make', 'use', 'good',
    'bitcoin', 'btc', 'crypto',
    'have', 'go', 'do', 'new', 'say', 'want', 'time', 'year',
    'more', 'know', 'free', 'first', 'think', 'real',
    'would', 'need', 'https', 'us', 'anyone', 'el', 'one', 'question',
]
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` and passed to
    ``simple_preprocess`` with ``deacc=True``. One result is yielded per
    input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)
def remove_stopwords(texts):
    """Return the tokens of every document with stop words filtered out.

    Each item of ``texts`` is converted with ``str()`` and tokenized by
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    are dropped.

    Returns:
        A list holding one list of remaining tokens per item of ``texts``,
        in input order.
    """
    # NOTE(review): the caller in this notebook passes token lists, so
    # str(doc) is the list's repr and simple_preprocess re-tokenizes it --
    # presumably yielding the same tokens; confirm before simplifying.
    # Build the lookup set once: testing membership against the stop_words
    # list is a linear scan for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]
# Pull the processed titles out of the dataframe as a plain Python list
data = df['titles_processed'].tolist()
# Tokenize every title and filter out the stop words in one pass
data_words = remove_stopwords(sent_to_words(data))

In [None]:
import gensim.corpora as corpora
# Build the gensim Dictionary from the tokenized, stop-word-free titles
id2word = corpora.Dictionary(data_words)
texts = data_words
# Convert each document to its bag-of-words form (term id and term frequency)
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Train the LDA topic model on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    random_state=20,   # fixed seed so repeated runs give the same topics
    update_every=1,
    chunksize=1000,
    passes=25,
    alpha="auto",
)

In [None]:
# Visualizing Data

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Render pyLDAvis output inline in the Jupyter notebook
pyLDAvis.enable_notebook()
# Prepare the interactive topic view; R=30 is the number of terms requested
vis = gensimvis.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)

# Show the prepared visualization as this cell's output
vis

In [None]:
# Show the LDA model: as the last expression of the cell, its repr becomes
# the cell output.
lda_model

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
# Prepare the topic visualization with the same arguments as the earlier
# visualization cell; the result is stored in `vis` but not displayed here.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

In [None]:
# Create a single string from the titles in the RedditData csv file, with
# every title followed by a comma.
# - str.join builds the string in one pass instead of repeated "+=".
# - dropna()/astype(str) skip missing titles and coerce non-string ones,
#   which previously raised TypeError on concatenation.
text = "".join(f"{title}," for title in df['title'].dropna().astype(str))

In [None]:
# Font can be found here:
# https://www.dafont.com/techfont.font
# The location can be overridden with the WORDCLOUD_FONT_PATH environment
# variable; the original machine-specific path remains the default.
font_path = os.environ.get(
    'WORDCLOUD_FONT_PATH',
    'C:/Users/guest1/Desktop/techfont/Techfont.ttf',
)

# Bitcoin jpg can be found here:
# https://icon-library.com/icon/bitcoin-icon-27.html
# Override the location with WORDCLOUD_MASK_PATH; same default as before.
mask_path = os.environ.get(
    'WORDCLOUD_MASK_PATH',
    'C:/Users/guest1/Pictures/bitcoin-icon-27.jpg',
)
# Open the image in a context manager so the file is closed as soon as its
# pixels have been copied into the array.
with Image.open(mask_path) as mask_image:
    mask = np.array(mask_image)

In [None]:
# Build the word cloud from the combined title string. generate() returns the
# WordCloud object itself, so construction and generation are chained.
wc = WordCloud(
    stopwords=stop_words,
    font_path=font_path,
    mask=mask,
    background_color="white",
    max_words=2000,
    max_font_size=256,
).generate(text)

# Draw on an enlarged 20x10 figure
fig, ax = plt.subplots(figsize=(20, 10))

# https://matplotlib.org/stable/gallery/images_contours_and_fields/interpolation_methods.html
# interpolation selects how the image is resampled when it is drawn
ax.imshow(wc, interpolation="nearest")
# Hide the axes
ax.axis('off')
# Title shown above the image
ax.set_title("/r/Bitcoin", fontsize=48, color="Green", pad=20)

# Render the figure
plt.show()