In [1]:
!pip install wordcloud
!pip install nltk



# Packages and Imports

In [2]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import sys, os
import pandas as pd
import numpy as np
import nltk
from string import digits
import re
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Directory that holds posts_rd.csv / comments_rd.csv. The working directory is
# switched to it, so the relative file names used for reading the CSVs and for
# writing the PNGs all resolve here.
# Set the REDDIT_IPO_DATA_DIR environment variable to run on another machine;
# the original location is kept as the default.
datapath = os.environ.get(
    "REDDIT_IPO_DATA_DIR",
    "C:/Users/kevin/OneDrive/Winter Case Competition/RedditIPO-SentimentAnalysis/Data",
)
os.chdir(datapath)

In [4]:
# Load the raw Reddit exports (file names are relative to the working directory).
posts, comments = (
    pd.read_csv(csv_name) for csv_name in ('posts_rd.csv', 'comments_rd.csv')
)

In [5]:
# Circular mask for the word clouds: 255 marks pixels that stay empty, 0 marks
# the disc words may be drawn in.
# The centre and radius are derived from the canvas size. Previously the canvas
# was 600x600 while the disc was still centred at (150, 150) with radius 130,
# which confined the cloud to the top-left quarter of the image.
mask_size = 600
mask_centre = mask_size // 2
mask_radius = mask_size * 130 // 300  # keeps the 130:300 radius-to-canvas ratio
x, y = np.ogrid[:mask_size, :mask_size]
mask = (x - mask_centre) ** 2 + (y - mask_centre) ** 2 > mask_radius ** 2
mask = 255 * mask.astype(int)

# Trim dataframes to their text columns and flatten to strings

In [6]:
# Keep only the free-text columns that feed the word clouds.
posts_trimm = posts.loc[:, ['title', 'body']]
comments_trimm = comments.loc[:, ['comment_body']]

In [7]:
# texts: flatten the text columns of each frame into one big string.
# index/header are suppressed and missing cells are rendered as empty text,
# because the default to_string() output also contains the row index, the
# column names and a literal "NaN" for every missing cell, all of which would
# end up in the word counts.
# The strings are built from posts/comments directly (instead of calling
# .to_string() on the variable being overwritten) so this cell can be re-run.
posts_trimm = posts[['title', 'body']].to_string(index=False, header=False, na_rep='')
comments_trimm = comments[['comment_body']].to_string(index=False, header=False, na_rep='')

In [8]:
# Lemmatizer (not a tokenizer): reduces each word to its lemma, which is preferred here over simple stemming.
# NOTE(review): WordNetLemmatizer needs the NLTK WordNet data; no nltk.download(...) call is visible in this notebook — confirm the data is installed before running.
WNL = nltk.WordNetLemmatizer()

# Functions

In [9]:
# normalise case and drop apostrophes
def lowerRep(text):
    """Return ``text`` lower-cased with every ASCII apostrophe (') removed."""
    return text.lower().replace("'", "")

# strip digits
def removeNum(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    digit_table = dict.fromkeys(map(ord, digits))
    return text.translate(digit_table)

# tokenize and strip punctuation / symbol characters from every token
def removeChar(text):
    """Split ``text`` into NLTK word tokens and delete unwanted characters from each.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        One entry per token, in order. A token made up only of stripped
        characters becomes an empty string. Stop words are NOT removed here;
        ``removeSpace`` handles both stop words and the empty strings.
    """
    # Raw string: the former non-raw literal contained the invalid escape "\-",
    # which newer Python versions warn about. The character set is unchanged:
    # space . , ; : ! ? ‘ ’ ` ' @ # $ % ^ _ & * ( ) < > { } ~ newline tab \ -
    strip_chars = r"[ .,;:!?‘’`'@#$%^_&*()<>{}~\n\t\\-]"
    tokens = nltk.word_tokenize(text)
    return [re.sub(strip_chars, '', word) for word in tokens]

# remove URLs
def removeURL(text):
    """Delete every run of non-whitespace characters that begins with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# remove ![img]... markers
def removeImg(text):
    """Delete each ``![img]`` marker together with the non-whitespace run that follows it."""
    img_pattern = re.compile(r'!\[img]\S+')
    return img_pattern.sub('', text)

# initial removal of stopwords and empty strings
def removeSpace(text, stopwords):
    """Filter stop words and empty strings out of a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stopwords : iterable of str
        Words to drop (exact, case-sensitive match).

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Hash-based lookup: membership is tested once per token, so scanning a
    # list each time is needlessly slow on a large corpus. Materialising the
    # stop words also makes one-shot iterables behave correctly.
    blocked = frozenset(stopwords)
    return [word for word in text if word and word not in blocked]

# lemmatize
def lemmatize(text):
    """Return a list with every token of ``text`` passed through ``WNL.lemmatize``."""
    lemmas = []
    for token in text:
        lemmas.append(WNL.lemmatize(token))
    return lemmas

# tokenize
def tokenize(text):
    """Apply ``nltk.word_tokenize`` to each string in ``text``; returns a list of token lists."""
    return list(map(nltk.word_tokenize, text))

# single word dictionary
def singleDict(text):
    """Join each token sequence in ``text`` into one space-separated string."""
    joined = []
    for token_group in text:
        joined.append(' '.join(token_group))
    return joined

# generate word frequencies
def wordsFreq(text):
    """Count how often each vocabulary term occurs across ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents handed to a default-configured ``CountVectorizer``.

    Returns
    -------
    list of (str, count) tuples
        One entry per vocabulary term, sorted by descending count.

    Notes
    -----
    Tokenisation follows CountVectorizer's defaults (per the scikit-learn
    documentation it lower-cases input and only keeps tokens of two or more
    alphanumeric characters), so very short tokens never appear in the result.
    """
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(text)
    # Column totals flattened to 1-D so they can be indexed by vocabulary
    # position whether sum() returns a 1 x n matrix or a 1-D array.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq

## Stop Words

In [10]:
# stop words
# Entries are compared with tokens that have already been lower-cased by
# lowerRep, so they must be lower case themselves ("CHART" could never match).
# NOTE(review): entries containing digits or underscores (e.g. "bb_20_2") also
# cannot match, because removeNum/removeChar strip those characters before the
# stop-word filter runs — confirm whether their stripped forms should be listed.
posts_stopwords = list(STOPWORDS) + ["https", "png", "imgur", "n", "click", "reddit", "ashx", "will", "chart",
                                     "t", "ta", "st_c", "sma", "smsch_200p", "bb_20_2", "webp", "s", "l",
                                    "stofu_b_14_3_3", "macd_b_12_26_9", "rsi_b_14", "sch_200p", "p", "d", "c", "nn"]
comments_stopwords = list(STOPWORDS) + ["n", "nbsp", "http", "u", "s", "reddit", "will", "tth", "emote", "nn"]

In [11]:
# test
# string1 = "![img](emote|t5_2th52|4258)"
# string2 = "https://github.com/yyd859/RedditIPO-SentimentAnalysis"

In [12]:
# removeImg(string1)

In [13]:
# removeURL(string2)

# Pre-Processing

In [14]:
# pre-processing for posts
# raw-text clean-up, applied innermost first: URLs -> image markers -> lower case -> digits
posts_trimm = removeNum(lowerRep(removeImg(removeURL(posts_trimm))))
# token clean-up: strip characters -> drop stop words / empty tokens -> lemmatize
posts_content = lemmatize(removeSpace(removeChar(posts_trimm), posts_stopwords))

In [15]:
# pre-processing for comments: this should take longer
# raw-text clean-up, applied innermost first: URLs -> image markers -> lower case -> digits
comments_trimm = removeNum(lowerRep(removeImg(removeURL(comments_trimm))))
# token clean-up: strip characters -> drop stop words / empty tokens -> lemmatize
comments_content = lemmatize(removeSpace(removeChar(comments_trimm), comments_stopwords))

## Tokenization & Vectorization

In [16]:
# tokenize and generate dictionary for posts
posts_token = tokenize(posts_content)
# singleDict only iterates over its argument, so the token lists are passed
# directly; wrapping them in nltk.Text first added nothing.
posts_single = singleDict(posts_token)

In [22]:
# tokenize and generate dictionary for comments
comments_token = tokenize(comments_content)
# singleDict only iterates over its argument, so the token lists are passed
# directly; wrapping them in nltk.Text first added nothing.
comments_single = singleDict(comments_token)

In [18]:
# test block
# print(posts_token)
# print(posts_bigram)

In [19]:
# generate frequency for posts: list of (word, count) tuples, highest count first
posts_word_freq = wordsFreq(posts_single)

In [20]:
# test block
# print(posts_word_freq)

In [23]:
# generate frequency for comments: list of (word, count) tuples, highest count first
comments_word_freq = wordsFreq(comments_single)

In [24]:
# Turn the (word, count) pairs into word -> count mappings.
posts_dict = {word: count for word, count in posts_word_freq}
comments_dict = {word: count for word, count in comments_word_freq}

# Word Cloud Generation

In [25]:
# Word cloud renderer for posts: white background, drawn using `mask`.
# NOTE(review): this cloud is later built with generate_from_frequencies, and the
# wordcloud documentation says `stopwords` is ignored on that path — the filtering
# that takes effect is the one done in removeSpace. Confirm the argument is still wanted.
wc_posts = WordCloud(
        background_color = 'white',
        stopwords = posts_stopwords,
        mask = mask)

In [26]:
# Word cloud renderer for comments: white background, drawn using `mask`.
# NOTE(review): this cloud is later built with generate_from_frequencies, and the
# wordcloud documentation says `stopwords` is ignored on that path — the filtering
# that takes effect is the one done in removeSpace. Confirm the argument is still wanted.
wc_comments = WordCloud(
        background_color = 'white', 
        stopwords = comments_stopwords, 
        mask = mask)

In [27]:
# Build the posts cloud from the precomputed word -> count mapping.
wc_posts.generate_from_frequencies(posts_dict)

<wordcloud.wordcloud.WordCloud at 0x1830838ea60>

In [28]:
# Build the comments cloud from the precomputed word -> count mapping.
wc_comments.generate_from_frequencies(comments_dict)

<wordcloud.wordcloud.WordCloud at 0x1830836dfd0>

# Outputs

In [29]:
# Save the posts cloud as a PNG; the relative name resolves against the current working directory.
wc_posts.to_file('wc_post_single_round.png')

<wordcloud.wordcloud.WordCloud at 0x1830838ea60>

In [30]:
# Save the comments cloud as a PNG; the relative name resolves against the current working directory.
wc_comments.to_file('wc_comment_single_round.png')

<wordcloud.wordcloud.WordCloud at 0x1830836dfd0>