In [26]:
!pip install wordcloud
!pip install nltk



# Packages and Imports

In [46]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import sys, os
import pandas as pd
import numpy as np
import nltk
from string import digits
import re
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
datapath = "/Users/yyd/Documents/GitHub/RedditIPO-SentimentAnalysis/Data"
os.chdir(datapath)

In [48]:
#reading text
comments = pd.read_csv('normalized_sentiment_score.csv')
#filter out negatives
comments=comments[comments.sentiment_adjusted<0]

In [49]:
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

# Trim dataframe to list & dictionaries

In [50]:
comments_trimm = comments[['comment_body']]

In [51]:
# texts
comments_trimm = comments_trimm.to_string()

In [52]:
# Tonkenizer. Lemmatizer is used here to achieve full morphological analysis and accurately identify the lemma for each word, this is better than simply stemming. 
WNL = nltk.WordNetLemmatizer()

# Functions

In [53]:
# function to clean up texts by transforming to lower case, also removing apostrophe
def lowerRep(text):
    text = text.lower()
    text = text.replace("'", "")
    return text

# removing number
def removeNum(text):
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)
    return text

# remove extra chars and stop words
def removeChar(text):
    tokens = nltk.word_tokenize(text)
    text1 = nltk.Text(tokens)
    text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word)) for word in text1]
    return text_content

# remove URLs and images
def removeURL(text):
    text = re.sub(r'http\S+', '', text)
    return text

# remove ![img] string
def removeImg(text):
    text = re.sub(r'!\[img]\S+', '', text)
    return text

# initial removal of stopwords and empty spaces
def removeSpace(text, stopwords):
    text = [word for word in text if word not in stopwords]
    text = [s for s in text if len(s) != 0]
    return text

# lemmatize
def lemmatize(text):
    text = [WNL.lemmatize(t) for t in text]
    return text

# tokenize
def tokenize(text):
    tokens = [nltk.word_tokenize(i) for i in text]
    return tokens

# sigle word dictionary
def singleDict(text):
    dictionary = [' '.join(tup) for tup in text]
    return dictionary

# generate words frequency
def wordsFreq(text):
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(text)
    vectorizer.vocabulary_
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq

## Stop Words

In [54]:
# stop words
comments_stopwords = list(STOPWORDS) + ["n", "nbsp", "http", "u", "s", "reddit", "will", "tth", "emote", "nn", 
                                       "im", "one", "people"]

In [55]:
# test
# string1 = "![img](emote|t5_2th52|4258)"
# string2 = "https://github.com/yyd859/RedditIPO-SentimentAnalysis"

In [56]:
# removeImg(string1)

In [57]:
# removeURL(string2)

# Pre-Processing

In [58]:
# pre-processsing for comments: this should take longer
comments_trimm = removeURL(comments_trimm)
comments_trimm = removeImg(comments_trimm)
comments_trimm = lowerRep(comments_trimm)
comments_trimm = removeNum(comments_trimm)
comments_content = removeChar(comments_trimm)
comments_content = removeSpace(comments_content, comments_stopwords)
comments_content = lemmatize(comments_content)

## Tokenization & Vectorization

In [None]:
# tokenize and generate dictionary for comments
comments_token = tokenize(comments_content)
comments_single = singleDict(nltk.Text(comments_token))

In [42]:
# test block
# print(posts_token)
# print(posts_bigram)

In [43]:
# test block
# print(posts_word_freq)

In [44]:
# generate frequency for comments
comments_word_freq = wordsFreq(comments_single)

In [45]:
comments_word_freq

[('people', 777),
 ('buy', 556),
 ('one', 537),
 ('go', 530),
 ('dont', 530),
 ('ipo', 526),
 ('year', 525),
 ('sub', 506),
 ('think', 503),
 ('going', 494),
 ('money', 473),
 ('make', 471),
 ('now', 431),
 ('company', 430),
 ('im', 429),
 ('shit', 419),
 ('know', 417),
 ('even', 411),
 ('time', 406),
 ('thing', 384),
 ('user', 367),
 ('see', 365),
 ('way', 338),
 ('need', 332),
 ('want', 327),
 ('moon', 322),
 ('post', 318),
 ('stock', 302),
 ('fuck', 299),
 ('already', 296),
 ('much', 294),
 ('day', 289),
 ('new', 283),
 ('right', 279),
 ('really', 276),
 ('something', 273),
 ('site', 269),
 ('share', 265),
 ('got', 265),
 ('take', 264),
 ('public', 264),
 ('platform', 263),
 ('good', 258),
 ('say', 253),
 ('back', 252),
 ('ad', 245),
 ('still', 243),
 ('don', 242),
 ('comment', 234),
 ('lol', 230),
 ('mean', 227),
 ('gme', 225),
 ('mod', 225),
 ('account', 220),
 ('lot', 219),
 ('work', 219),
 ('thats', 213),
 ('never', 210),
 ('put', 207),
 ('sell', 204),
 ('na', 203),
 ('first', 2

In [19]:
comments_dict = dict(comments_word_freq)

# Word Cloud Generation

In [20]:
wc_comments = WordCloud(
        background_color = 'white', 
        stopwords = comments_stopwords, 
        mask = mask)

In [22]:
wc_comments.generate_from_frequencies(comments_dict)

<wordcloud.wordcloud.WordCloud at 0x7f92700692b0>

# Outputs

In [25]:
wc_comments.to_file('wc_negative_single.png')

<wordcloud.wordcloud.WordCloud at 0x7f92700692b0>