In [66]:
!pip install wordcloud
!pip install nltk



# Packages and Imports

In [90]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import sys, os
import pandas as pd
import numpy as np
import nltk
from string import digits
import re
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image

In [91]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/yyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yyd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [92]:
datapath = "/Users/yyd/Documents/GitHub/RedditIPO-SentimentAnalysis/Data"
os.chdir(datapath)

In [93]:
#reading text
comments = pd.read_csv('normalized_sentiment_score.csv')

In [94]:
#filter out negatives
comments=comments[comments.sentiment_adjusted<0]


## A few check points

In [95]:
comments.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,comment_id,comment_parent_id,comment_body,comment_link_id,comment_score,comment_subreddit,fixed_body,sentiment,sentiment_adjusted,subcomment_count,subcomment_weight,upvote_weight,weighted_sentiment_score
0,0,0,0,0,0,hor72d8,t3_rhlfnx,Deleting all comments and accounts in 3... 2.....,t3_rhlfnx,4,stockmarket,Deleting all comments and accounts in 3... 2.....,1,-2,1.0,0.023256,0.007409,-2.061675
2,2,2,2,2,2,horc0ki,t1_hor72d8,"You joke. But, just another social media site ...",t3_rhlfnx,2,stockmarket,"You joke. But, just another social media site ...",2,-1,1.0,0.023256,0.007174,-1.030597
3,3,3,3,3,3,horebg5,t1_horc0ki,"I only joke for now. Once it really happens, I...",t3_rhlfnx,2,stockmarket,"I only joke for now. Once it really happens, I...",1,-2,1.0,0.023256,0.007174,-2.061193
5,5,5,5,5,5,hp8y40y,t1_hp8mmwv,In what way?,t3_rhlfnx,1,stockmarket,In what way?,1,-2,0.0,0.0,0.007056,-2.014113
8,8,8,8,8,8,hosnjyl,t3_rhn77f,It will not have ther momentum of Donald Trump...,t3_rhn77f,-2,stockmarket,It will not have ther momentum of Donald Trump...,2,-1,0.0,0.0,0.006704,-1.006704


# Trim dataframe to list & dictionaries

In [96]:
comments_trimm = comments[['comment_body']]

In [97]:
comments_trimm.head()

Unnamed: 0,comment_body
0,Deleting all comments and accounts in 3... 2.....
2,"You joke. But, just another social media site ..."
3,"I only joke for now. Once it really happens, I..."
5,In what way?
8,It will not have ther momentum of Donald Trump...


In [98]:
# texts
comments_trimm = comments_trimm.to_string()

In [99]:
# Tonkenizer. Lemmatizer is used here to achieve full morphological analysis and accurately identify the lemma for each word, this is better than simply stemming. 
WNL = nltk.WordNetLemmatizer()

# Functions

In [100]:
# function to clean up texts by transforming to lower case, also removing apostrophe
def lowerRep(text):
    text = text.lower()
    text = text.replace("'", "")
    return text

# removing number
def removeNum(text):
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)
    return text

# remove extra chars and stop words
def removeChar(text):
    tokens = nltk.word_tokenize(text)
    text1 = nltk.Text(tokens)
    text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word)) for word in text1]
    return text_content

# remove URLs and images
def removeURL(text):
    text = re.sub(r'http\S+', '', text)
    return text

# remove ![img] string
def removeImg(text):
    text = re.sub(r'!\[img]\S+', '', text)
    text = re.sub(r'emote', '', text)
    return text

# initial removal of stopwords and empty spaces
def removeSpace(text, stopwords):
    text = [word for word in text if word not in stopwords]
    text = [s for s in text if len(s) != 0]
    return text

# lemmatize
def lemmatize(text):
    text = [WNL.lemmatize(t) for t in text]
    return text

# tokenize
def tokenize(text):
    tokens = [nltk.word_tokenize(i) for i in text]
    return tokens

# bigrams dictionary
def bigramDict(text):
    bigrams_list = list(nltk.bigrams(text))
    dictionary = [' '.join(tup) for tup in bigrams_list]
    return dictionary

# generate words frequency
def wordsFreq(text):
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vectorizer.fit_transform(text)
    vectorizer.vocabulary_
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq

## Stop Words

In [101]:
# stop words
comments_stopwords = list(STOPWORDS) + ["n", "nbsp", "http", "u", "s", "reddit", "will", "tth", "emote", 
                                       "gon", "na", "wan", 'na', "emote freeemotespack", "emote", "freeemotespack", 
                                        "and or", "or", "total", "submission", "comment", "buy", "got", "ta"]

In [102]:
# test
# string1 = "![img](emote|t5_2th52|4258)"
# string2 = "https://github.com/yyd859/RedditIPO-SentimentAnalysis"

In [103]:
# test
# removeImg(string1)

In [104]:
# test
# removeURL(string2)

# Pre-Processing

In [105]:
# pre-processsing for comments: this should take longer
comments_trimm = removeURL(comments_trimm)
comments_trimm = removeImg(comments_trimm)
comments_trimm = lowerRep(comments_trimm)
comments_trimm = removeNum(comments_trimm)
comments_content = removeChar(comments_trimm)
comments_content = removeSpace(comments_content, comments_stopwords)
comments_content = lemmatize(comments_content)

## Tokenization & Vectorization

In [106]:
# tokenize and generate bigram for comments
comments_token = tokenize(comments_content)
comments_bigram = bigramDict(comments_content)

In [107]:
# test
# print(posts_token)
# print(posts_bigram)

In [108]:
# test
# print(posts_word_freq)

In [109]:
# generate frequency for each bigram for comments
comments_word_freq = wordsFreq(comments_bigram)

In [110]:
comments_dict= dict(comments_word_freq)

In [111]:
comments_word_freq

[('social medium', 137),
 ('go public', 83),
 ('make money', 63),
 ('going public', 59),
 ('right now', 58),
 ('dont think', 49),
 ('dont know', 47),
 ('civil war', 41),
 ('year ago', 40),
 ('click vote', 38),
 ('front page', 37),
 ('medium platform', 36),
 ('message compose', 32),
 ('and or', 32),
 ('last year', 31),
 ('must nwe', 31),
 ('nwe must', 31),
 ('dont see', 30),
 ('pump dump', 29),
 ('free speech', 29),
 ('go back', 27),
 ('wall street', 25),
 ('nsfw content', 25),
 ('make sense', 24),
 ('hedge fund', 24),
 ('next year', 24),
 ('user base', 24),
 ('one day', 23),
 ('echo chamber', 22),
 ('im going', 21),
 ('publicly traded', 21),
 ('aaron swartz', 21),
 ('lot people', 21),
 ('im sure', 21),
 ('user report', 20),
 ('seen wsb', 20),
 ('dont even', 20),
 ('good luck', 20),
 ('every single', 20),
 ('people dont', 20),
 ('first seen', 19),
 ('agon comment', 19),
 ('previous dd', 19),
 ('account age', 19),
 ('spam new', 19),
 ('approve new', 19),
 ('many people', 19),
 ('loss por

# Word Cloud Generation

In [112]:
wc_comments = WordCloud(
        background_color = 'white', 
        stopwords = comments_stopwords, 
        height = 400, 
        width = 600)

In [113]:
wc_comments.generate_from_frequencies(comments_dict)

<wordcloud.wordcloud.WordCloud at 0x7f8611c8d880>

# Outputs

In [114]:
# image_colors = wc_posts.ImageColorGenerator(mask)
# wc_posts.recolor(color_func=image_colors)

In [115]:
# plt.imshow(wc_posts, interpolation='bilinear')
# plt.axis('off')
# plt.show()

In [116]:
wc_comments.to_file('wc_negative_bigram.png')

<wordcloud.wordcloud.WordCloud at 0x7f8611c8d880>