In [40]:
!pip install wordcloud
!pip install nltk



# Packages and Imports

In [98]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import sys, os
import pandas as pd
import numpy as np
import nltk
from string import digits
import re
from sklearn.feature_extraction.text import CountVectorizer

In [81]:
# Download the NLTK data used below: "punkt" for nltk.word_tokenize and
# "wordnet" for the WordNet lemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [42]:
# Directory that holds posts_rd.csv and comments_rd.csv. Set the
# REDDIT_IPO_DATA_DIR environment variable to point somewhere else; the
# fallback is the original author's local folder, so existing runs are unchanged.
datapath = os.environ.get(
    "REDDIT_IPO_DATA_DIR",
    "C:/Users/kevin/OneDrive/Winter Case Competition/RedditIPO-SentimentAnalysis/Data",
)
# Later cells read the CSVs and write the PNGs relative to the working directory.
os.chdir(datapath)

In [43]:
# Load the posts and comments CSV files (paths are relative to the working directory).
posts, comments = (
    pd.read_csv(csv_name) for csv_name in ('posts_rd.csv', 'comments_rd.csv')
)

## A few checkpoints

In [44]:
# Preview the first rows of the posts table as a sanity check on the load.
posts.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,score,id,url,comms_num,created,body
0,0,0,0,(12/27) Monday's Pre-Market Stock Movers & News,60,rpn6u5,https://www.reddit.com/r/StockMarket/comments/...,8,1640611000.0,# Good Monday morning traders and investors of...
1,1,1,1,(12/15) Wednesday's Pre-Market Stock Movers & ...,25,rgzir5,https://www.reddit.com/r/StockMarket/comments/...,17,1639576000.0,#Good morning traders and investors of the r/S...
2,2,2,2,(12/13) Monday's Pre-Market Stock Movers & News,15,rffrcz,https://www.reddit.com/r/StockMarket/comments/...,18,1639402000.0,# Good Monday morning traders and investors of...
3,3,3,3,Reddit Files Confidentially for IPO. As of Aug...,15,rhlfnx,https://www.wsj.com/articles/reddit-files-pape...,16,1639639000.0,
4,4,4,4,(12/14) Tuesday's Pre-Market Stock Movers & News,13,rg793x,https://www.reddit.com/r/StockMarket/comments/...,25,1639488000.0,#Good morning traders and investors of the r/S...


In [45]:
# Preview the first rows of the comments table as a sanity check on the load.
comments.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,comment_id,comment_parent_id,comment_body,comment_link_id,comment_score,comment_subreddit
0,0,0,0,hor72d8,t3_rhlfnx,Deleting all comments and accounts in 3... 2.....,t3_rhlfnx,4,stockmarket
1,1,1,1,horfp4g,t3_rhlfnx,Make the first offering to the users bitch,t3_rhlfnx,2,stockmarket
2,2,2,2,horc0ki,t1_hor72d8,"You joke. But, just another social media site ...",t3_rhlfnx,2,stockmarket
3,3,3,3,horebg5,t1_horc0ki,"I only joke for now. Once it really happens, I...",t3_rhlfnx,2,stockmarket
4,4,4,4,hp8mmwv,t1_horebg5,We already are,t3_rhlfnx,1,stockmarket


# Trim dataframes to lists & dictionaries

In [46]:
# Keep only the free-text columns that feed the word clouds.
posts_trimm = posts.loc[:, ['title', 'body']]
comments_trimm = comments.loc[:, ['comment_body']]

In [47]:
# Confirm that only the title and body columns remain.
posts_trimm.head()

Unnamed: 0,title,body
0,(12/27) Monday's Pre-Market Stock Movers & News,# Good Monday morning traders and investors of...
1,(12/15) Wednesday's Pre-Market Stock Movers & ...,#Good morning traders and investors of the r/S...
2,(12/13) Monday's Pre-Market Stock Movers & News,# Good Monday morning traders and investors of...
3,Reddit Files Confidentially for IPO. As of Aug...,
4,(12/14) Tuesday's Pre-Market Stock Movers & News,#Good morning traders and investors of the r/S...


In [48]:
# Confirm that only the comment_body column remains.
comments_trimm.head()

Unnamed: 0,comment_body
0,Deleting all comments and accounts in 3... 2.....
1,Make the first offering to the users bitch
2,"You joke. But, just another social media site ..."
3,"I only joke for now. Once it really happens, I..."
4,We already are


In [49]:
# Flatten every text cell into one space-separated string per table.
# DataFrame.to_string() renders the table for display, so the column headers,
# the row index and "NaN" for empty cells all ended up in the text (and later
# in the word counts). Joining the cell values directly keeps only the actual
# post/comment text; missing cells contribute an empty string.
# NOTE: this rebinds the *_trimm names from DataFrame to str, so the column
# selection cell has to be re-run before executing this cell a second time.
posts_trimm = ' '.join(posts_trimm.fillna('').astype(str).to_numpy().ravel())
comments_trimm = ' '.join(comments_trimm.fillna('').astype(str).to_numpy().ravel())

In [50]:
# Lemmatizer. Lemmatization is used instead of simple stemming so that each
# word is mapped to its lemma rather than to a truncated stem.
WNL = nltk.WordNetLemmatizer()

# Functions

In [119]:
def lowerRep(text):
    """Return ``text`` lower-cased with every ASCII apostrophe (') removed."""
    return text.lower().replace("'", "")

def removeNum(text):
    """Return ``text`` with the ASCII digits 0-9 deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    drop_table = dict.fromkeys(map(ord, digits))
    return text.translate(drop_table)

# Characters stripped from every token. Written as a raw string so the regex
# escapes (\n, \t, \\, \-) reach the `re` module unchanged; the previous
# non-raw literal relied on the invalid string escape "\-", which newer Python
# versions report with a warning. The set of characters is unchanged.
_STRIP_CHARS = re.compile(r"[ .,;:!?‘’`'@#$%^_&*()<>{}~\n\t\\\-]")


def removeChar(text):
    """Tokenize ``text`` and strip punctuation/symbol characters from each token.

    Args:
        text: String to split into tokens with ``nltk.word_tokenize``.

    Returns:
        list[str]: One entry per token, in order, with every character matched
        by ``_STRIP_CHARS`` removed. Tokens consisting only of such characters
        become empty strings and are kept in the list. Stop words are not
        removed by this function.
    """
    return [_STRIP_CHARS.sub('', token) for token in nltk.word_tokenize(text)]

def removeURL(text):
    """Return ``text`` with every run of ``http`` plus following non-whitespace deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def removeSpace(text, stopwords):
    """Drop stop words and empty strings from a sequence of tokens.

    Stop-word matching is case-insensitive, so an entry such as "CLICK" also
    removes the lower-cased token "click". The stop words are collected into a
    set once, so each token costs a single hash lookup instead of a list scan.

    Args:
        text: Iterable of token strings.
        stopwords: Iterable of stop-word strings.

    Returns:
        list[str]: The non-empty tokens that are not stop words, in their
        original order.
    """
    blocked = {word.lower() for word in stopwords}
    return [token for token in text if token and token.lower() not in blocked]

def lemmatize(text):
    """Lemmatize every token in ``text`` with the module-level ``WNL`` lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    return list(map(WNL.lemmatize, text))

def tokenize(text):
    """Run ``nltk.word_tokenize`` on each item of ``text``.

    Returns a list containing one token list per input item, in order.
    """
    return list(map(nltk.word_tokenize, text))

def bigramDict(text):
    """Return every pair of adjacent tokens in ``text`` as a "first second" string.

    Despite the name, the result is a list (one entry per bigram, in order),
    not a dict.
    """
    return [' '.join(pair) for pair in nltk.bigrams(text)]

def wordsFreq(text):
    """Count two-word phrases across a collection of documents.

    Args:
        text: Iterable of document strings.

    Returns:
        list[tuple[str, int]]: ``(phrase, count)`` pairs sorted by descending
        count, where ``count`` is the phrase's total over all documents.
        An empty input yields an empty list.
    """
    documents = list(text)
    if not documents:
        # Nothing to count; fitting the vectorizer on no documents would raise.
        return []

    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vectorizer.fit_transform(documents)
    # Column sums give each phrase's total; flatten to 1-D for plain indexing.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [
        (phrase, int(totals[column]))
        for phrase, column in vectorizer.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq


In [120]:
# Default stop words shipped with the wordcloud package.
base_stopwords = list(STOPWORDS)

# Extra tokens to ignore in the posts, on top of the defaults.
posts_stopwords = base_stopwords + [
    "https", "png", "imgur", "n", "CLICK", "reddit", "ashx", "will", "CHART",
    "t", "ta", "st_c", "sma", "smsch_200p", "bb_20_2", "webp", "s", "l",
    "stofu_b_14_3_3", "macd_b_12_26_9", "rsi_b_14", "sch_200p", "p", "d", "c",
]

# Extra tokens to ignore in the comments, on top of the defaults.
comments_stopwords = base_stopwords + ["n", "nbsp", "http", "u", "s", "reddit", "will"]

In [121]:
# String-level cleaning: strip URLs, then lower-case, then drop digits.
posts_trimm = removeNum(lowerRep(removeURL(posts_trimm)))
# Token-level cleaning: tokenize and strip symbols, drop stop words and empty
# tokens, then lemmatize what is left.
posts_content = lemmatize(removeSpace(removeChar(posts_trimm), posts_stopwords))

In [123]:
# Tokenize each cleaned word individually (posts_token is not referenced by
# any later cell shown in this notebook).
posts_token = tokenize(posts_content)
# Join each pair of adjacent cleaned words into a single "first second" string.
posts_bigram = bigramDict(posts_content)

In [None]:
# Sanity check on posts only; do not run for comments!
# Show a short preview instead of dumping the entire list into the output.
posts_bigram[:20]

In [124]:
# Count how often each bigram string occurs, most frequent first.
posts_word_freq = wordsFreq(posts_bigram)

In [None]:
# Sanity check on posts only; do not run for comments!
# The list is sorted by descending count, so this shows the 20 most frequent
# phrases instead of dumping every entry into the output.
posts_word_freq[:20]

In [126]:
# Turn the (phrase, count) pairs into a phrase -> count mapping, which is what
# generate_from_frequencies is given below.
posts_dict = dict(posts_word_freq)

In [127]:
# 600x400 white canvas for the posts cloud.
# NOTE(review): per the wordcloud docs, `stopwords` is ignored by
# generate_from_frequencies, which is how this cloud is filled below.
wc_posts = WordCloud(width=600, height=400, background_color='white', stopwords=posts_stopwords)

In [None]:
# 600x400 white canvas for the comments cloud, filtering comments_stopwords.
wc_comments = WordCloud(width=600, height=400, background_color='white', stopwords=comments_stopwords)

In [128]:
# Lay out the posts cloud from the precomputed phrase -> count mapping.
wc_posts.generate_from_frequencies(posts_dict)

<wordcloud.wordcloud.WordCloud at 0x2872782e2e0>

In [None]:
# Build the comments cloud straight from the comments text string. Unlike the
# posts, none of the cleaning functions (URL/digit removal, lemmatization) are
# applied to comments_trimm in the cells shown here.
wc_comments.generate(comments_trimm)

In [129]:
# Save the posts cloud as a PNG; the relative path resolves against the
# current working directory.
wc_posts.to_file('postwordcloudtry.png')

<wordcloud.wordcloud.WordCloud at 0x2872782e2e0>

In [None]:
# Save the comments cloud as a PNG; the relative path resolves against the
# current working directory.
wc_comments.to_file('commentwordcloudtry.png')