In [None]:
# Imports

import numpy as np
import pandas as pd

from nltk import FreqDist
from nltk.corpus import stopwords
import spacy

import re
from collections import OrderedDict
import itertools
import string

from wordcloud import WordCloud, get_single_color_func

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
%matplotlib inline

# pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 50

In [None]:
# 1000 new posts, pulled as of 2/14/24 (presumably the 1000 most recent -- TODO confirm against the scrape).
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df_new = pd.read_pickle('data/askreddit_new_reddit_data_02_14_2024.pkl')

In [None]:
df_new.head()

In [None]:
df_new['title'].info()

In [None]:
df_new['selftext'].info()

In [None]:
df_new['datetime'].describe()

In [None]:
# Keep only the columns needed for the text analysis. An explicit copy is taken
# because new columns are assigned to this frame in later cells; assigning into
# a plain column slice of df_new triggers pandas' SettingWithCopyWarning.
df_new_nlp = df_new[['id', 'title', 'selftext', 'datetime']].copy()

In [None]:
# Combine the title and body of the post.
# Missing titles/bodies are treated as empty strings: string concatenation with
# NaN yields NaN, and a NaN (float) value would fail in clean_string's .lower().

df_new_nlp['text_comb'] = (
    df_new_nlp['title'].fillna('') + '. ' + df_new_nlp['selftext'].fillna('')
)

In [None]:
def clean_string(text):
    """
    Normalize raw text before it is handed to the NLP pipeline.

    Steps, in order:
      1. Lowercase everything.
      2. Convert curly apostrophes/quotation marks to their straight forms.
      3. Drop every character that is not in ``string.printable``
         (emojis, ellipsis characters, etc.).
      4. Collapse every run of whitespace (spaces, tabs, newlines,
         carriage returns) into a single space.

    Whitespace is collapsed last on purpose: removing a character that sat
    between two spaces would otherwise leave a double space behind.
    Punctuation is kept; leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Make everything lowercase
    cleaned = text.lower()

    # Ensure apostrophes and quotation marks are consistent
    cleaned = re.sub(r"[’‘]", "'", cleaned)
    cleaned = re.sub(r"[“”]", '"', cleaned)

    # Remove any difficult characters like emojis, ellipses, etc.
    cleaned = ''.join(ch for ch in cleaned if ch in string.printable)

    # Remove excessive white space and newlines (must run after the filter above)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    return cleaned

In [None]:
# Clean every combined title/body string element-wise.
df_new_nlp['text_comb'] = df_new_nlp['text_comb'].map(clean_string)

Why not remove punctuation in the cleaning step?

Removing punctuation is a common step when cleaning text for NLP. I don't do it here because I plan to lemmatize the words, and lemmatization relies on the grammatical context that punctuation helps provide. Punctuation will be removed in a later step.

In [None]:
# Setting up Spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
# Merge the spaCy and NLTK stop words (plus punctuation marks) into a single,
# more comprehensive, de-duplicated list

sw_spacy = [*nlp.Defaults.stop_words]
sw_nltk = stopwords.words('english')
punct = [*string.punctuation]
stopword_list = list({*sw_spacy, *sw_nltk, *punct})

## text_comb column

In [None]:
df_new_nlp['text_comb'].isna().sum()

In [None]:
# Making a Spacy Doc for each post's combined text (title + body).
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per row; the resulting list is in row order.

df_new_nlp['spacy_text_comb'] = list(
    nlp.pipe(df_new_nlp['text_comb'].apply(clean_string))
)

In [None]:
# df_new_nlp.sample()['title']

In [None]:
df_new_nlp['text_comb'].loc[770]

In [None]:
# How Spacy's nlp object works:
# The sample is parsed once and the resulting Doc is reused for every printout,
# instead of re-running the whole pipeline on the same text for each line.

sample_title = df_new_nlp['title'].loc[770]
sample_doc = nlp(sample_title)

display('Spacy Object:')
display(sample_doc)
print('Type:')
print(type(sample_doc))
print()
print('Tokens:')
print([t.text for t in sample_doc])
print()
print('Lemmas:')
print([t.lemma_ for t in sample_doc])
print()
print('Detect punctuation, numbers, etc.:')
print([t.is_punct for t in sample_doc])
print([t.is_digit for t in sample_doc])

In [None]:
df_new_nlp.head()

### Tokenizing

In [None]:
def spacy_tokenize(spacy_doc):
    """
    Takes in a Spacy doc and converts it to a list of token strings.

    A token is omitted when it is whitespace-only, punctuation, a digit,
    URL-like, or contains non-ASCII characters. Whitespace tokens (tabs,
    repeated spaces, ...) are skipped so they cannot show up as "words" in
    frequency counts.

    Parameters
    ----------
    spacy_doc : iterable of tokens
        Each token must expose ``text``, ``is_ascii``, ``like_url``,
        ``is_digit``, ``is_punct`` and ``is_space``.

    Returns
    -------
    list of str
        The ``text`` of every token that was kept, in document order.
    """
    return [
        t.text
        for t in spacy_doc
        if t.is_ascii
        and not (t.like_url or t.is_digit or t.is_punct or t.is_space)
    ]

In [None]:
df_new_nlp['text_comb_tokens'] = df_new_nlp['spacy_text_comb'].apply(spacy_tokenize)

In [None]:
# Bar chart of the 50 most frequent tokens; bars are colored by whether the
# token is a stop word.

fig, ax = plt.subplots(figsize=(10,5))

all_tokens = df_new_nlp['text_comb_tokens'].explode()
top_50_tok = OrderedDict(FreqDist(all_tokens).most_common(50))

tokens = [*top_50_tok]
freq = [*top_50_tok.values()]
not_sw = [t for t in tokens if t not in stopword_list]

# True -> token is not a stop word, False -> token is a stop word
color_by_flag = {True: '#f14848', False: '#2c2fbf'}
bar_colors = [color_by_flag[t in not_sw] for t in tokens]
ax.bar(x=tokens, height=freq, color=bar_colors)

ax.set_xlabel('Tokens', size=10)
ax.set_ylabel('Frequency', size=10)
ax.set_xticklabels(tokens, rotation=45, ha='right')
ax.set_title('Top 50 tokens in r/askreddit\n2/9 to 2/14 2024')

# Proxy rectangles so the legend can explain the two bar colors
legend_colors = ['#2c2fbf', '#f14848']
custom_bars = [Rectangle((0,0),1,1,color=c, alpha=1) for c in legend_colors]
ax.legend(custom_bars, ['In stop words','Not in stop words'], fontsize=10)

fig.set_facecolor('white')
plt.tight_layout()
# plt.savefig('./images/top_50_tokens2.png', dpi=500)
plt.show()

In [None]:
# Remove stop words from every token list. The stop words are put in a set once
# so each membership check is a hash lookup rather than a scan of the whole list.
_stopword_set = set(stopword_list)
df_new_nlp['text_comb_tokens_no_sw'] = df_new_nlp['text_comb_tokens'].apply(
    lambda toks: [t for t in toks if t not in _stopword_set]
)

In [None]:
# Bar chart of the 50 most frequent tokens once stop words are removed.

fig, ax = plt.subplots(figsize=(10,5))

# An empty token list explodes to NaN, so dropping NaN is the same as skipping
# the rows that have no tokens left.
tokens_no_sw = df_new_nlp['text_comb_tokens_no_sw'].explode().dropna()
top_50_tok = OrderedDict(FreqDist(tokens_no_sw).most_common(50))

tokens = [*top_50_tok]
freq = [*top_50_tok.values()]

ax.bar(x=tokens, height=freq, color='#f14848')
ax.set_title('Top 50 tokens in r/askreddit (no stop words)\n2/9 to 2/14 2024')
ax.set_ylabel('Frequency', size=10)
ax.set_xlabel('Tokens', size=10)
ax.set_xticklabels(tokens, rotation=45, ha='right')

fig.set_facecolor('white')
plt.tight_layout()
# plt.savefig('./images/top_50_tokens_no_sw2.png', dpi=500)
plt.show()

### Lemmas

In [None]:
def spacy_lemmatize(spacy_doc):
    """
    Takes in a Spacy doc and converts it to a list of lowercased lemmas.

    A token is omitted when it is whitespace-only, punctuation, a digit,
    URL-like, or contains non-ASCII characters. Whitespace tokens (tabs,
    repeated spaces, ...) are skipped so they cannot show up as "lemmas" in
    frequency counts.

    Parameters
    ----------
    spacy_doc : iterable of tokens
        Each token must expose ``lemma_``, ``is_ascii``, ``like_url``,
        ``is_digit``, ``is_punct`` and ``is_space``.

    Returns
    -------
    list of str
        ``lemma_.lower()`` of every token that was kept, in document order.
    """
    return [
        t.lemma_.lower()
        for t in spacy_doc
        if t.is_ascii
        and not (t.like_url or t.is_digit or t.is_punct or t.is_space)
    ]

In [None]:
df_new_nlp['text_comb_lemmas'] = df_new_nlp['spacy_text_comb'].apply(spacy_lemmatize)

In [None]:
df_new_nlp.head()

In [None]:
# Check most common lemmas

FreqDist(df_new_nlp['text_comb_lemmas'].explode()).most_common(20)

### Lemmas without stop words

To remove stop words from the lemmatized tokens, I also need to lemmatize the stop words themselves.

This has to be done to the stop words while they are still within the string because SpaCy uses grammatical context to lemmatize.

In [None]:
# Lemmatized forms of the stop words, taken from the parsed posts so each stop
# word is lemmatized in its grammatical context.
sw_lem_Series = \
    df_new_nlp['spacy_text_comb'].apply(
        lambda x: [t.lemma_.lower() for t in x if t.text.lower() in stopword_list]
    )

# De-duplicate: without this the list holds one entry per stop-word occurrence
# in the whole corpus, which makes every later `in stopword_list_lem` check
# scan a huge list. Sorted so the result is stable from run to run.
stopword_list_lem = sorted(set(itertools.chain.from_iterable(sw_lem_Series)))

In [None]:
# Lemmatized tokens, excluding stop words.
# The lemmatized stop words are put in a set once so each membership check is a
# hash lookup instead of a scan over the whole list.

_stopword_lem_set = set(stopword_list_lem)
df_new_nlp['text_comb_lemmas_no_sw'] = df_new_nlp['text_comb_lemmas'].apply(
    lambda lemmas: [lem for lem in lemmas if lem not in _stopword_lem_set]
    )

In [None]:
df_new_nlp.head()

## Seeing most frequent lemmas without stopwords:

In [None]:
# Bar chart of the 50 most frequent lemmas once stop words are removed.
fig, ax = plt.subplots(figsize=(10,5))

# An empty lemma list explodes to NaN, so dropping NaN is the same as skipping
# the rows that have no lemmas left.
lemmas_no_sw = df_new_nlp['text_comb_lemmas_no_sw'].explode().dropna()
top_50_lem = OrderedDict(FreqDist(lemmas_no_sw).most_common(50))

tokens = [*top_50_lem]
freq = [*top_50_lem.values()]

ax.bar(x=tokens, height=freq, color='#f14848')
ax.set_title('Top 50 lemmas in r/askreddit (no stop words)\n2/9 to 2/14 2024')
ax.set_ylabel('Frequency', size=10)
ax.set_xlabel('Lemmas', size=10)
ax.set_xticklabels(tokens, rotation=45, ha='right')

fig.set_facecolor('white')
plt.tight_layout()
# plt.savefig('./images/top_50_lemmas_no_sw2.png', dpi=500)
plt.show()

In [None]:
# Show the cleaned text of the post with index label 7, selected from the posts
# whose lemma list contains 'taxis' (presumably to see which original word
# produced that lemma -- TODO confirm).
df_new_nlp[df_new_nlp['text_comb_lemmas'].apply(lambda x: 'taxis' in x)].loc[7]['text_comb']

In [None]:
# Print the full lemma list of that same post (index label 7) so the 'taxis'
# lemma can be seen next to its neighbors.
print(df_new_nlp[df_new_nlp['text_comb_lemmas'].apply(lambda x: 'taxis' in x)].loc[7]['text_comb_lemmas'])

**Poking around different frequent words**

In [None]:
df_new_nlp[df_new_nlp['text_comb_tokens'].apply(lambda x: 'code' in x)]

In [None]:
# Where's my refund is a hot topic

df_new_nlp[df_new_nlp['text_comb_tokens'].apply(lambda x: 'wmr' in x)]

### Word Clouds

In [None]:
# Shared word-cloud renderer: blue palette on a black 1500x1000 RGBA canvas.
wc_settings = {
    'colormap': 'Blues',
    'background_color': "black",
    'mode': "RGBA",
    'width': 1500,
    'height': 1000,
}
wc = WordCloud(**wc_settings)

In [None]:
# Token frequencies for the word cloud. A row whose token list is empty
# explodes to NaN; drop those so NaN is not counted as if it were a word.
fd_tokens = FreqDist(df_new_nlp['text_comb_tokens_no_sw'].explode().dropna())

In [None]:
# Lemma frequencies for the word cloud. A row whose lemma list is empty
# explodes to NaN; drop those so NaN is not counted as if it were a word.
fd_lemmas = FreqDist(df_new_nlp['text_comb_lemmas_no_sw'].explode().dropna())

In [None]:
# Word cloud of the most frequent tokens (stop words removed).
# The subtitle now states the date range of this data set (new posts, 2/9 to
# 2/14 2024, as on the bar charts); it previously carried a stale
# "Top posts of past year (as of 9/15/23)" label.
wc.generate_from_frequencies(fd_tokens)
plt.imshow(wc)
plt.axis("off")
plt.title('Frequent Words:\nReddit r/askreddit Titles and Submission\nNew posts, 2/9 to 2/14 2024', fontsize=13)
# plt.savefig('./images/word_cloud_tokens_text_comb2.png', dpi=500)
plt.show()

In [None]:
# Word cloud of the most frequent lemmas (stop words removed).
# The subtitle now states the date range of this data set (new posts, 2/9 to
# 2/14 2024, as on the bar charts); it previously carried a stale
# "Top posts of past year (as of 9/15/23)" label.
wc.generate_from_frequencies(fd_lemmas)
plt.imshow(wc)
plt.axis("off")
plt.title('Frequent Lemmas:\nReddit r/askreddit Titles and Submission\nNew posts, 2/9 to 2/14 2024', fontsize=13)
# plt.savefig('./images/word_cloud_lemmas_text_comb2.png', dpi=500)
plt.show()

## Comments

In [None]:
df_new['comments']

In [None]:
# I can pull data like comment body and author name from the praw comment object.
# Each entry of df_new['comments'] is iterated here, and every item it yields is
# read for its .body / .author attribute.

# One list of comment bodies per post
display(df_new['comments'].apply(lambda x: [top_level_comment.body for top_level_comment in x]))

# One list of comment authors per post
display(df_new['comments'].apply(lambda x: [top_level_comment.author for top_level_comment in x]))

# Lots of GIFs in these comments that I should remove

In [None]:
# New df. Exploding the comments so each comment is its own row.
# This is only showing top-level comments at this point.
#
# Built as a single chain from df_new so that nothing is assigned into a slice
# of df_new (which triggers SettingWithCopyWarning), columns are renamed by
# name rather than by position, and the old index is dropped directly.

df_comments_new_nlp = (
    df_new[['id', 'title', 'datetime', 'comments']]
    # Materialize each post's comments as a plain list so explode() can split it
    .assign(comments=lambda d: d['comments'].apply(list))
    .explode('comments')
    .rename(columns={
        'id': 'post_id',
        'title': 'post_title',
        'datetime': 'post_datetime',
        'comments': 'comment_object',
    })
    .reset_index(drop=True)
)
df_comments_new_nlp.head()

In [None]:
df_comments_new_nlp['comment_object'].info()

In [None]:
df_comments_new_nlp.isna().sum()

In [None]:
df_comments_new_nlp = df_comments_new_nlp.dropna(subset=['comment_object'])

In [None]:
# Pull the text (.body) out of every comment object into its own column

df_comments_new_nlp['comment'] = [
    comment_obj.body for comment_obj in df_comments_new_nlp['comment_object']
]

In [None]:
df_comments_new_nlp['comment'] = df_comments_new_nlp['comment'].apply(clean_string)

### Duplicates and useless comments

In [None]:
df_comments_new_nlp['comment'].value_counts()

In [None]:
# Filter out the placeholder text left behind by deleted and removed comments

placeholder_texts = ['[deleted]', '[removed]']
is_placeholder = df_comments_new_nlp['comment'].isin(placeholder_texts)
df_comments_new_nlp = df_comments_new_nlp[~is_placeholder]

In [None]:
# Filter out comments that contain the '![gif]' embed marker

has_gif = df_comments_new_nlp['comment'].str.contains('![gif]', regex=False)
df_comments_new_nlp = df_comments_new_nlp[~has_gif]

In [None]:
# Keep only comments that are at least two characters long
# (i.e. discard blank and single-character comments)

long_enough = df_comments_new_nlp['comment'].str.len() >= 2
df_comments_new_nlp = df_comments_new_nlp[long_enough]

In [None]:
# Username column, used below when checking for duplicate comments.

def _author_name(comment_obj):
    """Return the comment author's name, or 'no_user' when the author is falsy."""
    author = comment_obj.author
    return author.name if author else 'no_user'

df_comments_new_nlp['user'] = df_comments_new_nlp['comment_object'].map(_author_name)

In [None]:
# Drop duplicate comments by the same user, keeping the first occurrence.
# `keep` only accepts 'first', 'last' or False; the previous value was a
# corrupted spelling of 'first' and made drop_duplicates raise ValueError.

df_comments_new_nlp = df_comments_new_nlp.drop_duplicates(subset=['user', 'comment'], keep='first')

In [None]:
# Remove everything written by AutoModerator (a moderator bot)

not_automod = df_comments_new_nlp['user'].ne('AutoModerator')
df_comments_new_nlp = df_comments_new_nlp[not_automod]

In [None]:
# Some duplicate comments may remain, but they are not by the same user so I will view them as unique

display(df_comments_new_nlp[df_comments_new_nlp.duplicated(subset=['comment'])])

display(df_comments_new_nlp[df_comments_new_nlp.duplicated(subset=['comment', 'user'])])

### Tokenizing Comments

In [None]:
# Spacy object of each comment