In [1]:
from collections import deque
import json
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import numpy as np
import re

Import data from file

In [2]:
with open("compilation2.json", "r") as content:
    data = json.load(content)['data']

# Data Cleaning

Set types of characters and sequences to be filtered out

In [3]:
links = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
esc = re.compile("\n|u2019|u201c|u201d")

Set stop words

In [4]:
stop = set(stopwords.words('english'))
stop.update(['x200b', 'u', 'get', 'im', 'dont', 'go', 'like', 'also', 'real', 'really',
             'many', 'even', 'ive', 'way', 'us', 'wont', 'would', 'seem', 'whats', 'ever',
             'theyve', 'much', 'however', 'among', 'upon', 'every', 'around', 'could', 'maybe',
             'seems', 'anyone', 'anything', 'one', 'else', 'take', 'make', 'see', 'amp',
             'something', 'say', 'people', 'know', 'think', 'thing'])

Returns the Lemmatizer tag for the part of speech of a provided word

In [5]:
def part_of_speech(word):
    tag = pos_tag([word])[0][1][0].lower()
    if tag == wordnet.ADJ: 
        pos = 'j'
    elif tag == wordnet.VERB: 
        pos = 'v'
    elif tag == wordnet.ADV:
        pos = 'r'
    else:
        pos = 'n'
    
    return pos

Process text: remove documents with no content, remove undesirable characters, and convert words to their stems

In [6]:
def process(data):
    lemmatizer = WordNetLemmatizer()
    statements = deque()
    originals = deque()
    for post in data:
        if post.get('selftext') and post['selftext'] != "[removed]" and post['selftext'] != "":
            sentence = post['selftext']
            
            sentence = sentence.lower()  # convert all characters to lower case
            sentence = links.sub('', sentence)   # remove  links from text
            sentence = esc.sub(' ', sentence)    # remove escape sequences from text
            sentence = re.sub(r'[^\w\s]',' ', sentence)   # remove non-alphanumeric characters
            
            words = word_tokenize(sentence)
            words = [i for i in words if i not in stop]
            words = [lemmatizer.lemmatize(i, part_of_speech(i)) for i in words]
            if(len(words) > 3): 
                statements.append(" ".join(words))
                originals.append(post['selftext'])
    return statements, originals

In [7]:
statements, originals = process(data)
statements = np.array(statements)
originals = np.array(originals)
print(originals[0:5])
print(statements[0:5])

['*The TV told me outside is dangerous*\n\n*The TV told me to stay in my home*\n\n*The TV told me human interaction bad*\n\n*The TV told me I can’t hang out with you*\n\n*The TV told me to call the cops if I see you with your friends*\n\n*The TV told me to trust no one*'
 "Iv'e linked a .io game from itch.io that's pretty relevant to society as we see it now. It blows my mind to see people have the news tell them how to feel/act. It's a flash game where you control the media and set the narrative for a social downfall.  \n\n[https://ncase.itch.io/wbwwb](https://ncase.itch.io/wbwwb)"
 'After all, the protest seemed it had no end in sight?\n\nhttps://old.reddit.com/r/AskReddit/comments/fpl9nn/hong_kong_protesters_how_is_it_going_on_the_front/'
 '""Bill Gates predicted this was going to happen" NO he KNEW this was going to happen and is about to Holocaust our asses thanks to you fools'
 'that witch has ties to so many celebrities probably the handler for all of them looks like she was spr

In [8]:
words = []
for stmt in statements:
    for word in stmt.split():
        words.append(word)
words = np.unique(words)
            
print("Total statements: ", len(statements))
print("Total words: ", len(words))

Total statements:  9065
Total words:  40504


In [9]:
def get_stopwords():
    return stop

def get_processed():
    return statements

def get_originals():
    return originals