# Preprocessing for the Reddit Dataset

In [1]:
import pandas as pd
import regex as re
import nltk
from pyarrow import feather
from datetime import datetime
from nltk.corpus import stopwords

# TODO: on first execution, download the NLTK stopwords and punkt tokenizer data (uncomment the lines below)
#nltk.download('stopwords')
#nltk.download('punkt')

## Data Loading

In [2]:
# Load the scraped posts. 'id' is moved into the index here; post_data below
# discards it again through reset_index(drop=True).
data = (
    pd.read_json('reddit_res/reddit_data.json')
    .set_index('id')
    # Linked resources (permalink, url) are dropped. The author is dropped as
    # well because the twitter dataset doesn't have that either.
    # TODO: decide if we want to keep the author
    .drop(columns=['permalink', 'url', 'author'])
)

# Post-level table: every remaining column except the nested comment trees.
post_data = data.drop(columns=['comments']).reset_index(drop=True)


def process_comments(comment_df_list_, comment_data_):
    """Flatten nested comment trees into a list of DataFrames.

    Each non-empty entry of ``comment_data_`` is a collection of comment
    records. It is turned into a DataFrame, its replies (the ``comments``
    column) are processed recursively, and the frame itself, without the
    ``comments`` column, is appended to ``comment_df_list_``.

    Args:
        comment_df_list_: list that collects the resulting DataFrames; it is
            mutated in place. Replies are appended before their parents.
        comment_data_: iterable of record collections. Every record needs the
            keys ``id``, ``created``, ``permalink``, ``author`` and
            ``comments``.

    Returns:
        None. The result is delivered through ``comment_df_list_``.
    """
    for raw_records in comment_data_:
        # Nothing to convert for posts/comments without replies.
        if len(raw_records) == 0:
            continue

        frame = pd.DataFrame.from_records(raw_records).set_index('id')
        frame['created'] = frame['created'].astype(int)
        # Remove the same metadata columns as for posts and discard the id index.
        frame = frame.drop(columns=['permalink', 'author']).reset_index(drop=True)

        # Descend into the replies first, then store this level without them.
        process_comments(comment_df_list_, frame['comments'])
        comment_df_list_.append(frame.drop(columns=['comments']))


# Flatten every comment tree (including nested replies) into a single frame
# with a fresh 0..n-1 index.
comment_list_dfs = []
process_comments(comment_list_dfs, data['comments'])
comment_data = pd.concat(comment_list_dfs, ignore_index=True)

## Clean Data
- based on the Twitter preprocessing
- additionally removes links (URLs)

In [3]:
def remove_url(text):
    """Return ``text`` with every URL removed.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. It is replaced by the empty
    string, so the whitespace around it is left untouched.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

# Strip URLs from every free-text column of posts and comments.
post_data['title'] = post_data['title'].apply(remove_url)
post_data['text'] = post_data['text'].apply(remove_url)
comment_data['text'] = comment_data['text'].apply(remove_url)


def general_cleaning(stop_words_, text):
    """Tokenize, lowercase and stop-word-filter ``text``.

    Args:
        stop_words_: collection of lowercase tokens to discard.
        text: string to clean; tokenized with NLTK's German word tokenizer.

    Returns:
        The kept tokens in lowercase, each followed by a single space (so the
        result ends with a space), or ``pd.NA`` when no token is kept.
    """
    lowered_tokens = (
        token.lower()
        for token in nltk.tokenize.word_tokenize(text, language='german')
    )
    cleaned = ''.join(
        f'{token} ' for token in lowered_tokens if token not in stop_words_
    )
    # An empty result is reported as missing so callers can drop the row.
    return cleaned if cleaned != "" else pd.NA

# Start from NLTK's German stop-word list.
stop_words = set(stopwords.words('german'))

# Line breaks, URL remnants and brackets, plus 'removed'/'deleted'
# (presumably what is left of Reddit's [removed]/[deleted] placeholders
# after tokenization -- verify against the data).
stop_words.update({
    '\n', '\\-',
    'http', 'https', '//',
    '(', ')', '[', ']',
    'removed', 'deleted',
})

# TODO: decide if we want to remove , ; : and so on
stop_words.update({
    '!', '.', '?', ',', ';', ':',
    '|', '<', '>', '-',
    '..', '...',
    "''", '``', '´´',
})
# remove more if needed!


# Subreddit names only get case-normalised.
post_data['subreddit'] = post_data['subreddit'].str.lower()

# Tokenize, lowercase and stop-word-filter every free-text column.
for frame, column in (
    (post_data, 'title'),
    (post_data, 'text'),
    (comment_data, 'text'),
):
    frame[column] = frame[column].apply(
        lambda raw: general_cleaning(stop_words, raw)
    )

# general_cleaning returns pd.NA for texts that end up empty; rows containing
# any missing value are removed here.
# NOTE(review): this also discards posts whose title is fine but whose text
# is empty after cleaning -- confirm that this is intended.
for frame in (post_data, comment_data):
    frame.dropna(inplace=True)

## Get timestamps in human-readable format

In [4]:
# Convert the 'created' columns from Unix timestamps in seconds to naive UTC
# datetimes. pd.to_datetime(unit='s') replaces the per-row
# datetime.utcfromtimestamp call, which is deprecated since Python 3.12, and
# converts each column in one vectorized step with the same resulting values.
post_data['created'] = pd.to_datetime(post_data['created'], unit='s')
comment_data['created'] = pd.to_datetime(comment_data['created'], unit='s')

## Export as Feather Datasets

In [5]:
# Write the cleaned posts and comments as Feather files.
for frame, target in (
    (post_data, 'reddit_res/posts.ftr'),
    (comment_data, 'reddit_res/comments.ftr'),
):
    feather.write_feather(frame, target)