In [168]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from mbti import preprocess

from nltk.tokenize import RegexpTokenizer

In [169]:
# Load the full MBTI pull into a DataFrame; later cells rely on its
# `author_flair_text` and `body` columns.
reddit_df = pd.read_csv(filepath_or_buffer="./data/mbti_full_pull.csv")

In [170]:
# Normalise the flair text to lower case so later comparisons are case-insensitive.
reddit_df = reddit_df.assign(
    author_flair_text=reddit_df['author_flair_text'].str.lower()
)

In [171]:
# Keep flairs of at most 4 characters and mark everything else with the
# sentinel 'drop' (the marked rows are filtered out in the next cell).
# Vectorised and NaN-safe: a missing flair has a NaN length, the comparison
# is False, and the row is marked 'drop' instead of raising
# "TypeError: object of type 'float' has no len()" as the per-row len() did.
reddit_df['author_flair_text'] = reddit_df['author_flair_text'].where(
    reddit_df['author_flair_text'].str.len() <= 4,
    'drop',
)

In [172]:
# Discard the rows whose flair was replaced by the 'drop' sentinel.
reddit_df = reddit_df.loc[reddit_df['author_flair_text'].ne('drop')]

In [173]:
# Target label: the third character of the flair (for a four-letter MBTI
# code this is the thinking/feeling letter, 't' or 'f').
# `.str[2]` is vectorised and returns NaN for flairs shorter than three
# characters -- the earlier `len <= 4` filter lets those through -- instead
# of raising IndexError as plain `x[2]` indexing did. Rows left with NaN are
# removed by the `dropna()` cell further down.
# NOTE(review): nothing here checks that the flair is a valid MBTI code.
reddit_df['t/f'] = reddit_df['author_flair_text'].str[2]

In [174]:
# Class balance of the target column before any sampling.
reddit_df.loc[:, 't/f'].value_counts()

t    851722
f    231912
Name: t/f, dtype: int64

In [175]:
# Force every post body to a Python string so string-length checks are safe.
# Note: missing bodies become the literal text 'nan' after this cast.
reddit_df = reddit_df.assign(body=reddit_df['body'].astype(str))

In [176]:
# Keep only posts whose body is longer than 2000 characters.
reddit_df = reddit_df.loc[reddit_df['body'].str.len().gt(2000)]

In [177]:
# Remove every row that has a missing value in any column.
reddit_df = reddit_df.dropna(axis=0, how='any')

In [178]:
# After filtering, renumber the rows 0..n-1 and discard the old index.
reddit_df = reddit_df.reset_index(level=None, drop=True)

In [179]:
# Draw a balanced sample: 500 rows per class, without replacement, using the
# same fixed seed for each draw so the selection is reproducible.
t_sample, f_sample = (
    reddit_df.loc[reddit_df['t/f'].eq(label)].sample(
        n=500, replace=False, random_state=22222
    )
    for label in ('t', 'f')
)

# Stack the two class samples row-wise ('t' rows first, then 'f' rows).
reddit_sample = pd.concat([t_sample, f_sample], axis=0)

In [180]:
# Confirm the sample is balanced between the two classes.
reddit_sample.loc[:, 't/f'].value_counts()

f    500
t    500
Name: t/f, dtype: int64

In [181]:
# Instantiate the `preprocess` class imported from mbti.py.
# Per the original notes, this class provides functions to clean and tokenize
# our text data; the instance is reused by every cleaning cell below.
prepro = preprocess()

In [182]:
# Run `replace_mbti` over every post body; per the original note this removes
# the target (MBTI type) names from the text. The result seeds `clean_posts`.
reddit_sample['clean_posts'] = reddit_sample['body'].apply(prepro.replace_mbti)

In [183]:
# Text clean-up, applied to every post in this order (descriptions follow the
# original author's notes on the mbti.preprocess helpers):
#   1. pipe_remove    - replace the '|||' separators between posts with a space
#   2. url_remove     - remove URLs
#   3. punc_remove    - remove punctuation (which characters depends on the
#                       helper's arguments; the defaults are used here)
#   4. remove_symbols - remove all non-ASCII characters
reddit_sample['clean_posts'] = (
    reddit_sample['clean_posts']
    .apply(prepro.pipe_remove)
    .apply(prepro.url_remove)
    .apply(prepro.punc_remove)
    .apply(prepro.remove_symbols)
)

# Spelling correction is currently disabled:
# reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(lambda x: prepro.spelling(x))

In [184]:
# Tokenize the cleaned posts into a new `post_tokens` column.
# The pattern keeps runs of two or more word characters (letters, digits or
# underscore), so single-character tokens are discarded.
pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(pattern)
reddit_sample['post_tokens'] = reddit_sample['clean_posts'].map(tokenizer.tokenize)

In [185]:
# Token-level normalisation, applied to every token list in this order
# (descriptions follow the original author's notes):
#   1. remove_dig_token - remove any remaining numeric digits
#   2. remove_stopwords - remove stopwords
#   3. lemmend_pos      - lemmatize the words; called with pos=False
#                         (presumably this turns POS tagging off -- TODO confirm
#                         against mbti.preprocess)
reddit_sample['post_tokens'] = (
    reddit_sample['post_tokens']
    .apply(prepro.remove_dig_token)
    .apply(prepro.remove_stopwords)
    .apply(prepro.lemmend_pos, pos=False)
)

In [186]:
# New `joined_tokens` column: each token list joined back into one long string.
reddit_sample['joined_tokens'] = reddit_sample['post_tokens'].apply(prepro.join_tokens)

In [187]:
# Sanity check: count missing values in the joined text column.
reddit_sample['joined_tokens'].isnull().sum()

0

In [188]:
# Persist the cleaned, balanced sample to CSV (the DataFrame index is written
# as the first, unnamed column because `index` is left at its default).
path = './data/reddit_sample_clean500.csv'
reddit_sample.to_csv(path_or_buf=path)