In [2]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from nlp import preprocess

from nltk.tokenize import RegexpTokenizer

In [3]:
# Load the raw pull from disk; the cells below filter this frame and derive the label from it.
reddit_df = pd.read_csv("./data/mbti_full_pull.csv")

In [4]:
# Normalise flair casing: the label letter taken from the flair is later compared
# against the lowercase literals 't' and 'f'.
reddit_df['author_flair_text'] = reddit_df['author_flair_text'].str.lower()

In [5]:
# Keep a flair only when it is a string of at most four characters; anything else
# is replaced by the sentinel 'drop' so the next cell can filter it out.
# The isinstance guard matters: a missing flair arrives here as a float NaN, and
# calling len() on it would raise TypeError and abort the cell.
reddit_df['author_flair_text'] = reddit_df['author_flair_text'].apply(
    lambda x: x if isinstance(x, str) and len(x) <= 4 else 'drop'
)

In [6]:
# Discard every row whose flair carries the 'drop' sentinel.
reddit_df = reddit_df.loc[reddit_df['author_flair_text'].ne('drop')]

In [7]:
# Label column: the third character (index 2) of the flair. The cells below treat
# it as a two-class 't' / 'f' target.
# `.str[2]` replaces a Python-level `x[2]` so that a flair which is missing or
# shorter than three characters yields NaN instead of raising TypeError/IndexError
# and aborting the cell; such NaN labels are removed by the later dropna().
reddit_df['t/f'] = reddit_df['author_flair_text'].str[2]

In [8]:
# Class balance of the label before the body-length filter and sampling below.
reddit_df['t/f'].value_counts()

t    851722
f    231912
Name: t/f, dtype: int64

In [9]:
# Force every body to str so the len()-based filter in the next cell only sees strings.
# NOTE(review): on pandas versions where astype('str') turns NaN into the text 'nan',
# a missing body is no longer NaN afterwards, so the later dropna() cannot catch it;
# such rows would instead be removed by the length filter. Confirm this is intended.
reddit_df['body'] = reddit_df['body'].astype('str')

In [10]:
# Keep only long posts: rows whose body is strictly longer than 2000 characters.
reddit_df = reddit_df[reddit_df['body'].apply(lambda x: len(x) > 2000)]

In [11]:
# Remove every row that still has a missing value in any column.
reddit_df = reddit_df.dropna(axis=0, how='any')

In [12]:
# Renumber the rows from 0 and discard the old index, which has gaps after the filters above.
reddit_df = reddit_df.reset_index(drop=True)

In [13]:
# Draw a balanced sample: 500 rows per class, without replacement.
# Both draws use the same fixed seed (22222) so the sample is reproducible.
t_sample, f_sample = (
    reddit_df.loc[reddit_df['t/f'] == label].sample(n=500, replace=False, random_state=22222)
    for label in ('t', 'f')
)

# Stack the two class samples row-wise ('t' rows first, then 'f' rows).
reddit_sample = pd.concat([t_sample, f_sample], axis=0)

In [14]:
# Sanity check: the sample should hold 500 rows for each of the two classes.
reddit_sample['t/f'].value_counts()

f    500
t    500
Name: t/f, dtype: int64

In [15]:
# Instantiate the `preprocess` class imported from the `nlp` module at the top of the notebook.
# Its methods are used in the cells below to clean and tokenize the post text.
prepro = preprocess()

In [16]:
# Create 'clean_posts' from the raw body with the target names taken out of the
# post text (behaviour as described by the original note; replace_mbti is defined
# in nlp.preprocess and not shown here).
reddit_sample['clean_posts'] = reddit_sample['body'].apply(prepro.replace_mbti)

In [17]:
# Each step below overwrites 'clean_posts' with its own output, so the call order matters.
# The per-step descriptions follow the original author's notes; the helpers themselves
# live in nlp.preprocess and are not shown here.

# Replace the '|||' separators with a space. Per the original note, one entry can hold
# several posts separated by three pipes with no surrounding spaces.
reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(prepro.pipe_remove)

# Remove URLs from the posts.
reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(prepro.url_remove)

# Remove punctuation. No extra argument is passed here, so punc_remove's own defaults
# decide which characters are stripped; the original note mentions `/`, `_` and `:`.
reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(prepro.punc_remove)

# Remove every character that is not ASCII.
reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(prepro.remove_symbols)

# Spelling correction is currently disabled; un-comment the next line to enable it.
# reddit_sample['clean_posts'] = reddit_sample['clean_posts'].apply(lambda x: prepro.spelling(x))

In [18]:
# Tokenize the cleaned text into a new column holding one list of tokens per post.
pattern = r"(?u)\b\w\w+\b" # tokens of two or more word characters (single-character tokens are skipped)
tokenizer = RegexpTokenizer(pattern) # instantiate tokenizer
reddit_sample['post_tokens'] = reddit_sample['clean_posts'].apply(tokenizer.tokenize) # Tokenize to new column

In [19]:
# Token-level clean-up; each step overwrites 'post_tokens' with its own output.

# Drop any remaining numeric tokens.
reddit_sample['post_tokens'] = reddit_sample['post_tokens'].apply(prepro.remove_dig_token)

# Drop stopwords.
reddit_sample['post_tokens'] = reddit_sample['post_tokens'].apply(prepro.remove_stopwords)

# Lemmatize the tokens.
# NOTE(review): pos=False is passed, which presumably switches POS tagging off even
# though the original comment said "with POS tagging". Confirm against nlp.preprocess.
reddit_sample['post_tokens'] = reddit_sample['post_tokens'].apply(
    lambda tokens: prepro.lemmend_pos(tokens, pos=False)
)

In [20]:
# Collapse each token list back into a single string, stored in the new
# 'joined_tokens' column.
reddit_sample['joined_tokens'] = reddit_sample['post_tokens'].apply(prepro.join_tokens)

In [21]:
# Sanity check: count missing values in 'joined_tokens' (expected to be 0).
reddit_sample['joined_tokens'].isna().sum()

0

In [23]:
# Display the processed sample; the rendered output elides the middle rows.
reddit_sample

Unnamed: 0,author_flair_text,body,subreddit,t/f,clean_posts,post_tokens,joined_tokens
16807,intj,Start reading [Mr. Money Mustache.](http://www...,intj,t,Start reading [Mr. Money Mustache.]( and subsc...,"[Start, reading, Mr, Money, Mustache, subscrib...",Start reading Mr Money Mustache subscribe frug...
15826,intj,&gt; I honestly think it just upsets people th...,intj,t,&gt; I honestly think it just upsets people th...,"[gt, honestly, think, upset, people, work, eas...",gt honestly think upset people work easier say...
11588,istp,"My interpretation, without having read any exp...",mbti,t,"My interpretation, without having read any exp...","[My, interpretation, without, read, explanatio...",My interpretation without read explanation The...
12667,intj,As someone who is making the current transitio...,intj,t,As someone who is making the current transitio...,"[As, someone, making, current, transition, car...",As someone making current transition career tr...
2814,intp,&gt; cruel even though I know it isn't intende...,INTP,t,&gt; cruel even though I know it isnt intended...,"[gt, cruel, even, though, know, isnt, intended...",gt cruel even though know isnt intended way Cr...
...,...,...,...,...,...,...,...
3290,enfj,thank you @sugoruyo for your in-depth reply. i...,INTP,f,thank you @sugoruyo for your in-depth reply. i...,"[thank, sugoruyo, depth, reply, comforting, in...",thank sugoruyo depth reply comforting informat...
16677,infj,"Oh, but it's actually great advice! I have to ...",infj,f,"Oh, but its actually great advice! I have to r...","[Oh, actually, great, advice, remind, time, ti...",Oh actually great advice remind time time Ther...
499,enfp,"Just give it some time until your ""attachment""...",ENFP,f,"Just give it some time until your ""attachment""...","[Just, give, time, attachment, people, past, f...",Just give time attachment people past fade Try...
676,infj,Holy narrow view batman. This whole video hone...,infj,f,Holy narrow view batman. This whole video hone...,"[Holy, narrow, view, batman, This, whole, vide...",Holy narrow view batman This whole video hones...


In [24]:
# Write the processed sample to disk.
path = './data/reddit_sample.csv'
# to_csv is called with its defaults, so the row index is written as an extra leading column.
# NOTE(review): the frame already carries an 'Unnamed: 0' column, which looks like an index
# written the same way upstream; consider index=False here if a further index column is unwanted.
reddit_sample.to_csv(path)