# Cleaning data

In [1]:
# Import libraries here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import regex as re

### 1. Load in data

In [2]:
# read in data from my pickles; the context managers close each file handle
# as soon as it has been read instead of leaving it open for the kernel's lifetime
# NOTE: pickle.load can execute arbitrary code -- only load pickles this project wrote
with open("../assets/lan.pkl", "rb") as lan_file:
    lan = pickle.load(lan_file)
with open("../assets/rbn.pkl", "rb") as rbn_file:
    rbn = pickle.load(rbn_file)

Select columns that I want to use

In [3]:
# keep only the columns used in the rest of this notebook
lan_columns = ["title", "num_comments", "selftext", "created_utc", "subreddit"]
lan = lan[lan_columns]

In [4]:
# keep only the columns used in the rest of this notebook
rbn_columns = ["title", "num_comments", "selftext", "created_utc", "subreddit"]
rbn = rbn[rbn_columns]

Concatenate my two dataframes into one (rows stacked on top of each other)

In [5]:
# stack the two data frames row-wise
# ignore_index=True gives the combined frame a fresh 0..n-1 index; by default
# concat carries over each source frame's row labels, which may collide
reddit = pd.concat([lan, rbn], ignore_index=True)

### 2. Create a column that combines title and selftext

In [6]:
# fill null values in self text with an empty space
# (assign the result back: fillna(inplace=True) on a column selection is chained
# assignment, which pandas warns about and which does not update the frame under
# Copy-on-Write)
reddit["selftext"] = reddit["selftext"].fillna(" ")

In [7]:
# replace self text that was removed or deleted
# The placeholders are matched literally (regex=False): this drops the invalid
# "\[" escape sequences and no longer depends on str.replace's default regex
# mode, which is not the same across pandas versions.
reddit["selftext"] = (
    reddit["selftext"]
    .str.replace("[removed]", " ", regex=False)
    .str.replace("[deleted]", " ", regex=False)
)

In [8]:
# remove URLs: "http://" or "https://" followed by any run of non-whitespace
# (raw string so the regex escape \S is not parsed as an invalid Python string
# escape; the pattern matches exactly what 'http[s]?:\/\/[^\s]*' matched)
reddit['selftext'] = reddit['selftext'].map(lambda text: re.sub(r'https?://\S*', ' ', text))

In [9]:
# remove mentions of subreddits, written as r/name or /r/name
# (?:^|\s) lets a mention match at the very start of the text as well as after
# whitespace; requiring a leading \s alone skipped a mention that opens the post
reddit['selftext'] = reddit['selftext'].map(lambda text: re.sub(r'(?:^|\s)/?r/\S+', ' ', text))

In [10]:
# build the combined column: title, one separating space, then the self text
title_text = reddit["title"]
body_text = reddit["selftext"]
reddit["title_selftext"] = title_text + " " + body_text

### 3. Clean up text data

Remove extra characters from text

In [11]:
# convert the combined text to lowercase
combined_text = reddit["title_selftext"]
reddit["title_selftext"] = combined_text.str.lower()

In [12]:
# remove single quotations: the curly ’ and the straight ' are both deleted
# in one pass over the column via a single character class
reddit["title_selftext"] = reddit["title_selftext"].map(lambda text: re.sub("[’']", "", text))

In [13]:
# replace every character that is not a word character with a space
# \W is the complement of \w, and \w already covers digits, so the separate \d
# in the old character class was redundant; the raw string keeps the backslash
# from being parsed as an invalid Python string escape
reddit["title_selftext"] = reddit["title_selftext"].map(lambda text: re.sub(r"\W", " ", text))

In [14]:
# replace every character outside a-z / A-Z with a space
def _letters_only(text):
    """Return `text` with each non-ASCII-letter character replaced by a space."""
    return re.sub("[^a-zA-Z]", " ", text)

reddit['title_selftext'] = reddit['title_selftext'].map(_letters_only)

In [15]:
# collapse every run of two or more whitespace characters into a single space
# (no upper bound on the run length: with "{2,6}" a run of seven or more was
# consumed six at a time and still left consecutive spaces behind)
reddit['title_selftext'] = reddit['title_selftext'].map(lambda text: re.sub(r"\s{2,}", " ", text))

### 4. Pickle data for later

In [16]:
# write the cleaned frame to disk; the context manager closes the file handle
# once the dump finishes
with open("../assets/reddit.pkl", "wb") as reddit_file:
    pickle.dump(reddit, reddit_file)