## Import libraries

In [1]:
import pandas as pd
import numpy as np
import re

# preprocessing imports
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 

import warnings
warnings.filterwarnings('ignore')

## Load datasets

In [2]:
hp_df = pd.read_csv("../data/harrypotter.csv", encoding="utf-8", index_col=0)
hp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2885 entries, 0 to 4988
Data columns (total 7 columns):
author         2885 non-null object
created_utc    2885 non-null int64
selftext       2885 non-null object
subreddit      2885 non-null object
title          2885 non-null object
url            2885 non-null object
created        2885 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 180.3+ KB


In [3]:
# Change created_utc column to datetime datetime datatype
hp_df['created_utc'] = pd.to_datetime(hp_df['created_utc'],unit='s')

In [4]:
# Confirm data format is correct
hp_df.head()

Unnamed: 0,author,created_utc,selftext,subreddit,title,url,created
0,Hoshcof,2019-06-20 17:12:25,If wizards and witches can use floo powder as ...,harrypotter,Question about communication,https://www.reddit.com/r/harrypotter/comments/...,1561022000.0
1,punkiy50,2019-06-20 16:52:11,Let's be Friends in Harry Potter: Wizards Unit...,harrypotter,Add ne guys,https://www.reddit.com/r/harrypotter/comments/...,1561021000.0
4,Sleeper____Service,2019-06-20 16:16:38,Ginny Weasley is a horse girl.,harrypotter,I just learned that Ginny Weasley's patronus i...,https://www.reddit.com/r/harrypotter/comments/...,1561019000.0
5,boujee-bob,2019-06-20 16:09:56,Doing yet another re read (never gets old) but...,harrypotter,Did Dumbledore know Lockhart was a fraud when ...,https://www.reddit.com/r/harrypotter/comments/...,1561018000.0
7,KingAchilles08,2019-06-20 16:07:31,Add me on the game! 3944 6644 2348\n\nSee you ...,harrypotter,Harry Potter Wizards Unite Friend code,https://www.reddit.com/r/harrypotter/comments/...,1561018000.0


In [5]:
# Add a binary label 
hp_df['is_hp'] = 1
# Remove redundant datetime column
hp_df.drop('created',axis=1,inplace=True)

In [6]:
fb_df = pd.read_csv("../data/fantasticbeasts.csv", encoding="utf-8", index_col=0)
fb_df.head()

Unnamed: 0,author,created_utc,selftext,subreddit,title,url,created
6,Zivio104,1559566660,Don't get me wrong I really like this series d...,FantasticBeasts,The problem I have with this series,https://www.reddit.com/r/FantasticBeasts/comme...,1559537860.0
9,MetalVenomLudens,1559486360,Mods can remove this if they feel this post is...,FantasticBeasts,Should we create a Discord server for Fantasti...,https://www.reddit.com/r/FantasticBeasts/comme...,1559457560.0
10,MetalVenomLudens,1559486294,Mods can remove this if they feel this post is...,FantasticBeasts,Should we create a Discord server for Fantasti...,https://www.reddit.com/r/FantasticBeasts/comme...,1559457494.0
11,Stephen111110,1559479412,I wont go too deep incase of spoilers. But.\nT...,FantasticBeasts,Theory on Credence,https://www.reddit.com/r/FantasticBeasts/comme...,1559450612.0
13,CubanPete18,1559432978,"At a certain point in the film, Krall refers t...",FantasticBeasts,Question about Dumbledore in The Crimes of Gri...,https://www.reddit.com/r/FantasticBeasts/comme...,1559404178.0


In [7]:
fb_df.shape

(1221, 7)

In [8]:
# Change created_utc column to datetime datetime datatype
fb_df['created_utc'] = pd.to_datetime(fb_df['created_utc'],unit='s')

In [9]:
# Add a binary label 
fb_df['is_hp'] = 0
# Remove redundant datetime column
fb_df.drop('created',axis=1,inplace=True)

In [10]:
df = pd.concat([hp_df, fb_df], ignore_index=True)

In [11]:
df.shape

(4106, 7)

In [12]:
df = df.query("subreddit == 'FantasticBeasts' | subreddit == 'harrypotter'")

In [13]:
df = df.reset_index(drop=True)

In [14]:
df['subreddit'].value_counts()

harrypotter        2885
FantasticBeasts    1127
Name: subreddit, dtype: int64

## Tokenize titles and posts

1. Create a concatenated string of titles and posts 

In [15]:
df['all_text'] = df['title'] + ' ' + df['selftext']

In [16]:
rt = RegexpTokenizer(r"[\w\/\']+") # regex to include words, slash characters for urls, apostrophes

In [17]:
df.all_text.sample(5)

825     Favorite character Whose everyone’s favorite c...
44      I forgot how amazing the books are I reread th...
3265    Muggles, no maj, and “can’t spells” What did e...
2026    Need Audible suggestions? So I’ve re-listened ...
529     Relative Strength of Voldemort's Horcruxes If ...
Name: all_text, dtype: object

2. Tokenize the new feature (combination of title and body text) by removing the weird HTML formatted characters and numbers

In [None]:
for i, text in enumerate(df.all_text): # for each string in our combined column
    text_loop = text.replace('amp;',' ')
    text_loop = text_loop.replace('x200B;',' ') 
    text_loop = text_loop.replace('nbsp;',' ')
    text_loop = text_loop.replace('https',' ')
    df.all_text.iloc[i] = text_loop.replace('\n',' ').strip()

In [None]:
text_tokens = []  

for i in range(len(df.all_text)):
    loop_tokens = rt.tokenize(df.all_text.iloc[i].lower()) 
    for j, token in enumerate(loop_tokens):
        if re.match(r"\d+[\w]*", token): #remove numbers
            loop_tokens[j] = ''
        if ('harry' in token)|('potter' in token)|('fantastic' in token)|('beasts' in token):
            loop_tokens[j] = ''
    text_tokens.append(loop_tokens)                   

In [20]:
# create a list of string to store the tokenized text for vectorizer
posts_list = []

for post in text_tokens:
    posts_list.append(' '.join(post))

## Stem tokens

Improve the modeling ability of the strings by using a stemmer, which trims characters from each word to convert it to a stem. Words will register as equivalent during feature extraction if they share a stem (i.e., computer,computing,computed all result in a stem of comput).

In [21]:
# Instantiate object of class PorterStemmer.
p_stemmer = PorterStemmer()

In [22]:
# Stem tokens.
posts_st = [] # posts tokenized stemmed

for post in text_tokens:
    post_st = [] # empty post stems
    for word in post:
        word_st = p_stemmer.stem(word) # get stem of word
        post_st.append(word_st) # add to post list
    posts_st.append(post_st)  # add post list to stem matrix
    
posts_st[1][:10]

['add', 'ne', 'guy', "let'", 'be', 'friend', 'in', '', '', 'wizard']

A bit weird that 'harry' turns into 'harri'. But this shouldn't affect the modeling. Let's check out the model accuracy later.

In [23]:
# create a list of string to store the tokenized stems for vectorizer
posts_st_list = []

for post in posts_st:
    posts_st_list.append(' '.join(post))

## Lemmatize tokens

Try lemmatizing as well. It is however arguable here whether stemming/lemmatizing would add value to the models at all, since J.K. Rowling coined a lot of new words from old English in the novels, which are quite important in differentiating the theme. It is suspected that stemming/lemmatizing might actually do the opposite by taking out the distintiveness.

Therefore, we will just create these tokens and assess each type of tokens separately and see if they are going to help improve the initial model.

In [24]:
# Instantiate lemmatizer. 
p_lemmatizer = WordNetLemmatizer()

In [25]:
# Lemmatize tokens.
posts_lm = [] # posts tokenized stemmed

for post in text_tokens:
    post_lm = [] # empty post stems
    for word in post:
        word_lm = p_lemmatizer.lemmatize(word) # get stem of word
        post_lm.append(word_lm) # add to post list
    posts_lm.append(post_lm)  # add post list to stem matrix
    
posts_lm[1][:10]

['add', 'ne', 'guy', "let's", 'be', 'friend', 'in', '', '', 'wizard']

Lemmatization looks to generate more semantically sensible tokens, as compared to stemming.

In [26]:
# create a list of string to store the lemmatized tokens for vectorizer
posts_lm_list = []

for post in posts_lm:
    posts_lm_list.append(' '.join(post))

## Store original tokens for posts, stemmed and lemmatized to the dataframes

In [27]:
df_model = pd.DataFrame(data=[posts_list, posts_st_list, posts_st_list], index=['post','post_st','post_lm'])

In [28]:
df_model = df_model.T

In [29]:
df_model['all_text'] = df['all_text']
df_model['is_hp'] = df['is_hp']

In [30]:
df_model.head()

Unnamed: 0,post,post_st,post_lm,all_text,is_hp
0,question about communication if wizards and wi...,question about commun if wizard and witch can ...,question about commun if wizard and witch can ...,Question about communication If wizards and wi...,1
1,add ne guys let's be friends in wizards unit...,add ne guy let' be friend in wizard unit my ...,add ne guy let' be friend in wizard unit my ...,Add ne guys Let's be Friends in Harry Potter: ...,1
2,i just learned that ginny weasley's patronus i...,i just learn that ginni weasley' patronu is a ...,i just learn that ginni weasley' patronu is a ...,I just learned that Ginny Weasley's patronus i...,1
3,did dumbledore know lockhart was a fraud when ...,did dumbledor know lockhart wa a fraud when he...,did dumbledor know lockhart wa a fraud when he...,Did Dumbledore know Lockhart was a fraud when ...,1
4,wizards unite friend code add me on the game...,wizard unit friend code add me on the game ...,wizard unit friend code add me on the game ...,Harry Potter Wizards Unite Friend code Add me ...,1


## Train/Test split

In [31]:
X = df_model[['post','post_st','post_lm','all_text']]
y = df_model['is_hp']

In [32]:
# Split train/test set with 70/30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [33]:
y_train = pd.DataFrame(y_train, columns=['is_hp'])
y_test = pd.DataFrame(y_test, columns=['is_hp'])

In [34]:
X_train.to_csv('../tokenized_data/X_train.csv', index=True)
X_test.to_csv('../tokenized_data/X_test.csv', index=True)
y_train.to_csv('../tokenized_data/y_train.csv', index=True)
y_test.to_csv('../tokenized_data/y_test.csv', index=True)