In [16]:
import sys
import os
import numpy as np
import pandas as pd
import string

In [17]:
corpus_raw = pd.read_csv('../data/corpus_raw.csv')
corpus_raw.shape

(11594, 3)

---
# Raw Exploration

Note the stratification of the three classes

In [18]:
corpus_raw['class'].value_counts(normalize=True)

asoiaf         0.463688
tolkienfans    0.452820
Narnia         0.083491
Name: class, dtype: float64

Display 10 posts from the Tolkien class

In [28]:
corpus_raw[corpus_raw['class']=='tolkienfans'].head(10)

Unnamed: 0,class,id,text
0,tolkienfans,hbzj1y0,[https://www.bbc.co.uk/news/entertainment-arts...
1,tolkienfans,hbzj1y1,"RIP Bilbo. Ian Holm, has passed away aged 88"
2,tolkienfans,2subbe0,Let me start by saying that I enjoy many aspec...
3,tolkienfans,2subbe1,I have recut PJ‚Äôs Hobbit trilogy into a single...
4,tolkienfans,eqrkgb1,It was awfully nice of the 13 Dwarves to empty...
5,tolkienfans,ac25a01,127 years ago today J.R.R. Tolkien was born; H...
6,tolkienfans,dzlvf80,We all owe him an enormous debt for his years ...
7,tolkienfans,dzlvf81,Happy birthday to Christopher Tolkien who turn...
8,tolkienfans,cwre6j0,The forklift comes equipped with a Japanese NI...
9,tolkienfans,cwre6j1,"I'm just wondering, what was Tolkien's opinion..."


Display 10 posts from the Martin class

In [20]:
corpus_raw[corpus_raw['class']=='asoiaf'].head(10)

Unnamed: 0,class,id,text
1892,asoiaf,bo4ae60,In the inside the episode (which they need to ...
1893,asoiaf,bo4ae61,(Spoilers Extended) It should have been Davos
1894,asoiaf,bld81n0,The Bronn scene in S08E04 is some of the worst...
1895,asoiaf,bld81n1,[Spoilers Main] We need to talk about that Bro...
1896,asoiaf,bmxp8p1,(Spoilers Main) Euron actor Pilou Asb√¶k wishes...
1897,asoiaf,bny8be0,That was it. That was the scouring of the shir...
1898,asoiaf,bny8be1,(Spoilers Main) We just witnessed GRRM's endin...
1899,asoiaf,bipfrd0,"I love this show, and taking the show for what..."
1900,asoiaf,bipfrd1,(Spoilers Extended) The show has finally becom...
1901,asoiaf,bozxfa0,"The remaining 1% is Olly. \n\nFor real though,..."


Display 10 posts from the Lewis class

In [21]:
corpus_raw[corpus_raw['class']=='Narnia'].head(10)

Unnamed: 0,class,id,text
3504,Narnia,hr9xrl1,My Narnia books and bookends
3505,Narnia,fvblba1,Real life version of the forest between worlds
3506,Narnia,gcxc5l1,"Oh Susan, such a meme"
3507,Narnia,eo7ok91,Rereading these for the first time since middl...
3508,Narnia,hiwym81,My sister channeled my love of Narnia into thi...
3509,Narnia,g9i4rc1,Diy Turkish Delights Box
3510,Narnia,ev31281,I drew Aslan üòä I'm not an artist by any means....
3511,Narnia,fldwhv1,When the Pevensies were isolated...
3512,Narnia,hvvai81,There are not enough memes on this subreddit. ...
3513,Narnia,emg8v51,Narnia!


---
# Cleaning

Create a function to remove special characters, punctuation, capitalization, extra whitespace, and hyper-links

In [29]:
special = list(string.punctuation)
special.remove('-')
special.append('‚Äô')
special.append('‚Äú')
special.append('‚Äù')

def cleaner(status):
    
    status = status.replace('-', ' ')
    status = status.replace('‚Äì', ' ')
    status = status.replace('‚Äî', ' ')
    status = status.replace('\t', ' ')
    status = status.replace('\n', ' ')

    for item in special:
        status = status.replace(item, '')

    status = [word.lower() for word in status.split() if 'http' not in word]    
    status = ' '.join(status)
    
    return status
    

Apply function to corpus

In [23]:
corpus_clean = pd.DataFrame([])
corpus_clean['text'] = corpus_raw['text'].apply(cleaner)
corpus_clean.head()

Unnamed: 0,text
0,arts arts 13960349
1,rip bilbo ian holm has passed away aged 88
2,let me start by saying that i enjoy many aspec...
3,i have recut pjs hobbit trilogy into a single ...
4,it was awfully nice of the 13 dwarves to empty...


Add column for word count and change classes to numeric values

In [24]:
corpus_clean['word_count'] = [len(row.split()) for row in corpus_clean['text']]
corpus_clean['class'] = corpus_raw['class'].map({'Narnia':2, 'asoiaf':1, 'tolkienfans':0})
corpus_clean.head()

Unnamed: 0,text,word_count,class
0,arts arts 13960349,3,0
1,rip bilbo ian holm has passed away aged 88,9,0
2,let me start by saying that i enjoy many aspec...,1276,0
3,i have recut pjs hobbit trilogy into a single ...,15,0
4,it was awfully nice of the 13 dwarves to empty...,24,0


Drop documents that are only 3 words or less

In [25]:
corpus_clean.drop(corpus_clean[corpus_clean['word_count']<=3].index, axis=0, inplace=True)
corpus_clean.drop('word_count', axis=1, inplace=True)
corpus_clean.shape

(11097, 2)

Again note the stratification of the classes

In [26]:
corpus_clean['class'].value_counts(normalize=True)

1    0.480490
0    0.444084
2    0.075426
Name: class, dtype: float64

Save the clean corpus as .csv

In [27]:
corpus_clean.to_csv('../data/corpus.csv', header=True, index=False)