# EDA and cleaning

In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# function to strip html off of body text
def clean_data(review):
    no_html = BeautifulSoup(review).text
    lower_case = no_html.lower()
    
    return lower_case

---
## Posts

In [3]:
# read in data
posts = pd.read_csv('../data/posts.csv', index_col=0)
posts.head()

Unnamed: 0,title,selftext,subreddit
0,Follow me,,scifi
1,"Scientific research should be ethical, not evil..",,scifi
2,Book recommendations for series similar to Leo...,"Basically, a series where an individual or gr...",scifi
3,Recommendations for fantasy that turns out to ...,,scifi
4,Scifi Horror recommendation,[removed],scifi


In [4]:
posts.shape

(10000, 3)

In [5]:
# check for null values
posts.isna().sum()

title           0
selftext     3373
subreddit       0
dtype: int64

In [6]:
# check for revoved posts
len(posts[posts['selftext'] == '[removed]'])

1793

In [7]:
len(posts[posts['selftext'] == '[deleted]'])

223

In [8]:
# drop NaN and [removed] from selftext as these won't be helpful
posts_drop = posts.dropna()
posts_drop = posts_drop[posts_drop['selftext'] != '[removed]']
posts_drop = posts_drop[posts_drop['selftext'] != '[deleted]']

In [9]:
# reset index
posts_drop.reset_index(inplace=True)

In [10]:
posts_drop.drop(columns=['index'], inplace=True)
posts_drop.head(3)

Unnamed: 0,title,selftext,subreddit
0,Book recommendations for series similar to Leo...,"Basically, a series where an individual or gr...",scifi
1,"I just finished *Other Space* (2015), the flag...",I am not a shill for Yahoo! I swear!\n\nThe se...,scifi
2,Can I get some help once again with a book tit...,Okay this is getting rediculous but this is bu...,scifi


In [11]:
# Check how many are left
posts_drop['subreddit'].value_counts()

Fantasy    3820
scifi       791
Name: subreddit, dtype: int64

In [12]:
# check for html within the text
posts_drop['selftext'][1]

'I am not a shill for Yahoo! I swear!\n\nThe series is short (total binge time about as long as Peter Jackson\'s extended *The Return of the King*), and ends on sort of a cliffhanger which means almost anything I say will be a spoiler, so please just watch and enjoy. I promise you will laugh. The best description I can give without spoiling anything is this:\n\nImagine the basic premise of *Space:1999* or *Star Trek: Voyager*, but with the Crew of cadets from the *Star Trek: Deep Space Nine* episode ["Valiant"](https://www.imdb.com/title/tt0708657/), all wrapped up in the gallows humour, social commentary, and prop comedy elements of *Red Dwarf*. Joel Hodgson is a mentoring presence among a cast of young actors who are clearly taking his advice about comedic delivery and timing to heart, as their jokes hit the mark more often than not. The series manages to give its own lightheartedly cynical take on a big batch of science fiction\'s staple tropes, everything from cloned organ farming 

In [13]:
# strip html
posts_drop['clean_text'] = posts_drop['selftext'].apply(clean_data)



In [14]:
# clean text some more
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html

# remove unnecessary characters
posts_drop.replace({'\n': ''}, regex=True, inplace=True)
posts_drop.replace({'\[': ''}, regex=True, inplace=True)
posts_drop.replace({'\]': ''}, regex=True, inplace=True)
posts_drop.replace(r'\\', '', regex=True, inplace=True)
posts_drop.replace(r'>', '', regex=True, inplace=True) #remove converted &gt; to > by BeautifulSoup

# remove urls from text
# https://stackoverflow.com/questions/56358888/how-to-remove-https-links-from-a-string-column-in-pandas
posts_drop['clean_text'] = posts_drop['clean_text'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "")

In [15]:
# check for html and lowercase
posts_drop['clean_text'][1]

'i am not a shill for yahoo! i swear!the series is short (total binge time about as long as peter jackson\'s extended *the return of the king*), and ends on sort of a cliffhanger which means almost anything i say will be a spoiler, so please just watch and enjoy. i promise you will laugh. the best description i can give without spoiling anything is this:imagine the basic premise of *space:1999* or *star trek: voyager*, but with the crew of cadets from the *star trek: deep space nine* episode "valiant"( all wrapped up in the gallows humour, social commentary, and prop comedy elements of *red dwarf*. joel hodgson is a mentoring presence among a cast of young actors who are clearly taking his advice about comedic delivery and timing to heart, as their jokes hit the mark more often than not. the series manages to give its own lightheartedly cynical take on a big batch of science fiction\'s staple tropes, everything from cloned organ farming to time dilation fields to first contact procedur

In [16]:
posts_drop.head(3)

Unnamed: 0,title,selftext,subreddit,clean_text
0,Book recommendations for series similar to Leo...,"Basically, a series where an individual or gr...",scifi,"basically, a series where an individual or gro..."
1,"I just finished *Other Space* (2015), the flag...",I am not a shill for Yahoo! I swear!The series...,scifi,i am not a shill for yahoo! i swear!the series...
2,Can I get some help once again with a book tit...,Okay this is getting rediculous but this is bu...,scifi,okay this is getting rediculous but this is bu...


In [17]:
# distinct values for clean text
posts_drop['clean_text'].nunique()

4410

In [18]:
# drop duplicates
posts_drop.drop_duplicates(inplace=True)

In [19]:
posts_drop['subreddit'].value_counts(normalize=True)

Fantasy    0.828559
scifi      0.171441
Name: subreddit, dtype: float64

Dataframe based on selftext ended up very unbalanced, so I will not be using posts selftext for this project

In [20]:
# save clean posts
posts_drop.to_csv('../data/posts_clean_text.csv')

In [21]:
# distinct values for titles
posts['title'].nunique()

9596

In [22]:
# drop duplicates from title
posts_title = posts.drop_duplicates()
posts_title.shape

(9727, 3)

In [25]:
posts_title['subreddit'].value_counts()

Fantasy    4981
scifi      4746
Name: subreddit, dtype: int64

In [24]:
# save clean titles
posts_title.to_csv('../data/posts_clean_title.csv')

Dataframe based on titles is more balanced than based on selftext, but let's see what comments can give us.

---
## Comments

In [26]:
# read in data
comments = pd.read_csv('../data/comments.csv', index_col=0)
comments.head()

Unnamed: 0,body,subreddit
0,use subtitles,scifi
1,"For neither ever, nor never",scifi
2,It reminds me of Fringe too,scifi
3,"Saw S01 in English, had to resort to subtitles...",scifi
4,"When season 2 was released, all the recaps I h...",scifi


In [27]:
comments.shape

(10000, 2)

In [28]:
# check for null values
comments.isna().sum()

body         0
subreddit    0
dtype: int64

In [29]:
# check for removed posts
len(comments[comments['body'] == '[removed]'])

139

In [30]:
len(comments[comments['body'] == '[deleted]'])

65

In [31]:
# since there are no NaNs, drop [removed] from body as these won't be helpful
comments_drop = comments[comments['body'] != '[removed]']
comments_drop = comments[comments['body'] != '[deleted]']
comments_drop

Unnamed: 0,body,subreddit
0,use subtitles,scifi
1,"For neither ever, nor never",scifi
2,It reminds me of Fringe too,scifi
3,"Saw S01 in English, had to resort to subtitles...",scifi
4,"When season 2 was released, all the recaps I h...",scifi
...,...,...
9995,Didn't Orson Scott Card donate his money to an...,Fantasy
9996,:( I liked the first. The second was weaker b...,Fantasy
9997,Lmao he is a bit of a chad honestly\n\nYA: You...,Fantasy
9998,"It's not at all terrible, it's really goddamn ...",Fantasy


In [32]:
# reset index and drop old index column
comments_drop.reset_index(inplace=True)
comments_drop.tail()

Unnamed: 0,index,body,subreddit
9930,9995,Didn't Orson Scott Card donate his money to an...,Fantasy
9931,9996,:( I liked the first. The second was weaker b...,Fantasy
9932,9997,Lmao he is a bit of a chad honestly\n\nYA: You...,Fantasy
9933,9998,"It's not at all terrible, it's really goddamn ...",Fantasy
9934,9999,I have not read any of the others on your list...,Fantasy


---

In [33]:
# drop old index
comments_drop.drop(columns=['index'], inplace=True)
comments_drop

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,body,subreddit
0,use subtitles,scifi
1,"For neither ever, nor never",scifi
2,It reminds me of Fringe too,scifi
3,"Saw S01 in English, had to resort to subtitles...",scifi
4,"When season 2 was released, all the recaps I h...",scifi
...,...,...
9930,Didn't Orson Scott Card donate his money to an...,Fantasy
9931,:( I liked the first. The second was weaker b...,Fantasy
9932,Lmao he is a bit of a chad honestly\n\nYA: You...,Fantasy
9933,"It's not at all terrible, it's really goddamn ...",Fantasy


In [34]:
# Check how many are left
comments_drop['subreddit'].value_counts()

scifi      4972
Fantasy    4963
Name: subreddit, dtype: int64

In [35]:
comments_drop['subreddit'].value_counts(normalize=True)

scifi      0.500453
Fantasy    0.499547
Name: subreddit, dtype: float64

Dataset based on comments is balanced, so lets do further cleaning. I will be using comments for this project.

In [36]:
# check for html within the text
comments_drop['body'][20]

'**Dark Matter**.'

In [37]:
# apply finction to strip html
comments_drop['clean_body'] = comments_drop['body'].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_drop['clean_body'] = comments_drop['body'].apply(clean_data)


In [38]:
# clean text some more
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html

# remove unnecessary characters
comments_drop.replace({'\n': ''}, regex=True, inplace=True)
comments_drop.replace({'\[': ''}, regex=True, inplace=True)
comments_drop.replace({'\]': ''}, regex=True, inplace=True)
comments_drop.replace({'@': ''}, regex=True, inplace=True)
comments_drop.replace({'\*': ''}, regex=True, inplace=True)
comments_drop.replace(r'\\', '', regex=True, inplace=True)
comments_drop.replace(r'>', '', regex=True, inplace=True) #remove converted &gt; to > by BeautifulSoup

# remove urls from text
# https://stackoverflow.com/questions/56358888/how-to-remove-https-links-from-a-string-column-in-pandas
comments_drop['clean_body'] = comments_drop['clean_body'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_drop['clean_body'] = comments_drop['clean_body'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "")


In [39]:
# check for html, lowercase, and other unnecessary characters
comments_drop['clean_body'][20]

'dark matter.'

In [42]:
comments_drop['clean_body'][4401]

"do androids dream of electric sheep? - philip k. dickit's a detective set in the future and the movie bladerunner  was based on it."

In [43]:
# distinct values for clean body text
comments_drop['clean_body'].nunique()

9612

In [44]:
# drop duplicates from body text
comments_drop.drop_duplicates(inplace=True)
comments_drop.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_drop.drop_duplicates(inplace=True)


(9652, 3)

In [45]:
# create target column iwth 1 being scifi and 0 - fantasy
comments_drop['target'] = comments['subreddit'].map({'scifi': 1, 'Fantasy': 0})
comments_drop.drop(columns=['subreddit', 'body'], inplace=True)
comments_drop.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_drop['target'] = comments['subreddit'].map({'scifi': 1, 'Fantasy': 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,clean_body,target
0,use subtitles,1
1,"for neither ever, nor never",1
2,it reminds me of fringe too,1
3,"saw s01 in english, had to resort to subtitles...",1
4,"when season 2 was released, all the recaps i h...",1


In [46]:
comments_drop['target'].value_counts(normalize=True)

0    0.500622
1    0.499378
Name: target, dtype: float64

In [47]:
# save clean titles
comments_drop.to_csv('../data/comments_clean.csv')

The posts data is very unbalanced, with fantasy having 7477 entries and scifi having 2596 entried, I wont be using selftext of a post.

#### Lemmatize comments

In [48]:
nlp = spacy.load("en_core_web_sm")

def lemmatize(sentence):
    doc = nlp(sentence)
    return ' '.join([token.lemma_ for token in doc])

In [49]:
comments_drop['lemma'] = comments_drop['clean_body'].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments_drop['lemma'] = comments_drop['clean_body'].apply(lemmatize)


In [50]:
comments_drop.head(2)

Unnamed: 0,clean_body,target,lemma
0,use subtitles,1,use subtitle
1,"for neither ever, nor never",1,"for neither ever , nor never"


In [51]:
comments_drop.to_csv('../data/comments_lemma.csv')

---
### Most common words in comments

In [60]:
scifi_comments = pd.read_csv('../data/scifi_comments_10000.csv', index_col=0)
fantasy_comments = pd.read_csv('../data/fantasy_comments_10000.csv', index_col=0)

In [91]:
# repeat same process to clean text for common words in scifi
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html
scifi = scifi_comments[scifi_comments['body'] != '[removed]']
scifi = scifi[scifi['body'] != '[deleted]']

scifi.reset_index(inplace=True)
scifi.drop(columns=['index'], inplace=True)
scifi['clean_body'] = scifi['body'].apply(clean_data)
scifi.replace({'\n': ''}, regex=True, inplace=True)
scifi.replace({'\[': ''}, regex=True, inplace=True)
scifi.replace({'\]': ''}, regex=True, inplace=True)
scifi.replace({'@': ''}, regex=True, inplace=True)
scifi.replace({'\*': ''}, regex=True, inplace=True)
scifi.replace(r'\\', '', regex=True, inplace=True)
scifi.replace(r'>', '', regex=True, inplace=True) #remove converted &gt; to > by BeautifulSoup

# remove URLs
# https://stackoverflow.com/questions/56358888/how-to-remove-https-links-from-a-string-column-in-pandas
scifi['clean_body'] = scifi['clean_body'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "")



In [90]:
# repeat same process to clean text for common words in fantasy
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html
fantasy = fantasy_comments[fantasy_comments['body'] != '[removed]']
fantasy = fantasy[scifi['body'] != '[deleted]']

fantasy.reset_index(inplace=True)
fantasy.drop(columns=['index'], inplace=True)
fantasy['clean_body'] = fantasy['body'].apply(clean_data)
fantasy.replace({'\n': ''}, regex=True, inplace=True)
fantasy.replace({'\[': ''}, regex=True, inplace=True)
fantasy.replace({'\]': ''}, regex=True, inplace=True)
fantasy.replace({'@': ''}, regex=True, inplace=True)
fantasy.replace({'\*': ''}, regex=True, inplace=True)
fantasy.replace(r'\\', '', regex=True, inplace=True)
fantasy.replace(r'>', '', regex=True, inplace=True) #remove converted &gt; to > by BeautifulSoup

# remove URLs
# https://stackoverflow.com/questions/56358888/how-to-remove-https-links-from-a-string-column-in-pandas
fantasy['clean_body'] = fantasy['clean_body'].str.replace(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "")

  fantasy = fantasy[scifi['body'] != '[deleted]']

https://www.brandonsanderson.com/warbreaker-rights-and-downloads/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [63]:
scifi.head(2)

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,send_replies,stickied,subreddit,subreddit_id,total_awards_received,treatment_tags,edited,author_cakeday,distinguished,clean_body
0,,,CheatMaple,,,,,,,text,...,True,False,scifi,t5_2qh2z,0,,,,,use subtitles
1,,,CheatMaple,,,,,,,text,...,True,False,scifi,t5_2qh2z,0,,,,,"for neither ever, nor never"


In [88]:
# identify most common words in scifi
X_sci = scifi['clean_body']

cvect_sci = CountVectorizer(stop_words='english')
dtm_sci = cvect_sci.fit_transform(X_sci)

pd.DataFrame(
    dtm_sci.toarray(), 
    columns=cvect_sci.get_feature_names()
).sum().sort_values(ascending=False)[:20]

like      1737
just      1589
good      1071
really     972
series     967
think      913
time       897
don        852
star       822
read       802
book       787
sci        671
people     666
fi         660
movie      660
books      617
trek       609
space      585
story      583
great      570
dtype: int64

In [89]:
# identify most common words in fantasy
X_fan = fantasy['clean_body']

cvect_fan = CountVectorizer(stop_words='english')
dtm_fan = cvect_fan.fit_transform(X_fan)

pd.DataFrame(
    dtm_fan.toarray(), 
    columns=cvect_fan.get_feature_names()
).sum().sort_values(ascending=False)[:22]

book          2330
like          2163
read          2008
books         1997
series        1993
just          1853
fantasy       1615
really        1460
think         1392
good          1150
people        1050
don           1008
time           978
ve             947
characters     837
story          798
world          777
lot            739
reading        688
author         679
way            655
great          653
dtype: int64