# Import Libraies

In [487]:
import pandas as pd
import re
import numpy as np
from nltk import sent_tokenize

# Import Data

In [488]:
df = pd.read_csv("../data/subreddit_posts.csv", low_memory=False)

In [489]:
df_quotes = df[df['subreddit'] == 'quotes']
df_oneliners = df[df['subreddit'] == 'oneliners']

In [490]:
df_quotes.tail(3)

Unnamed: 0,author_fullname,created_utc,full_link,id,is_original_content,num_comments,pinned,score,selftext,subreddit,title,upvote_ratio,removed_by_category
138225,t2_21w0u,1205266764,https://www.reddit.com/r/quotes/comments/6bpbp...,6bpbp,,0,,2,,quotes,"1978 - Ginger Rogers -""You've got to stand for...",,
138226,t2_21w0u,1205266687,https://www.reddit.com/r/quotes/comments/6bpbf...,6bpbf,,1,,0,,quotes,"Nov. 9, 1978 - Alex Hamilton -""Those who stand...",,
138227,t2_21w0u,1205266552,https://www.reddit.com/r/quotes/comments/6bpb8...,6bpb8,,1,,1,,quotes,"Rev. Peter Marshall ""Give to us clear vision t...",,


In [491]:
df_oneliners.tail(3)

Unnamed: 0,author_fullname,created_utc,full_link,id,is_original_content,num_comments,pinned,score,selftext,subreddit,title,upvote_ratio,removed_by_category
164511,t2_361vl,1246083231,https://www.reddit.com/r/oneliners/comments/8w...,8w4do,,0,,1,,oneliners,"We never really grow up, we only learn how to ...",,
164512,t2_361vl,1246083177,https://www.reddit.com/r/oneliners/comments/8w...,8w4dg,,0,,3,,oneliners,Sex is not the answer. Sex is the question. “Y...,,
164513,t2_361vl,1246082941,https://www.reddit.com/r/oneliners/comments/8w...,8w4cr,,0,,3,,oneliners,"If sex is a pain in the ass, then you’re doing...",,


In [492]:
print((df_quotes.shape))
print(df_oneliners.shape)

(138228, 13)
(26286, 13)


# Preliminary Data Cleaning

## Clean Duplicates

Some posts could be reposts with same contents, we will remove such duplicates.

In [493]:
# First check if our data contains any duplicated ids

# For r/quotes
print(len(df_quotes['id'].unique()))
# For r/oneliners
print(len(df_oneliners['id'].unique()))

138228
26286


We see that all rows of our datasets contain unique posts. Next we will check to see if the title has any duplicates.

In [494]:
print(len(df_quotes['title'].unique()))
print(len(df_oneliners['title'].unique()))

130168
25528


In [495]:
print("No. of duplicate title for r/quotes: ", df_quotes.shape[0] - len(df_quotes['title'].unique()))
print("No. of duplicate title for r/oneliners: ",df_oneliners.shape[0]  - len(df_oneliners['title'].unique()))

No. of duplicate title for r/quotes:  8060
No. of duplicate title for r/oneliners:  758


We will keep the earliest occurence of the posts while dropping other duplicates.

In [496]:
# drop dups
df_quotes = df_quotes.drop_duplicates(subset = "title", keep = "last")
df_oneliners = df_oneliners.drop_duplicates(subset = "title", keep = "last")

In [497]:
print(df_quotes.shape)
print(df_oneliners.shape)

(130168, 13)
(25528, 13)


We have now dropped all title duplicates.

## Missing Titles

Since for each subreddit, the quotes and oneliners will typically appear in the title, we are mainly concerned with missing titles. 

In [498]:
df_quotes[df_quotes['title'].isnull()]

Unnamed: 0,author_fullname,created_utc,full_link,id,is_original_content,num_comments,pinned,score,selftext,subreddit,title,upvote_ratio,removed_by_category
91024,,1494895719,https://www.reddit.com/r/quotes/comments/6bec4...,6bec4s,,1,,0,[deleted],quotes,,,


In [499]:
df_oneliners[df_oneliners['title'].isnull()]

Unnamed: 0,author_fullname,created_utc,full_link,id,is_original_content,num_comments,pinned,score,selftext,subreddit,title,upvote_ratio,removed_by_category


We have 1 case of missing title for df_quotes, we will remove this title.

In [500]:
df_quotes = df_quotes[~df_quotes['title'].isnull()]

In [501]:
df_quotes.shape

(130167, 13)

We have now dropped the row with missing title.

## Remove titles which are too short.

Quotes or oneliners which are too short are unlikely to be legitimate quotes/oneliners. For the project, we will remove rows with titles less than 2 words.

In [502]:
def sentence_length(row):
    # Using regex to find all words, including those with quotes such as what's
    split_str = re.findall(r"[\w+|\w+\'\w+]+", row['title'])
    row['title_length'] = len(split_str)
    return row
    
df_quotes = df_quotes.apply(sentence_length, axis = 1)

In [503]:
df_oneliners = df_oneliners.apply(sentence_length, axis = 1)

In [504]:
print(df_quotes[df_quotes['title_length']<=2][["title", "title_length"]].shape)
df_quotes[df_quotes['title_length']<=2][["title", "title_length"]].head()

(2330, 2)


Unnamed: 0,title,title_length
19,Risking Everything,2
93,Unknown quoter,2
183,"brash, adj.",2
204,by : me ...,2
234,Forgive yourself,2


In [505]:
print(df_oneliners[df_oneliners['title_length']<=2][["title", "title_length"]].shape)
df_oneliners[df_oneliners['title_length']<=2][["title", "title_length"]].head()

(323, 2)


Unnamed: 0,title,title_length
138299,Breast feet,2
138671,Hambur ger,2
138678,My dad,2
138819,⎯⎯⎯,0
139342,Structural integrity,2


We see that these titles may contain some quotes/oneliners, but they may not really be significant and meaningful quotes or oneliners. Many of them also do not contain quotes or are just symbols. Furthermore, they do not make up a significant percentage of our dataset.

As such, we will not include them in our analysis.

In [506]:
df_quotes = df_quotes[df_quotes['title_length']>2]
df_oneliners = df_oneliners[df_oneliners['title_length']>2]

In [507]:
print(df_quotes.shape)
print(df_oneliners.shape)

(127837, 14)
(25205, 14)


## Remove titles with more than 1 sentence

Since r/oneliners contains only titles with 1 sentence, to the comparison fair, we will only retain those quotes from r/quotes with 1 sentence. 

In [469]:
# First remove columns which are irrelevant
df_quotes = df_quotes[["title", 'title_length', 'subreddit']].reset_index(drop = True)
df_oneliners = df_oneliners[["title", 'title_length', 'subreddit']].reset_index(drop = True)

In [470]:
def num_sentence(row):
    sentence_list = sent_tokenize(row['title'])
    if len(sentence_list) == 1:
        return row
    else:
        pass
df_quotes = df_quotes.apply(num_sentence, axis = 1)
df_oneliners = df_oneliners.apply(num_sentence, axis = 1)

In [471]:
df_quotes = df_quotes[~df_quotes['title'].isnull()].reset_index(drop = True)
df_oneliners = df_oneliners[~df_oneliners['title'].isnull()].reset_index(drop = True)

In [472]:
print(df_quotes.shape)
print(df_oneliners.shape)

(54188, 3)
(22479, 3)


## Extract content of quote

In [473]:
# Example quote
df_quotes['title'][10]

'“The worst thing someone can do to each other is pretend that they care.” ― Sarvesh Jain'

The titles from r/quotes may also contain author names, which we do not want. Sometimes, the title may not be quotes as well. Hence, for this step, we will obtain just the contents between two quotation marks, which will be our quotes.

In [485]:
# Function to get string in between 2 quotation marks
def get_just_quotes(row):
    match = re.search(r'“(.*?)”|"(.*)"',row['title'])
    if match:
        row['title'] = match.group().strip('""').strip('“”')
    else:
        row['title'] = np.nan
        pass
    return row
df_quotes = df_quotes.apply(get_just_quotes, axis = 1)

In [475]:
df_quotes = df_quotes[~df_quotes['title'].isnull()].reset_index(drop = True)

In [476]:
print(df_quotes.shape)
print(df_oneliners.shape)

(29726, 3)
(22479, 3)


In [477]:
df_quotes.head()

Unnamed: 0,title,title_length,subreddit
0,"We all think we're right, and then we change o...",17.0,quotes
1,Life feels good cause I just finger the bad bitch,10.0,quotes
2,"I am a God, So hurry up with my damn massage, ...",24.0,quotes
3,"Stories may well be lies, but they are good li...",16.0,quotes
4,"Don't pursue happiness, rather design a lifest...",20.0,quotes


In [478]:
df_oneliners.head()

Unnamed: 0,title,title_length,subreddit
0,“d” is the letter “a” with a boner,8.0,oneliners
1,I got a Masterclass subscription for my birthd...,14.0,oneliners
2,That avocado crisis was between the Hass and H...,10.0,oneliners
3,"My wife wanted two kittens, but I am the man i...",19.0,oneliners
4,Jokes on death are dead nowadays...,6.0,oneliners


# Export CSV

We now have two dataframes containing 29726 posts from r/quotes and 22479 from r/oneliners. Now let's combine the dataframes and export as csv in order to proceed with the next steps.

In [479]:
# combine both df by concatenating
df_combined = pd.concat([df_quotes, df_oneliners]).reset_index(drop = True)
df_combined.shape

(52205, 3)

In [483]:
df_combined.isnull().sum()

title           0
title_length    0
subreddit       0
dtype: int64

In [480]:
df_combined.tail()

Unnamed: 0,title,title_length,subreddit
52200,How about you?,3.0,oneliners
52201,Abstraction is drawing strength from a pool of...,18.0,oneliners
52202,Wanna hear a good joke?,5.0,oneliners
52203,"We never really grow up, we only learn how to ...",13.0,oneliners
52204,"If sex is a pain in the ass, then you’re doing...",14.0,oneliners


In [484]:
# Export as csv
#df_combined.to_csv("../data/subreddit_posts_cleaned.csv", index = False)