In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df_1 = pd.read_csv('./datasets/asksingapore.csv')
df_1

Unnamed: 0,subreddit,selftext,title
0,askSingapore,[removed],Would you choose to be an average joe + happy ...
1,askSingapore,[removed],Anything I can do if I just found out the sala...
2,askSingapore,[removed],"I am turning 30, and I wanna do masters in Eur..."
3,askSingapore,,Can anyone tell me anything about the house th...
4,askSingapore,[removed],Cheap podiatrist in Singapore?
...,...,...,...
9971,askSingapore,"Hey all, \n\nI graduate from the University of...",Working abroad as a fresh graduate from the US?
9972,askSingapore,My Muslim friend would like to attend church s...,Are there agreements in place by the Muslim co...
9973,askSingapore,"Looking to download music using torrent, just ...",Does anyone here uses torrent? Is it illegal?
9974,askSingapore,"For example, if an uncle is leaning against a ...",Is it legal to insult someone in public if the...


In [3]:
df_2 = pd.read_csv('./datasets/singapore.csv',  encoding='ISO-8859-1')
df_2

Unnamed: 0,subreddit,selftext,title
0,singapore,,Singapore leaders congratulate Xi Jinping on h...
1,singapore,,Woman caught by traffic police driving erratic...
2,singapore,,Can anyone tell me anything about the house th...
3,singapore,,Incredible Miniature Kuala Lumpur's Landmarks ...
4,singapore,,Singapore's Sea cuts more jobs at e-commerce u...
...,...,...,...
9980,singapore,,ACRA Singapore - The Registrar of Companies (R...
9981,singapore,"Another quick question as well, as a fresh uni...",How real is the glass ceiling for non-scholars...
9982,singapore,http://www.gv.com.sg/GVMovieDetails#/movie/339...,Japan's #1 film of the year. Kimi no na wa (Yo...
9983,singapore,[deleted],Experience Mars Event by National Geographic a...


In [17]:
# Combine r\askSingapore and r\Singapore posts in 50:50 proportions

df = pd.concat([df_1[:9990], df_2[:9990]]).reset_index(drop=True)
df

Unnamed: 0,subreddit,selftext,title
0,askSingapore,[removed],Would you choose to be an average joe + happy ...
1,askSingapore,[removed],Anything I can do if I just found out the sala...
2,askSingapore,[removed],"I am turning 30, and I wanna do masters in Eur..."
3,askSingapore,,Can anyone tell me anything about the house th...
4,askSingapore,[removed],Cheap podiatrist in Singapore?
...,...,...,...
19956,singapore,,ACRA Singapore - The Registrar of Companies (R...
19957,singapore,"Another quick question as well, as a fresh uni...",How real is the glass ceiling for non-scholars...
19958,singapore,http://www.gv.com.sg/GVMovieDetails#/movie/339...,Japan's #1 film of the year. Kimi no na wa (Yo...
19959,singapore,[deleted],Experience Mars Event by National Geographic a...


In [6]:
# Replace '[removed]' with whitespaces
df['selftext'].replace({'[removed]': ''}, inplace=True)
df['selftext']

# Replace np.nan with str(nan) before concatenating title+selftext  (nan to be removed later)
df.fillna('nan', inplace=True)

# Concat selftext + title into new column
df['all_text'] = df['title']+' '+df['selftext']   # str(selftext) as 1398 rows are NaN which will result in the corresponding loss of the 'title' data
df['all_text'][0]

'Would you choose to be an average joe + happy life with gf/wife OR single corporate high flyer who does not have personal time to date? '

In [7]:
# Check for null values

df.isnull().sum()

subreddit    0
selftext     0
title        0
all_text     0
dtype: int64

In [8]:
# Create function to automate word-preprocessing

def clean_text(text):
    # Remove HTML elements e.g. tags
    review_text = BeautifulSoup(text).get_text()    
    
    # Remove non-alpha characters and nan
    letters_only = re.sub('[^a-zA-Z]|nan', " ", review_text)
    
    
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    
    # set(stopwords) to make faster search
    stops = set(stopwords.words('english'))
    
    # Remove stopwords
    meaningful_words = [w for w in words if not w in stops]

    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in meaningful_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in stemmed_words]   # doesnt add value in this case as words are more colloquial
    
    # Join back the words into one string separated by space
    return(" ".join(lemmatized_words))     # str_separator.join(iterable_string)
    
    

In [9]:
# Create new col to store cleaned text for NLP stage

df['clean_text'] = df['all_text'].map(lambda x: clean_text(x))
df.head(10)



Unnamed: 0,subreddit,selftext,title,all_text,clean_text
0,askSingapore,,Would you choose to be an average joe + happy ...,Would you choose to be an average joe + happy ...,would choos averag joe happi life gf wife sing...
1,askSingapore,,Anything I can do if I just found out the sala...,Anything I can do if I just found out the sala...,anyth found salari rang structur offer employ ...
2,askSingapore,,"I am turning 30, and I wanna do masters in Eur...","I am turning 30, and I wanna do masters in Eur...",turn wanna master europ commit career suicid
3,askSingapore,,Can anyone tell me anything about the house th...,Can anyone tell me anything about the house th...,anyon tell anyth hous photo taken singapor hou...
4,askSingapore,,Cheap podiatrist in Singapore?,Cheap podiatrist in Singapore?,cheap podiatrist singapor
5,askSingapore,Anyone participated in the Playstation Good Wi...,Playstation Good Wish Hunting Discount Code?,Playstation Good Wish Hunting Discount Code? A...,playstat good wish hunt discount code anyon pa...
6,askSingapore,Do companies resume scanning leave out applica...,"Tech job, HR","Tech job, HR Do companies resume scanning leav...",tech job hr compani resum scan leav applic wit...
7,askSingapore,,Anything I can do if I just found out the sala...,Anything I can do if I just found out the sala...,anyth found salari rang structur offer employ ...
8,askSingapore,tdlr: careful of creepy uncle(says he is uncle...,WARNING!! ppl living in Jurong east (7 day Adv...,WARNING!! ppl living in Jurong east (7 day Adv...,warn ppl live jurong east day adventist area t...
9,askSingapore,Heard apparently you can take dry foods (e.g. ...,is food allowed after security when boarding a...,is food allowed after security when boarding a...,food allow secur board flight sg heard appar t...


In [11]:
# Convert subreddit into binary variables where r\askSingapore=1, r\Singapore=0

df['subreddit_label'] = df['subreddit'].map(lambda x: 1 if x=='askSingapore' else 0 if x=='singapore' else 'nan')
df[df['subreddit_label']=='nan']  # these are irrelevant aircon ads

Unnamed: 0,subreddit,selftext,title,all_text,clean_text,subreddit_label
10058,u_coolcare-singapore,[Â **#Airconservicesingapore** ](https://cool...,Aircon servicing,Aircon servicing [Â **#Airconservicesingapore...,aircon servic airconservicesingapor http coolc...,
10145,u_yarana-singapore,&amp;#x200B;\n\n[What is the history of Indian...,What is the history of Indian food in Singapore?,What is the history of Indian food in Singapor...,histori indian food singapor x b histori india...,
10253,u_coolcare-singapore,Cool Care Aircon provides professional [aircon...,Aircon Installation,Aircon Installation Cool Care Aircon provides ...,aircon instal cool care aircon provid professi...,
10419,u_coolcare-singapore,[#airconinstallation](https://coolcare.com.sg/...,Aircon Installation Singapore,Aircon Installation Singapore [#airconinstalla...,aircon instal singapor airconinstal http coolc...,
10421,u_coolcare-singapore,[#airconservice:](https://coolcare.com.sg/) CO...,Aircon service Singapore,Aircon service Singapore [#airconservice:](htt...,aircon servic singapor airconservic http coolc...,
...,...,...,...,...,...,...
18269,u_coolcare-singapore,[#airconinstallation](https://coolcare.com.sg/...,AIRCON INSTALLATION,AIRCON INSTALLATION [#airconinstallation](http...,aircon instal airconinstal http coolcar com sg...,
18291,u_coolcare-singapore,Get [**Daikin aircon**](https://coolcare.com....,Daikin aircon,Daikin aircon Get [**Daikin aircon**](https:/...,daikin aircon get daikin aircon http coolcar c...,
18292,u_coolcare-singapore,Get [**Mitsubishi aircon**](https://coolcare....,Mitsubishi aircon,Mitsubishi aircon Get [**Mitsubishi aircon**]...,mitsubishi aircon get mitsubishi aircon http c...,
18335,u_PalFish-Singapore,\n\nNhÃ¢n dá»p ká»· niá»m sinh nháº­t 8 nÄ...,PHÃNG Sá»° Äá»C QUYá»N ðððð ð...,PHÃNG Sá»° Äá»C QUYá»N ðððð ð...,ph ng c quy n n n ng ti ng anh tr em tr c tuy ...,


In [14]:
# Drop 157 rows where subreddit_label='nan' which are irrelevant aircon ads

df.drop(df[df['subreddit_label']=='nan'].index, inplace=True)
df.shape

(19804, 6)

In [16]:
df.to_csv('./datasets/cleaned_reddit_posts.csv', index=False)