# Data Preprocessing

### Import Packages

In [1]:
import re
import pandas as pd

### Import Datasets

In [2]:
comment_df = pd.read_csv('reddit_comment.csv', encoding='latin') 
post_df = pd.read_csv('reddit_post.csv', encoding='latin') 

In [3]:
comment_df.head()

Unnamed: 0,subreddit,author,created_date,score,text
0,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??can see the ceiling, due to mark..."
1,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...
2,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job..."
3,r/cscareerquestions,,11/30/2022 21:11,26,??I don??t have cobol mentioned on my resume...
4,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...


In [4]:
post_df.head()

Unnamed: 0,subreddit,author,created_date,score,text
0,r/usa,Barch3,2/15/2023 21:26,0,White House: The GOP??s Current Agenda Would ...
1,r/singapore,patricklhe,1/2/2023 8:26,33,High credit card debt in Singapore not a conce...
2,r/unitedkingdom,allenthalben2,1/2/2023 21:10,517,"UK faces worst and longest recession in G7, sa..."
3,r/unitedkingdom,DrCalFun,1/4/2023 13:27,456,The UK recession will be almost as deep as tha...
4,r/usa,Barch3,1/10/2023 1:38,1,Trump posts all-caps rant demanding 'tough' Re...


### Combine Datasets

In [5]:
comment_df.insert(0, 'category','comment')
post_df.insert(0, 'category','post')

df = pd.concat([comment_df, post_df], axis=0, join="outer", ignore_index=True)
df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??can see the ceiling, due to mark..."
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job..."
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,??I don??t have cobol mentioned on my resume...
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...


### Data Cleaning using Regex

In [6]:
# Replace '@' with 'at'
def replaceAt(text):
    return re.sub(r'@', 'at ', text)

# Replace '&' with 'and'
def replaceAnd(text):
    return re.sub(r'[&+]', 'and ', text)

# Remove hyperlinks
def removeHyperlink(text):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', text)

# Remove punctuations
def removePunctuations(text):
    return re.sub(r'[^\w\s]', '', text)

# Remove numbers
def removeNumbers(text):
    return re.sub(r'[0-9]', '', text)

# Replace new line
def replaceNewLine(text):
    return text.replace('\n', ' ').encode("utf-8").decode("utf-8")

# Remove additional space
def removeAdditionalSpace(text):
    return re.sub(r'\s+', ' ', text)

# Data cleaning
def dataCleaning(text):
    text = replaceAt(text)
    text = replaceAnd(text)
    text = removeHyperlink(text)
    text = removePunctuations(text)
    text = removeNumbers(text)
    text = replaceNewLine(text)
    text = removeAdditionalSpace(text)

    return text.lower().strip() # lowercase all words

In [7]:
# Further pre-processing
df['text_clean'] = df['text'].apply(lambda x: dataCleaning(x))
df.dropna(subset=['text_clean'], inplace=True)
df.drop(df[df.text_clean.map(len) < 1].index, inplace=True)
df.drop_duplicates(subset=['text_clean'], inplace=True)
df

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??can see the ceiling, due to mark...",workers can see the ceiling due to market matu...
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,??I don??t have cobol mentioned on my resume...,i dont have cobol mentioned on my resume okay ...
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...
...,...,...,...,...,...,...,...
21153,post,r/FinancialCareers,solo_dol0,4/3/2017 16:03,10,Laid off while interviewing Looking for some j...,laid off while interviewing looking for some j...
21154,post,r/FinancialCareers,sharky_chups,12/4/2015 15:24,10,How to get laid off? Anyone have any good advi...,how to get laid off anyone have any good advic...
21155,post,r/FinancialCareers,runitup30,10/8/2016 15:23,10,S&T: Laid off after 2 years Leaving this purpo...,sand t laid off after years leaving this purpo...
21156,post,r/FinancialCareers,stmajor339329,10/18/2016 20:09,12,Where do people that are laid off mid career g...,where do people that are laid off mid career g...


### Store Dataset

In [9]:
df.to_csv('reddit_combined_clean.csv', index=False)