# Data Preprocessing

### Import Packages

In [1]:
import re
import pandas as pd

### Import Datasets

In [2]:
# Load the manually labelled Reddit rows into `df`, the frame every later cell works on.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, the name pandas
# gives a header-less column - presumably an index saved with the CSV; confirm
# whether it should be kept.
df = pd.read_csv(filepath_or_buffer='reddit_manual_label.csv')
df

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0),manual_label
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??€?can see the ceiling, due to mar...",0.0,1.0,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of d...,,0.0,NEUTRAL
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 jobs...",,0.0,NEUTRAL
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,”I don’t have cobol mentioned on my resume” ok...,,0.0,NEUTRAL
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,0.0,1.0,NEGATIVE
...,...,...,...,...,...,...,...,...,...
995,post,r/FinancialCareers,Unlikely-Strategy596,1/30/2023 0:09,197,Just got laid off. 1st year IB analyst.I was j...,0.0,1.0,NEGATIVE
996,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB AnalystJ...,0.0,1.0,NEGATIVE
997,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,0.0,1.0,NEGATIVE
998,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,0.0,1.0,NEGATIVE


### Data Cleaning using Regex

In [3]:
# Replace '@' with 'at'
def replaceAt(text):
    """Spell out every '@' in ``text`` as the word "at".

    The word is padded with a space on both sides so it cannot fuse with its
    neighbours (``me@example.com`` -> ``me at example.com``, not
    ``meat example.com``). This can leave doubled or leading spaces in the
    returned string.

    Args:
        text (str): Text to rewrite.

    Returns:
        str: ``text`` with each '@' replaced by ' at '.
    """
    return re.sub(r'@', ' at ', text)

# Replace '&' and '+' with 'and'
def replaceAnd(text):
    """Spell out every '&' and '+' in ``text`` as the word "and".

    The word is padded with a space on both sides so it cannot fuse with its
    neighbours (``R&D`` -> ``R and D``, not ``Rand D``). This can leave
    doubled or leading spaces in the returned string.

    Args:
        text (str): Text to rewrite.

    Returns:
        str: ``text`` with each '&' or '+' replaced by ' and '.
    """
    return re.sub(r'[&+]', ' and ', text)

# Remove hyperlinks
def removeHyperlink(text):
    """Delete every http:// or https:// URL from ``text``.

    A URL is taken to run from its scheme up to the next whitespace
    character, so words that follow a link on the same line are kept.

    Args:
        text (str): Text that may contain links.

    Returns:
        str: ``text`` with each URL removed; surrounding whitespace is left
        untouched.
    """
    # \S+ stops at the first whitespace. A greedy '.*' would also swallow all
    # the text after the link up to the end of the line.
    return re.sub(r'https?://\S+', '', text)

# Remove punctuations
def removePunctuations(text):
    """Delete every punctuation character from ``text``.

    Anything that is neither a word character nor whitespace is removed.
    The underscore is removed explicitly because the regex class ``\\w``
    counts it as a word character, so it would otherwise survive.

    Args:
        text (str): Text to strip.

    Returns:
        str: ``text`` with only letters, digits and whitespace left.
    """
    return re.sub(r'[^\w\s]|_', '', text)

# Remove numbers
def removeNumbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted.

    Only the digits themselves are dropped; adjacent letters stay, so
    ``'1st'`` becomes ``'st'``.
    """
    ascii_digit = re.compile(r'[0-9]')
    return ascii_digit.sub('', text)

# Replace new line
def replaceNewLine(text):
    """Turn every line break in ``text`` into a single space.

    Unix ('\\n'), Windows ('\\r\\n') and bare carriage-return ('\\r') line
    endings are all handled; a '\\r\\n' pair yields one space, not two.

    Args:
        text (str): Text that may span several lines.

    Returns:
        str: ``text`` on a single line.
    """
    # Normalise CRLF and CR to LF first so each line break maps to one space.
    unified = text.replace('\r\n', '\n').replace('\r', '\n')
    return unified.replace('\n', ' ')

# Remove additional space
def removeAdditionalSpace(text):
    """Collapse each run of whitespace in ``text`` into one space character.

    Leading and trailing runs are collapsed to a single space as well; they
    are not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Data cleaning
def dataCleaning(text):
    """Run the full regex clean-up pipeline on one piece of text.

    Steps, in order: remove hyperlinks, spell out '@', spell out '&'/'+',
    remove punctuation, remove digits, replace line breaks, collapse
    whitespace, then lowercase and strip the result.

    Args:
        text (str): Raw text of a post or comment.

    Returns:
        str: The cleaned, lowercased text; may be empty.
    """
    # Links are removed first: expanding '@', '&' or '+' beforehand would
    # rewrite characters inside a URL (and insert spaces into it) before the
    # link is recognised.
    text = removeHyperlink(text)
    text = replaceAt(text)
    text = replaceAnd(text)
    text = removePunctuations(text)
    text = removeNumbers(text)
    text = replaceNewLine(text)
    text = removeAdditionalSpace(text)

    return text.lower().strip() # lowercase all words

In [4]:
# Further pre-processing
# Rows without text are dropped *before* cleaning: dataCleaning works on str
# only, so filtering missing values afterwards could never take effect.
# The chain builds a new frame instead of mutating `df` in place, so the cell
# gives the same result when it is re-run.
df = (
    df
    .dropna(subset=['text'])
    .assign(text_clean=lambda frame: frame['text'].map(dataCleaning))
    # discard rows whose text is empty once cleaned
    .loc[lambda frame: frame['text_clean'].map(len) > 0]
    # keep only the first occurrence of each cleaned text
    .drop_duplicates(subset=['text_clean'])
)
df

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0),manual_label,text_clean
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??€?can see the ceiling, due to mar...",0.0,1.0,NEGATIVE,workers can see the ceiling due to market matu...
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of d...,,0.0,NEUTRAL,the country will be broke and in masses of deb...
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 jobs...",,0.0,NEUTRAL,the us economy added a whopping jobs last mont...
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,”I don’t have cobol mentioned on my resume” ok...,,0.0,NEUTRAL,i dont have cobol mentioned on my resume okay ...
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,0.0,1.0,NEGATIVE,years thats like a century in it quake and dia...
...,...,...,...,...,...,...,...,...,...,...
995,post,r/FinancialCareers,Unlikely-Strategy596,1/30/2023 0:09,197,Just got laid off. 1st year IB analyst.I was j...,0.0,1.0,NEGATIVE,just got laid off st year ib analysti was just...
996,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB AnalystJ...,0.0,1.0,NEGATIVE,update just got laid off st year ib analystjus...
997,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,0.0,1.0,NEGATIVE,offer rescinded at large bank was laid off bef...
998,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,0.0,1.0,NEGATIVE,what roles are the first to get laid off in an...


### Store Dataset

In [5]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='test_combined_clean.csv', index=False)