# Data Splitting

### Import Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Load Dataset

In [2]:
comment_test_df = pd.read_csv('comment_test.csv') 
comment_test_df = comment_test_df[:600]
comment_test_df.insert(0, 'category','comment')
comment_test_df

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0)
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??€?can see the ceiling, due to mar...",0.0,1
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of d...,,0
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 jobs...",,0
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,”I don’t have cobol mentioned on my resume” ok...,,0
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,0.0,1
...,...,...,...,...,...,...,...,...
595,comment,r/cscareerquestions,michaelochurch,9/9/2015 12:09,1,"> On top of that, the office atmosphere felt q...",,0
596,comment,r/cscareerquestions,kidcurry96,5/7/2022 20:01,1,> one company for 3 years jeez\nIssue is not w...,,0
597,comment,r/cscareerquestions,Pand9,11/10/2017 22:19,3,> or keeping a blog of your learnings\nCould y...,,0
598,comment,r/cscareerquestions,GhostBond,5/1/2020 0:20,2,> Our industry especially is way over saturate...,,0


In [3]:
post_test_df = pd.read_csv('post_test.csv') 
post_test_df = post_test_df[:400]
post_test_df.insert(0, 'category','post')
post_test_df

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0)
0,post,r/usa,Barch3,2/15/2023 21:26,0,White House: The GOP’s Current Agenda Would Ad...,0.0,1.0
1,post,r/singapore,patricklhe,1/2/2023 8:26,33,High credit card debt in Singapore not a conce...,1.0,1.0
2,post,r/unitedkingdom,allenthalben2,1/2/2023 21:10,517,"UK faces worst and longest recession in G7, sa...",0.0,1.0
3,post,r/unitedkingdom,DrCalFun,1/4/2023 13:27,456,The UK recession will be almost as deep as tha...,0.0,1.0
4,post,r/usa,Barch3,1/10/2023 1:38,1,Trump posts all-caps rant demanding 'tough' Re...,0.0,1.0
...,...,...,...,...,...,...,...,...
395,post,r/FinancialCareers,Unlikely-Strategy596,1/30/2023 0:09,197,Just got laid off. 1st year IB analyst.I was j...,0.0,1.0
396,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB AnalystJ...,0.0,1.0
397,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,0.0,1.0
398,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,0.0,1.0


### Manual Label

In [4]:
df = pd.concat([comment_test_df, post_test_df], ignore_index=True) 
df['manual_label'] = df['positive (1) / negative (0)'].map(lambda x: 'POSITIVE' if x == 1 else 'NEGATIVE' if x == 0 else 'NEUTRAL')
df

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0),manual_label
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??€?can see the ceiling, due to mar...",0.0,1.0,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of d...,,0.0,NEUTRAL
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 jobs...",,0.0,NEUTRAL
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,”I don’t have cobol mentioned on my resume” ok...,,0.0,NEUTRAL
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,0.0,1.0,NEGATIVE
...,...,...,...,...,...,...,...,...,...
995,post,r/FinancialCareers,Unlikely-Strategy596,1/30/2023 0:09,197,Just got laid off. 1st year IB analyst.I was j...,0.0,1.0,NEGATIVE
996,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB AnalystJ...,0.0,1.0,NEGATIVE
997,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,0.0,1.0,NEGATIVE
998,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,0.0,1.0,NEGATIVE


In [5]:
df.to_csv('reddit_manual_label.csv', index=False)

### Train-test Split

In [6]:
df1 = pd.read_csv('test_combined_clean.csv', encoding='latin') 
df1

Unnamed: 0,category,subreddit,author,created_date,score,text,positive (1) / negative (0),opinionated (1) / neutral (0),manual_label,text_clean
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ??â¬?can see the ceiling, due to m...",0.0,1.0,NEGATIVE,workers can see the ceiling due to market matu...
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of d...,,0.0,NEUTRAL,the country will be broke and in masses of deb...
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 jobs...",,0.0,NEUTRAL,the us economy added a whopping jobs last mont...
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,âI donât have cobol mentioned on my resume...,,0.0,NEUTRAL,i dont have cobol mentioned on my resume okay ...
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,0.0,1.0,NEGATIVE,years thats like a century in it quake and dia...
...,...,...,...,...,...,...,...,...,...,...
991,post,r/FinancialCareers,Unlikely-Strategy596,1/30/2023 0:09,197,Just got laid off. 1st year IB analyst.I was j...,0.0,1.0,NEGATIVE,just got laid off st year ib analysti was just...
992,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB AnalystJ...,0.0,1.0,NEGATIVE,update just got laid off st year ib analystjus...
993,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,0.0,1.0,NEGATIVE,offer rescinded at large bank was laid off bef...
994,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,0.0,1.0,NEGATIVE,what roles are the first to get laid off in an...


In [7]:
df2 = pd.read_csv('reddit_combined_clean_label.csv', encoding='latin') 
df2

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2,label_3,final_label
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?????????can see the ceiling, due ...",workers can see the ceiling due to market matu...,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE,POSITIVE,POSITIVE,POSITIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?????????I don?????????t have cobol mentione...,i dont have cobol mentioned on my resume okay ...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...
20553,post,r/FinancialCareers,solo_dol0,4/3/2017 16:03,10,Laid off while interviewing Looking for some j...,laid off while interviewing looking for some j...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
20554,post,r/FinancialCareers,sharky_chups,12/4/2015 15:24,10,How to get laid off? Anyone have any good advi...,how to get laid off anyone have any good advic...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
20555,post,r/FinancialCareers,runitup30,10/8/2016 15:23,10,S&T: Laid off after 2 years Leaving this purpo...,sand t laid off after years leaving this purpo...,POSITIVE,NEUTRAL,NEGATIVE,NEUTRAL
20556,post,r/FinancialCareers,stmajor339329,10/18/2016 20:09,12,Where do people that are laid off mid career g...,where do people that are laid off mid career g...,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE


In [8]:
train_test_df = pd.merge(df2, df1[['text_clean','manual_label']], left_on='text_clean', right_on='text_clean', how='left')
train_test_df['label_2'].map(lambda x: train_test_df['manual_label'] if train_test_df['manual_label'] is not None else train_test_df['label_2'])
train_test_df.drop(columns=['manual_label'], inplace=True)

# Apply scoring
label_df = train_test_df[['label_1', 'label_2', 'label_3']].copy()
label_df['label_1_num'] = label_df['label_1'].map(lambda x: 1 if x == 'POSITIVE' else 0 if x == 'NEUTRAL' else -1)
label_df['label_2_num'] = label_df['label_2'].map(lambda x: 1 if x == 'POSITIVE' else 0 if x == 'NEUTRAL' else -1)
label_df['label_3_num'] = label_df['label_3'].map(lambda x: 1 if x == 'POSITIVE' else 0 if x == 'NEUTRAL' else -1)

label_df['final_label_num'] = label_df['label_1_num'] + label_df['label_2_num'] + label_df['label_3_num']
label_df['final_label'] = label_df['final_label_num'].map(lambda x: 'POSITIVE' if x > 0 else 'NEUTRAL' if x == 0 else 'NEGATIVE')

train_test_df['final_label'] = label_df['final_label']
train_test_df.head()

Unnamed: 0,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2,label_3,final_label
0,comment,r/china,SuspiciousStable9649,5/14/2022 3:53,1,"- Workers ?????????can see the ceiling, due ...",workers can see the ceiling due to market matu...,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE
1,comment,r/unitedkingdom,Phallic_Entity,11/1/2020 1:21,8,> The country will be broke and in masses of ...,the country will be broke and in masses of deb...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
2,comment,r/usa,dannylenwinn,4/2/2021 16:28,-1,"The U.S. economy added a whopping 916,000 job...",the us economy added a whopping jobs last mont...,NEGATIVE,POSITIVE,POSITIVE,POSITIVE
3,comment,r/cscareerquestions,,11/30/2022 21:11,26,?????????I don?????????t have cobol mentione...,i dont have cobol mentioned on my resume okay ...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
4,comment,r/cscareerquestions,soprof,11/17/2016 9:41,13,20 years...\nThat's like a century in IT. Quak...,years thats like a century in it quake and dia...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE


In [9]:
train_comment_df = train_test_df[train_test_df['category'] == 'comment'][600:]
train_post_df = train_test_df[train_test_df['category'] == 'post'][400:]

test_comment_df = train_test_df[train_test_df['category'] == 'comment'][:600]
test_post_df = train_test_df[train_test_df['category'] == 'post'][:400]

In [10]:
train_df = pd.concat([train_comment_df, train_post_df], ignore_index=True) 
test_df = pd.concat([test_comment_df, test_post_df], ignore_index=True)

train_df.insert(0, 'dataset','train')
test_df.insert(0, 'dataset','test')

In [11]:
final_df = pd.concat([train_df, test_df], ignore_index=True)
final_df

Unnamed: 0,dataset,category,subreddit,author,created_date,score,text,text_clean,label_1,label_2,label_3,final_label
0,train,comment,r/cscareerquestions,Linooney,4/30/2020 1:58,20,> People wait for existing faculty to die to h...,people wait for existing faculty to die to hav...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
1,train,comment,r/cscareerquestions,Dunan,4/30/2020 4:07,1,"> People who are being layed off, are they sof...",people who are being layed off are they softwa...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
2,train,comment,r/cscareerquestions,wtfnowdoIdo,1/2/2021 0:08,0,"> Personally, I switched jobs after 1.5 years ...",personally i switched jobs after years of expe...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
3,train,comment,r/cscareerquestions,nyamuk91,4/28/2020 16:31,6,"> PM's don't do performance reviews, that's fo...",pms dont do performance reviews thats for mana...,NEUTRAL,POSITIVE,NEUTRAL,POSITIVE
4,train,comment,r/cscareerquestions,OldSWEThrowaway,7/10/2021 2:19,-1,> pretty hard\nMore like impossible due to leg...,pretty hard more like impossible due to legal ...,POSITIVE,NEGATIVE,NEGATIVE,NEGATIVE
...,...,...,...,...,...,...,...,...,...,...,...,...
20553,test,post,r/FinancialCareers,Unlikely-Strategy596,1/31/2023 22:49,82,Update: just got laid off 1st year IB Analyst ...,update just got laid off st year ib analyst ju...,POSITIVE,POSITIVE,NEGATIVE,POSITIVE
20554,test,post,r/FinancialCareers,corymathews2011,1/18/2023 15:18,84,Offer rescinded at large bank. Was laid off be...,offer rescinded at large bank was laid off bef...,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
20555,test,post,r/FinancialCareers,Akatzman86,1/21/2023 21:34,2,What roles are the first to get laid off in an...,what roles are the first to get laid off in an...,POSITIVE,POSITIVE,NEUTRAL,POSITIVE
20556,test,post,r/FinancialCareers,lilac_congac,2/15/2023 18:27,2,what would you do if you were laid off from a ...,what would you do if you were laid off from a ...,NEGATIVE,POSITIVE,NEGATIVE,NEGATIVE


In [12]:
final_df.to_csv('reddit_combined_clean_label_split.csv', index=False)