In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Read the raw labelled comments from disk and display the resulting frame.
df_reddit = pd.read_csv(filepath_or_buffer='raw_data/reddit_comment.csv')
df_reddit

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [3]:
# List every distinct sentiment tag that occurs in the data.
df_reddit.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

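##### ['empty'] = NULL (treated as missing and dropped below)
##### ['sadness', 'worry', 'hate', 'boredom', 'anger'] = Negative (label -1)
##### ['enthusiasm', 'surprise', 'love', 'fun', 'happiness', 'relief'] = Positive (label 1)
##### ['neutral'] = Neutral (removed below, so it receives no label)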

In [4]:
# Peek at ten randomly chosen rows.
df_reddit.sample(n=10)

Unnamed: 0,tweet_id,sentiment,author,content
6690,1961416663,neutral,Partyplanqueen,@LisaTalkingTots sorry to hear your news
2381,1957532215,worry,phillipsdj,Being dragged round Ikea this morning Bad times!
16573,1965390017,worry,EwMedia,"Had a great day, apart from the fact that I bo..."
32658,1752353909,neutral,SedzOz,@butterflykate Who've u noticed now Kate ?
34464,1752873041,love,lilladylilone,"ugh my head, when will all the headaches stop...."
18065,1965875074,worry,big_red724,is having trouble breathing through the pain.....
759,1957141723,neutral,nico80013,@soonseeofpred becuz you braggin
35570,1753136190,neutral,wineott,@alisongo i heard it is not illegal unless you...
27900,1695964127,love,Deenanemily,@WSPNews Happy Monday to you as well! After a...
24137,1694769757,happiness,tweryll,@monicafrancesca congratulations! woootwoooo! ...


In [5]:
# Per-column count of the rows whose sentiment tag is 'empty'.
df_reddit.loc[df_reddit['sentiment'] == 'empty'].count()

tweet_id     827
sentiment    827
author       827
content      827
dtype: int64

### Removing Null Values

In [6]:
# Turn the placeholder tag 'empty' into a real missing value so that it can be
# removed with dropna(). np.nan is used because the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
df_reddit['sentiment'] = df_reddit['sentiment'].replace('empty', np.nan)
# Per-column count of missing values after the replacement.
df_reddit.isnull().sum()

tweet_id       0
sentiment    827
author         0
content        0
dtype: int64

In [7]:
# Discard every row that holds a missing value in any column,
# then confirm that no missing values remain.
df_reddit = df_reddit.dropna(axis=0, how='any')
df_reddit.isna().sum()

tweet_id     0
sentiment    0
author       0
content      0
dtype: int64

### Removing Neutral Comments

In [8]:
# Per-column count of the rows whose sentiment tag is 'neutral'.
df_reddit.loc[df_reddit['sentiment'] == 'neutral'].count()

tweet_id     8638
sentiment    8638
author       8638
content      8638
dtype: int64

In [9]:
# Keep only the rows whose sentiment tag is something other than 'neutral'.
df_reddit = df_reddit[df_reddit['sentiment'].ne('neutral')]
df_reddit.sample(n=10)

Unnamed: 0,tweet_id,sentiment,author,content
25018,1694978622,worry,heysteffi,@fobchick08 You lucky girl. Tell me all about ...
3106,1957701853,sadness,TarrynLeighEia,"@SamGrierson awwww.... , ah well at least I..."
35044,1753009556,worry,ladyluckbug,Can't sleep. Fucking morning will come to soon...
35764,1753176703,happiness,Bronwyn,Also I was conviced to do a happy dance and th...
7726,1961871394,happiness,rkalajian,@mizzyalana Very nice! Bea and I are so bumme...
12290,1963524741,happiness,kenlad,"nice night, should be golfing"
11655,1963308203,sadness,BARBiE_BABiE,@AgesTheGreat UGH! &amp; DiDNT HiT ME UP? OK ...
21940,1694330012,happiness,JessTemby,Has had a really good bank hoilday
3879,1960128438,worry,JulieBramman,went to Marsee-they changed the store around &...
38703,1753696021,love,kethni,"Exchange story gone, yaaaay! On with Sweet Cha..."


In [10]:
# Confirm what kind of object a single column selection returns.
type(df_reddit.sentiment)

pandas.core.series.Series

### Grouping Sentiments into positive and negative 

In [11]:
%%time

# Raw sentiment tags that make up each polarity group.
negative_sentiments = ['sadness', 'worry', 'hate', 'boredom', 'anger']
positive_sentiments = ['enthusiasm', 'surprise', 'love', 'fun', 'happiness', 'relief']
neutral_sentiments = ['neutral']  # listed for completeness; given no numeric label


def grouping_sentiments(col):
    """Map raw sentiment tags to numeric polarity labels.

    Tags listed in ``negative_sentiments`` become -1 and tags listed in
    ``positive_sentiments`` become 1. Any other value is passed through
    unchanged.

    Parameters
    ----------
    col : pandas.Series
        Series of sentiment tags. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``col``. When every tag is mapped,
        the result holds integers; otherwise it keeps the unmapped values
        alongside the integer labels.
    """
    # Build one lookup table instead of issuing a separate replace() per tag.
    # Negative tags are written last so that they win if a tag is ever listed
    # in both groups, matching a negatives-first replacement order.
    label_by_tag = {
        **dict.fromkeys(positive_sentiments, 1),
        **dict.fromkeys(negative_sentiments, -1),
    }

    # map() infers the result dtype from the produced values, so a fully
    # mapped column is integer-typed without relying on replace()'s deprecated
    # object-to-int downcasting.
    return col.map(lambda tag: label_by_tag.get(tag, tag))

# Store the result of grouping_sentiments for each row's sentiment tag
# in a new 'label' column, then show the first five rows.
df_reddit['label'] = df_reddit['sentiment'].pipe(grouping_sentiments)
df_reddit.head()

CPU times: user 13.7 ms, sys: 1.65 ms, total: 15.3 ms
Wall time: 14.5 ms


Unnamed: 0,tweet_id,sentiment,author,content,label
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,-1
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,-1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,1
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,-1
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",-1


In [12]:
# Rename the 'content' column to 'text'.
df_reddit = df_reddit.rename({'content': 'text'}, axis='columns')
df_reddit

Unnamed: 0,tweet_id,sentiment,author,text,label
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,-1
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,-1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,1
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,-1
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",-1
...,...,...,...,...,...
39994,1753918900,happiness,courtside101,Succesfully following Tayla!!,1
39996,1753919001,love,drapeaux,Happy Mothers Day All my love,1
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...,1
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1


### Selecting the last two columns: text and label

In [13]:
%%time 
# Keep only the text and its label. The columns are selected by name rather
# than by position, so the result does not depend on the column order.
df = df_reddit[['text', 'label']]
df

CPU times: user 692 µs, sys: 41 µs, total: 733 µs
Wall time: 720 µs


Unnamed: 0,text,label
1,Layin n bed with a headache ughhhh...waitin o...,-1
2,Funeral ceremony...gloomy friday...,-1
3,wants to hang out with friends SOON!,1
5,Re-pinging @ghostridah14: why didn't you go to...,-1
6,"I should be sleep, but im not! thinking about ...",-1
...,...,...
39994,Succesfully following Tayla!!,1
39996,Happy Mothers Day All my love,1
39997,Happy Mother's Day to all the mommies out ther...,1
39998,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1


In [14]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30535 entries, 1 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    30535 non-null  object
 1   label   30535 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 715.7+ KB


In [15]:
# Write every row of the frame. Slicing with df[1:] here would silently
# discard the first record. The row index is written as the first CSV column.
df.to_csv('formatted_data/reddit_comment.csv')