In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Read the raw comment dump into a DataFrame; the bare name below renders a preview.
df_reddit = pd.read_csv(filepath_or_buffer='raw_data/reddit_comment.csv')
df_reddit

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [3]:
# List the distinct values that occur in the sentiment column.
df_reddit.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

##### ['empty'] = NULL
##### ['sadness', 'worry', 'hate', 'boredom', 'anger'] = Negative
##### ['enthusiasm', 'surprise', 'love', 'fun', 'happiness', 'relief'] = Positive
##### ['neutral'] = Neutral

In [4]:
# Eyeball ten randomly chosen rows (unseeded, so the rows differ on every run).
df_reddit.sample(n=10)

Unnamed: 0,tweet_id,sentiment,author,content
32579,1752350702,worry,Shinigumi,"@schappie That's just weird... :\ Oh, and wha..."
10944,1963065595,empty,PRiNCESSNAiSA,gotta do my ewrt outline then tennis then work...
720,1957133079,neutral,sukasukaariska,"Damn, hi Facebook ? what's wrong with you ? ca..."
9063,1962410358,sadness,AmeenaS,Tonight the last night of Jay Leno!!! am gonn...
16603,1965399026,worry,loyaleagle,went to bestbuy today and found that they had ...
17167,1965546055,worry,sofiaramirezU,En el salon.. Que caliente el blower ouch
35067,1753028785,enthusiasm,Ahrae,Night peeps. Hope you all had a great day! Unt...
18597,1966029483,sadness,gq637,my back hurts have a heating pad on it... stu...
10903,1963051039,enthusiasm,ButMadNNW,@NathanFillion You're going to the UK?! Take m...
12446,1963573762,worry,kaemdros,Every pair of jeans I own nowadays is very tig...


In [5]:
# Per-column non-null count of the rows whose sentiment is the 'empty' placeholder.
df_reddit[df_reddit['sentiment'].eq('empty')].count()

tweet_id     827
sentiment    827
author       827
content      827
dtype: int64

### Removing Null Values

In [6]:
# Turn the 'empty' placeholder into a real missing value so dropna() can remove it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_reddit['sentiment'] = df_reddit['sentiment'].replace('empty', np.nan)
df_reddit.isnull().sum()

tweet_id       0
sentiment    827
author         0
content        0
dtype: int64

In [7]:
# Discard every row that has a missing value in any column, then re-check the null counts.
df_reddit = df_reddit.dropna(axis=0, how='any')
df_reddit.isna().sum()

tweet_id     0
sentiment    0
author       0
content      0
dtype: int64

### Removing Neutral Comments

In [8]:
# Per-column non-null count of the rows labelled 'neutral'.
df_reddit[df_reddit['sentiment'].eq('neutral')].count()

tweet_id     8638
sentiment    8638
author       8638
content      8638
dtype: int64

In [9]:
# Keep only the rows whose sentiment is something other than 'neutral'.
df_reddit = df_reddit[df_reddit['sentiment'].ne('neutral')]
df_reddit.sample(n=10)

Unnamed: 0,tweet_id,sentiment,author,content
30829,1751749196,love,melodieus,"@sweetangieollie ooh a storm, that's always we..."
6805,1961453470,sadness,laurnie36,my apartment is emptying out more &amp; more e...
39156,1753774190,happiness,tsarnick,"@MissxMarisa hahaha, it's *massive* compared t..."
5311,1960752961,sadness,Xyense,&quot;I only think of you as breaking my heart...
21302,1694203946,happiness,nic0lepaula,I forgot I have lumpia and pancit in my fridge...
8160,1962056876,worry,ghurm,zoita had a cardio apt. 2day. doc says her hol...
17934,1965836043,worry,RecipeGirl,@nofearentertain Just read up on your Mom Ho...
1968,1957444593,sadness,PrincessEleasha,is seriously heartbroken!!!
7905,1961923067,happiness,silowyi,Totally been too long since I updated. Saw St...
3498,1958061020,worry,martijnwillems,isn't singing Oh what a beautiful day though i...


In [10]:
# Confirm which Python type a single selected column has.
type(df_reddit.sentiment)

pandas.core.series.Series

### Grouping Sentiments into positive and negative 

In [11]:
%%time

negative_sentiments = ['sadness', 'worry', 'hate', 'boredom', 'anger']
positive_sentiments = ['enthusiasm', 'surprise', 'love', 'fun', 'happiness', 'relief']
neutral_sentiments = ['neutral']


def grouping_sentiments(col):
    """Collapse fine-grained sentiment names into polarity labels.

    Parameters
    ----------
    col : pandas.Series
        Sentiment names, e.g. ``df_reddit['sentiment']``.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``col``: -1 for every name listed in
        ``negative_sentiments``, 1 for every name listed in
        ``positive_sentiments``. Any other value is passed through unchanged.
    """
    # Built on each call so that edits to the module-level lists are honoured.
    polarity = {name: -1 for name in negative_sentiments}
    polarity.update({name: 1 for name in positive_sentiments})

    # A single pass over the column instead of one full-column replace() per
    # sentiment name. map() infers an integer dtype when every value was mapped,
    # so the label dtype does not depend on replace() downcasting object columns.
    return col.map(lambda name: polarity.get(name, name))

# df_reddit is the result of boolean filtering, so setting a column on it in place can
# raise SettingWithCopyWarning; assign() builds a new frame with the label column instead.
df_reddit = df_reddit.assign(label=grouping_sentiments(df_reddit['sentiment']))
df_reddit.head(5)

CPU times: user 23.7 ms, sys: 3.77 ms, total: 27.4 ms
Wall time: 25.5 ms


Unnamed: 0,tweet_id,sentiment,author,content,label
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,-1
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,-1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,1
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,-1
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",-1


In [12]:
# Renaming the content features to text
df_reddit = df_reddit.rename(columns={'content':'text'})
df_reddit

Unnamed: 0,tweet_id,sentiment,author,text,label
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,-1
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,-1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,1
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,-1
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",-1
...,...,...,...,...,...
39994,1753918900,happiness,courtside101,Succesfully following Tayla!!,1
39996,1753919001,love,drapeaux,Happy Mothers Day All my love,1
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...,1
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1


### Selecting the last two columns: `text` and `label`

In [13]:
%%time 
# Select the two columns by name rather than by position, so the result does not
# depend on the order in which columns were added to df_reddit.
df = df_reddit[['text', 'label']]
df

CPU times: user 1.27 ms, sys: 79 µs, total: 1.35 ms
Wall time: 1.28 ms


Unnamed: 0,text,label
1,Layin n bed with a headache ughhhh...waitin o...,-1
2,Funeral ceremony...gloomy friday...,-1
3,wants to hang out with friends SOON!,1
5,Re-pinging @ghostridah14: why didn't you go to...,-1
6,"I should be sleep, but im not! thinking about ...",-1
...,...,...
39994,Succesfully following Tayla!!,1
39996,Happy Mothers Day All my love,1
39997,Happy Mother's Day to all the mommies out ther...,1
39998,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1


In [15]:
# Summarise the index, column dtypes, non-null counts and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30535 entries, 1 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    30535 non-null  object
 1   label   30535 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 715.7+ KB


In [16]:
# Number of rows per label value, to check how balanced the two classes are.
df['label'].value_counts()

 1    15299
-1    15236
Name: label, dtype: int64

In [15]:
# Write every row: the previous df[1:] slice operated on rows and silently dropped
# the first record. The index is still written, as before.
df.to_csv('formatted_data/reddit_comment.csv')