# **Importing Python Packages**

In [1]:
# Import necessary packages 
%matplotlib inline
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob
nltk.download('words')
words = set(nltk.corpus.words.words())
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package words to
[nltk_data]     /Users/fuchunyang/nltk_data...
[nltk_data]   Package words is already up-to-date!


# **Importing Data**

In [2]:
# import the data saved from the r/health subreddit 
df = pd.read_csv('data1.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name
0,0,Health,9 million children to be vaccinated against po...,,0.97,227,0,227,1,9,t3_tm4duo
1,1,Health,Male birth control pill 99 per cent effective ...,,0.77,9,0,9,0,1,t3_tmvnpp
2,2,Health,FDA: Unsanitary Conditions Found at Baby Food ...,,1.0,8,0,8,0,1,t3_tmqs4n
3,3,Health,California governor signs law that makes abort...,,0.95,791,0,791,0,20,t3_tkwola
4,4,Health,Federal judge sides with 12 disabled kids seek...,,0.88,6,0,6,0,0,t3_tmvkjk


In [3]:
# Dimension of the dataset
df.shape

(6473, 11)

# **Removing unnecessary columns**

In [4]:
# Remove the first column that shows "subreddit"
n_df = df.drop(columns = ['subreddit'], axis = 1)
n_df = n_df.drop(n_df.columns[0], axis=1) 
n_df.head()

Unnamed: 0,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name
0,9 million children to be vaccinated against po...,,0.97,227,0,227,1,9,t3_tm4duo
1,Male birth control pill 99 per cent effective ...,,0.77,9,0,9,0,1,t3_tmvnpp
2,FDA: Unsanitary Conditions Found at Baby Food ...,,1.0,8,0,8,0,1,t3_tmqs4n
3,California governor signs law that makes abort...,,0.95,791,0,791,0,20,t3_tkwola
4,Federal judge sides with 12 disabled kids seek...,,0.88,6,0,6,0,0,t3_tmvkjk


# **Data Pre-Processing: Removal of symbols, URLS, etc.**

In [5]:
# remove the hashtags 
n_df['title'] = n_df['title'].str.replace("[^a-zA-Z#]", " ")
n_df['title'].head()

0      million children to be vaccinated against po...
1    Male birth control pill    per cent effective ...
2    FDA  Unsanitary Conditions Found at Baby Food ...
3    California governor signs law that makes abort...
4    Federal judge sides with    disabled kids seek...
Name: title, dtype: object

In [6]:
# title text cleaning; remove several uncessary things such as @, http links, hashtags, etc. This will prepare the data to be ready for sentiment analysis
def clean_title(temp):
    temp = str(temp).lower()
    temp = re.sub("'", "", str(temp)) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", str(temp)) # Removing @mentions
    temp = re.sub("#[A-Za-z0-9_]+","", str(temp)) # Removing # hashtags
    temp = re.sub(r"www.\S+", "", temp) # Removing weblinks
    temp = re.sub(r"http\S+", "", temp)  # Removing http weblinks
    temp = re.sub('[()!?]', ' ', temp) # Removing special symbols
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    stopwords = stop_words
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp

n_df['title'] = n_df['title'].map(lambda x: clean_title(x))
n_df['selftext'] = n_df['selftext'].map(lambda x: clean_title(x))
n_df['title_selftext'] = n_df['title'].astype(str) + "  " + n_df['selftext']
n_df.head()

Unnamed: 0,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name,title_selftext
0,million children vaccinated polio africa,,0.97,227,0,227,1,9,t3_tm4duo,million children vaccinated polio africa nan
1,male birth control pill per cent effective mic...,,0.77,9,0,9,0,1,t3_tmvnpp,male birth control pill per cent effective mic...
2,fda unsanitary conditions found baby food factory,,1.0,8,0,8,0,1,t3_tmqs4n,fda unsanitary conditions found baby food fact...
3,california governor signs law makes abortions ...,,0.95,791,0,791,0,20,t3_tkwola,california governor signs law makes abortions ...
4,federal judge sides disabled kids seeking mask...,,0.88,6,0,6,0,0,t3_tmvkjk,federal judge sides disabled kids seeking mask...


In [7]:
# remove short words less than three letters 
n_df['title'] = n_df['title'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
n_df['selftext'] = n_df['selftext'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
n_df['title_selftext'] = n_df['title_selftext'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# **Tokenization**

In [8]:
# separate the whole title sentence into different words so we can do the sentiment analysis about each word 
n_df['tokenized_titles'] = n_df['title'].apply(lambda x: x.split())
n_df['tokenized_titles'].head()

0       [million, children, vaccinated, polio, africa]
1    [male, birth, control, pill, cent, effective, ...
2    [unsanitary, conditions, found, baby, food, fa...
3    [california, governor, signs, makes, abortions...
4    [federal, judge, sides, disabled, kids, seekin...
Name: tokenized_titles, dtype: object

In [9]:
# separate the whole selftext sentence into different words so we can do the sentiment analysis about each word 
n_df['tokenized_selftext'] = n_df['selftext'].apply(lambda x: x.split())
n_df['tokenized_selftext'].head()

0    []
1    []
2    []
3    []
4    []
Name: tokenized_selftext, dtype: object

In [10]:
# separate the combination of title and selftext into different words so we can do the sentiment analysis about each word 
n_df['tokenized_title_selftext'] = n_df['title_selftext'].apply(lambda x: x.split())
n_df['tokenized_title_selftext'].head()

0       [million, children, vaccinated, polio, africa]
1    [male, birth, control, pill, cent, effective, ...
2    [unsanitary, conditions, found, baby, food, fa...
3    [california, governor, signs, makes, abortions...
4    [federal, judge, sides, disabled, kids, seekin...
Name: tokenized_title_selftext, dtype: object

# **Stemming**

In [11]:
# Implementing the stemming procedure; Stemming is a rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word. 
# For example “play”, “player”, “played”, “plays” and “playing” are the different variations of the word – “play”.
stemmer = PorterStemmer()
tokenized_titles = n_df['tokenized_titles'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
n_df['tokenized_titles']= tokenized_titles
n_df.head()

Unnamed: 0,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name,title_selftext,tokenized_titles,tokenized_selftext,tokenized_title_selftext
0,million children vaccinated polio africa,,0.97,227,0,227,1,9,t3_tm4duo,million children vaccinated polio africa,"[million, children, vaccin, polio, africa]",[],"[million, children, vaccinated, polio, africa]"
1,male birth control pill cent effective mice tr...,,0.77,9,0,9,0,1,t3_tmvnpp,male birth control pill cent effective mice tr...,"[male, birth, control, pill, cent, effect, mic...",[],"[male, birth, control, pill, cent, effective, ..."
2,unsanitary conditions found baby food factory,,1.0,8,0,8,0,1,t3_tmqs4n,unsanitary conditions found baby food factory,"[unsanitari, condit, found, babi, food, factori]",[],"[unsanitary, conditions, found, baby, food, fa..."
3,california governor signs makes abortions cheaper,,0.95,791,0,791,0,20,t3_tkwola,california governor signs makes abortions cheaper,"[california, governor, sign, make, abort, chea...",[],"[california, governor, signs, makes, abortions..."
4,federal judge sides disabled kids seeking mask...,,0.88,6,0,6,0,0,t3_tmvkjk,federal judge sides disabled kids seeking mask...,"[feder, judg, side, disabl, kid, seek, mask, s...",[],"[federal, judge, sides, disabled, kids, seekin..."


In [12]:
# Implementing the stemming procedure for selftext 
stemmer = PorterStemmer()
tokenized_selftext = n_df['tokenized_selftext'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
n_df['tokenized_selftext']= tokenized_selftext
n_df.head()

Unnamed: 0,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name,title_selftext,tokenized_titles,tokenized_selftext,tokenized_title_selftext
0,million children vaccinated polio africa,,0.97,227,0,227,1,9,t3_tm4duo,million children vaccinated polio africa,"[million, children, vaccin, polio, africa]",[],"[million, children, vaccinated, polio, africa]"
1,male birth control pill cent effective mice tr...,,0.77,9,0,9,0,1,t3_tmvnpp,male birth control pill cent effective mice tr...,"[male, birth, control, pill, cent, effect, mic...",[],"[male, birth, control, pill, cent, effective, ..."
2,unsanitary conditions found baby food factory,,1.0,8,0,8,0,1,t3_tmqs4n,unsanitary conditions found baby food factory,"[unsanitari, condit, found, babi, food, factori]",[],"[unsanitary, conditions, found, baby, food, fa..."
3,california governor signs makes abortions cheaper,,0.95,791,0,791,0,20,t3_tkwola,california governor signs makes abortions cheaper,"[california, governor, sign, make, abort, chea...",[],"[california, governor, signs, makes, abortions..."
4,federal judge sides disabled kids seeking mask...,,0.88,6,0,6,0,0,t3_tmvkjk,federal judge sides disabled kids seeking mask...,"[feder, judg, side, disabl, kid, seek, mask, s...",[],"[federal, judge, sides, disabled, kids, seekin..."


In [13]:
# Implementing the stemming procedure for the combination of title and selftext 
stemmer = PorterStemmer()
tokenized_title_selftext = n_df['tokenized_title_selftext'].apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
n_df['tokenized_title_selftext']= tokenized_title_selftext
n_df.head()

Unnamed: 0,title,selftext,upvote_ratio,ups,downs,score,total_awards_received,num_comments,name,title_selftext,tokenized_titles,tokenized_selftext,tokenized_title_selftext
0,million children vaccinated polio africa,,0.97,227,0,227,1,9,t3_tm4duo,million children vaccinated polio africa,"[million, children, vaccin, polio, africa]",[],"[million, children, vaccin, polio, africa]"
1,male birth control pill cent effective mice tr...,,0.77,9,0,9,0,1,t3_tmvnpp,male birth control pill cent effective mice tr...,"[male, birth, control, pill, cent, effect, mic...",[],"[male, birth, control, pill, cent, effect, mic..."
2,unsanitary conditions found baby food factory,,1.0,8,0,8,0,1,t3_tmqs4n,unsanitary conditions found baby food factory,"[unsanitari, condit, found, babi, food, factori]",[],"[unsanitari, condit, found, babi, food, factori]"
3,california governor signs makes abortions cheaper,,0.95,791,0,791,0,20,t3_tkwola,california governor signs makes abortions cheaper,"[california, governor, sign, make, abort, chea...",[],"[california, governor, sign, make, abort, chea..."
4,federal judge sides disabled kids seeking mask...,,0.88,6,0,6,0,0,t3_tmvkjk,federal judge sides disabled kids seeking mask...,"[feder, judg, side, disabl, kid, seek, mask, s...",[],"[feder, judg, side, disabl, kid, seek, mask, s..."


# **Text Classification using TextBlob**


The sentiment function of textblob returns two properties, polarity, and subjectivity. Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement.


Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].

In [14]:
# Creating columns with the result of the sentiment text analysis for title 
n_df['polarity_t'] = n_df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)
n_df['subjectivity_t'] = n_df['title'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
n_df[['title', 'polarity_t', 'subjectivity_t']].head()

Unnamed: 0,title,polarity_t,subjectivity_t
0,million children vaccinated polio africa,0.0,0.0
1,male birth control pill cent effective mice tr...,0.3,0.45
2,unsanitary conditions found baby food factory,0.0,0.0
3,california governor signs makes abortions cheaper,0.0,0.0
4,federal judge sides disabled kids seeking mask...,-0.2,0.3


In [15]:
# Creating columns with the result of the sentiment text analysis for selftext  
n_df['polarity_s'] = n_df['selftext'].apply(lambda x: TextBlob(x).sentiment.polarity)
n_df['subjectivity_s'] = n_df['selftext'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
n_df[['selftext', 'polarity_s', 'subjectivity_s']].head()

Unnamed: 0,selftext,polarity_s,subjectivity_s
0,,0.0,0.0
1,,0.0,0.0
2,,0.0,0.0
3,,0.0,0.0
4,,0.0,0.0


In [16]:
# Creating columns with the result of the sentiment text analysis for combination of title and selftext 
n_df['polarity_ts'] = n_df['title_selftext'].apply(lambda x: TextBlob(x).sentiment.polarity)
n_df['subjectivity_ts'] = n_df['title_selftext'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
n_df[['title_selftext', 'polarity_ts', 'subjectivity_ts']].head()

Unnamed: 0,title_selftext,polarity_ts,subjectivity_ts
0,million children vaccinated polio africa,0.0,0.0
1,male birth control pill cent effective mice tr...,0.3,0.45
2,unsanitary conditions found baby food factory,0.0,0.0
3,california governor signs makes abortions cheaper,0.0,0.0
4,federal judge sides disabled kids seeking mask...,-0.2,0.3


# **Sentiment Analysis using Polarity**


In [17]:
# Utility function to classify the polarity of the title of a post using textblob.
def analyze_sentiment(title):
    '''
    Utility function to classify the polarity of a Reddit post
    using textblob.
    '''
    analysis = TextBlob(clean_title(title))
    if analysis.sentiment.polarity > 0:
        return 1 # Positive
    elif analysis.sentiment.polarity == 0:
        return 0 # Neutral
    else:
        return -1 # Negative

In [18]:
# Utility function to classify the polarity of the selftext of a post using textblob.
def analyze_sentiment(selftext):
    '''
    Utility function to classify the polarity of a Reddit post
    using textblob.
    '''
    analysis = TextBlob(clean_title(selftext))
    if analysis.sentiment.polarity > 0:
        return 1 # Positive
    elif analysis.sentiment.polarity == 0:
        return 0 # Neutral
    else:
        return -1 # Negative

In [19]:
# Utility function to classify the polarity of the combination of selftext and title of a post using textblob.
def analyze_sentiment(title_selftext):
    '''
    Utility function to classify the polarity of a Reddit post
    using textblob.
    '''
    analysis = TextBlob(clean_title(title_selftext))
    if analysis.sentiment.polarity > 0:
        return 1 # Positive
    elif analysis.sentiment.polarity == 0:
        return 0 # Neutral
    else:
        return -1 # Negative

In [20]:
# Creating a sentiment column with the result of the analysis for title 
n_df['sentiment_t'] = np.array([ analyze_sentiment(title) for title in n_df['title'] ])
n_df[['title', 'sentiment_t']].head()

Unnamed: 0,title,sentiment_t
0,million children vaccinated polio africa,0
1,male birth control pill cent effective mice tr...,1
2,unsanitary conditions found baby food factory,0
3,california governor signs makes abortions cheaper,0
4,federal judge sides disabled kids seeking mask...,-1


In [21]:
# Creating a sentiment column with the result of the analysis for selftext 
n_df['sentiment_s'] = np.array([ analyze_sentiment(title) for title in n_df['selftext'] ])
n_df[['selftext', 'sentiment_s']].head()

Unnamed: 0,selftext,sentiment_s
0,,0
1,,0
2,,0
3,,0
4,,0


In [22]:
# Creating a sentiment column with the result of the analysis for the combination of title and selftext 
n_df['sentiment_ts'] = np.array([ analyze_sentiment(title) for title in n_df['title_selftext'] ])
n_df[['title_selftext', 'sentiment_ts']].head()

Unnamed: 0,title_selftext,sentiment_ts
0,million children vaccinated polio africa,0
1,male birth control pill cent effective mice tr...,1
2,unsanitary conditions found baby food factory,0
3,california governor signs makes abortions cheaper,0
4,federal judge sides disabled kids seeking mask...,-1


In [23]:
# Converting the values into string texts
n_df['sentiment_title'] = n_df.sentiment_t.map({1: "Positive", 0:"Neutral", -1:"Negative"})
n_df[['title', 'sentiment_t', 'sentiment_title']].head()

Unnamed: 0,title,sentiment_t,sentiment_title
0,million children vaccinated polio africa,0,Neutral
1,male birth control pill cent effective mice tr...,1,Positive
2,unsanitary conditions found baby food factory,0,Neutral
3,california governor signs makes abortions cheaper,0,Neutral
4,federal judge sides disabled kids seeking mask...,-1,Negative


In [24]:
# Converting the values into string texts
n_df['sentiment_selftext'] = n_df.sentiment_s.map({1: "Positive", 0:"Neutral", -1:"Negative"})
n_df[['selftext', 'sentiment_s', 'sentiment_selftext']].head()

Unnamed: 0,selftext,sentiment_s,sentiment_selftext
0,,0,Neutral
1,,0,Neutral
2,,0,Neutral
3,,0,Neutral
4,,0,Neutral


In [25]:
# Converting the values into string texts
n_df['sentiment_title_selftext'] = n_df.sentiment_ts.map({1: "Positive", 0:"Neutral", -1:"Negative"})
n_df[['title_selftext', 'sentiment_ts', 'sentiment_title_selftext']].head()

Unnamed: 0,title_selftext,sentiment_ts,sentiment_title_selftext
0,million children vaccinated polio africa,0,Neutral
1,male birth control pill cent effective mice tr...,1,Positive
2,unsanitary conditions found baby food factory,0,Neutral
3,california governor signs makes abortions cheaper,0,Neutral
4,federal judge sides disabled kids seeking mask...,-1,Negative


# **Export CSV**


In [26]:
# Exporting the results into PDF
n_df.to_csv('./sentiment_data.csv', index=True, header=True)