In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import numpy as np
import pandas as pd
import os
os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "YES"


In [33]:
# Build connection
spark = SparkSession.builder. \
    master("local[*]"). \
    appName("PySpark"). \
    config("spark.driver.memory","16g"). \
    config("spark.driver.maxResultSize", "4g"). \
    getOrCreate()

In [34]:
# Load Dataset
data_path = 'data/convincing_data.csv'
data = spark.read.option("header",True).csv(data_path).rdd

In [18]:
data.take(1)

                                                                                

[Row(type='comment', id='imlcpab', subreddit.id='2qh1i', subreddit.name='askreddit', subreddit.nsfw='False', created_utc='1661990065', permalink='https://old.reddit.com/r/AskReddit/comments/x2fj3g/whats_a_controversial_topic_no_one_wants_to/imlcpab/', sentiment='0.469', score='2', body_cleaned="['need', 'chang', 'law', 'worth', 'sell', 'agricultur', 'product', 'us', 'rather', 'export', 'also', 'need', 'chang', 'law', 'monetari', 'penalti', 'grow', 'crop', 'particular', 'viabl', 'area', 'natur', 'climat', 'stand', 'right', 'neighbor', 'make', 'doubl', 'price', 'per', 'head', 'cattl', 'export', 'countri', 'would', 'sell', 'right', 'peopl', 'complain', 'climat', 'chang', 'probabl', 'complain']", climate_count='2', change_count='3', body_length='403', climate_proportion='0.004962779156327543', change_proportion='0.007444168734491315')]

In [19]:
def score_label(score):
    if score>0: return 1
    elif score<0: return -1
    else: return 0

In [35]:
id_label_content = data.map(lambda x: (x['id'], score_label(float(x['sentiment'])), x['body_cleaned']))

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluation(y, y_pred):
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    return accuracy, precision, recall, f1

### AFINN lexicon

In [36]:
# !pip install afinn
from afinn import Afinn
afinn = Afinn()
id_label_score_afinn = id_label_content.map(lambda x: (x[0], x[1], afinn.score(x[2])))   
id_true_pred_afinn = id_label_score_afinn.map(lambda x: (x[0], x[1], score_label(x[2])))

In [37]:
%%time
res_afinn = pd.DataFrame(id_true_pred_afinn.collect())
print('Accuracy, Precision, Recall, F1:', evaluation(res_afinn.iloc[:,1],res_afinn.iloc[:,2]))

                                                                                

Accuracy, Precision, Recall, F1: (0.7044006446364192, 0.7524181936953306, 0.7044006446364192, 0.7237871614428109)
CPU times: user 846 ms, sys: 157 ms, total: 1 s
Wall time: 4min 26s


### VADER lexicon

In [38]:
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
id_label_score_vader = id_label_content.map(lambda x: (x[0], x[1], analyzer.polarity_scores(x[2])['compound']))  
id_true_pred_vader = id_label_score_vader.map(lambda x: (x[0], x[1], score_label(x[2])))


In [39]:
%%time
res_vader = pd.DataFrame(id_true_pred_vader.collect())
print('Accuracy, Precision, Recall, F1:', evaluation(res_vader.iloc[:,1],res_vader.iloc[:,2]))

                                                                                

Accuracy, Precision, Recall, F1: (0.7603382226251592, 0.7711465291335221, 0.7603382226251592, 0.7628042003837002)
CPU times: user 834 ms, sys: 128 ms, total: 962 ms
Wall time: 2min 53s


### Hu and Liu Lexicon

In [23]:
positive_words = pd.read_csv('https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt',
                        names=['word'], comment=';', encoding='latin-1')['word'].tolist()
negative_words = pd.read_csv('https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt',
                        names=['word'], comment=';', encoding='latin-1')['word'].tolist()

def get_sentiment_huliu(words):
    # Define variables to keep track of the positive and negative scores
    pos_score = 0
    neg_score = 0
    
    # Loop through each word and check if it's in the positive or negative word list
    for word in words:
        if word in positive_words:
            pos_score += 1
        elif word in negative_words:
            neg_score += 1
    # print(pos_score,neg_score)
    # Calculate the sentiment score for the text
    if pos_score > neg_score:
        return 1
    elif pos_score < neg_score:
        return -1
    else:
        return 0


In [24]:
id_true_pred_huliu = id_label_content.map(lambda x: (x[0], x[1], get_sentiment_huliu(x[2].strip('[]').replace("'",'').split(', '))))


In [25]:
%%time
res_huliu = pd.DataFrame(id_true_pred_huliu.collect())
print('Accuracy, Precision, Recall, F1:', evaluation(res_huliu.iloc[:,1],res_huliu.iloc[:,2]))

                                                                                

Accuracy, Precision, Recall, F1: (0.614468788289387, 0.7086152519485013, 0.614468788289387, 0.6536469527999506)
CPU times: user 905 ms, sys: 113 ms, total: 1.02 s
Wall time: 11min 53s


In [27]:
spark.stop()

### SentiWordNet Lexicon

In [2]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize

def get_sentiment_nltk(words):
    pos_score = 0
    neg_score = 0
    obj_score = 0
    for word in words:
        synsets = wn.synsets(word)
        if synsets:
            swn_synset = swn.senti_synset(synsets[0].name())
            pos_score += swn_synset.pos_score()
            neg_score += swn_synset.neg_score()

    # normalize the scores
    if pos_score > neg_score:
        return 1
    elif pos_score < neg_score:
        return -1
    else:
        return 0


In [4]:
data_path = 'data/convincing_data.csv'
data = pd.read_csv(data_path)
data = data[['id','sentiment','body_cleaned']]
data['body_cleaned'] = data['body_cleaned'].apply(lambda x: x.strip('[]').replace("'",'').split(', '))

In [6]:
%%time
data['sentiment_pred'] = data['body_cleaned'].apply(lambda x: get_sentiment_nltk(x))

CPU times: user 12min 54s, sys: 43.8 s, total: 13min 38s
Wall time: 1h 22min 39s


In [14]:
data['sentiment'] = data['sentiment'].apply(lambda x: score_label(x))

In [15]:
print('Accuracy, Precision, Recall, F1:', evaluation(data.iloc[:,1],data.iloc[:,3]))

Accuracy, Precision, Recall, F1: (0.5750066620889773, 0.5955107442904731, 0.5750066620889773, 0.5820972858699276)
