## Lexicon-based approach

In [1]:
import json
import pandas as pd
from nltk import word_tokenize
import string
import numpy as np
from nltk.corpus import sentiwordnet as swn
import nltk
from tqdm import tqdm
from os import listdir
from sklearn.metrics import classification_report

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Bart/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
metadata_df = pd.read_csv("../podcast_data_no_audio/metadata/metadata.tsv",sep='\t')

## Load data

In [4]:
# get list of all paths to the json-files of english episodes given subset number (bart: 0 , juno: 1, joris: 2)

def get_paths_for_en_episodes(subset_number):
    """
    Function returns list of all paths to the json-files of english 
    episodes given subset number (bart: 0 , juno: 1, joris: 2)
    
    """

    path1 = '../podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    folders = listdir(path1)

    if '.DS_Store' in folders:
        folders.remove('.DS_Store')

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):    
        path2 = path1 + '/' + letter_or_number


        for show_uri in listdir(path2):
            path3 = path2 + '/' + show_uri

            # select english shows only
            show_metadata = metadata_df.loc[metadata_df['show_filename_prefix'] == show_uri]

            if len(show_metadata['language'].unique()) > 0:
                if 'en' in show_metadata['language'].unique()[0]:
                    for episode_uri in listdir(path3):
                        path4 = path3 + '/' + episode_uri

                        if '.json' in path4:
                            podcast_episodes_paths.append(path4)

                
        
    return len(podcast_episodes_paths), podcast_episodes_paths

In [5]:
get_paths_for_en_episodes(0)

100%|██████████| 36/36 [00:18<00:00,  2.08it/s]


(13169,
 ['../podcast_data_no_audio/podcasts-transcripts/0/R/show_0rEzHBbqtuqgP4zEmKzqIH/1vQEPaGpic4rvaVGf3B9P6.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RQpGNWVQtBBwKP0TEPPyM/1RvpxteWceRd94M0jgY6kv.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RQpGNWVQtBBwKP0TEPPyM/34qeqrulTtcZTkECBAdIPy.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0R6EeMT7ViHV1rCqNw9FNg/6Y077pNlcA4bIt311QV198.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/4lgBQcROehM4nmmsOcRcOV.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/7wSNsD65ducgtyMcblEOuP.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/6fSvpbHfRKD8GaimKhlqb8.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/0qSVjYk4YRh9nl61KRjLKj.json',
  '../podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/1hnWMZXjCPLBA0gaAI1AZU.

## NLTK SentiWordNet

In [6]:
def SentiWordNet_sentiment(utterance):
    """
    Returns sentiment score for a podcast utterance with tagged tokens 
    using SentiWordNet
    """
    
    # tokenize utterance
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(utterance)
    
    # POS tag utterance
    tagged_tokens = nltk.pos_tag(tokens)
    
    # assign sentiment score using SentiWordNet, including synonyms
    tokens_sentiment_scores = []
    for token in tagged_tokens:
        tag = ''
        lemma = lemmatizer.lemmatize(token[0])
        if token[1].startswith('N'):
            tag = 'n'
        elif token[1].startswith('J'):
            tag = 'a'
        elif token[1].startswith('V'):
            tag = 'v'
        elif token[1].startswith('R'):
            tag = 'r'
        if tag != '':
            # also get sentiments for synonyms
            synonyms = list(swn.senti_synsets(lemma, tag)) 
            token_sentiment = 0
            if len(synonyms) > 0:
                for synonym in synonyms:
                    token_sentiment += synonym.pos_score() - synonym.neg_score()
                tokens_sentiment_scores.append(token_sentiment/len(synonyms))      
   
    
    if tokens_sentiment_scores != []:
        sentiment_score = sum(tokens_sentiment_scores)/len(tokens_sentiment_scores) 

        if sentiment_score >= 0:
            return 1
        elif sentiment_score < 0:
            return 0

    else:   
        return 1


### Validation

In [7]:
# load binary validation dataset
val_df = pd.read_csv('../data/labeled_datasets/binary/binary_val.csv', sep='\t')
val_df.head(5)

len(val_df)

1292

In [8]:
# calculate metrics
target_labels = val_df['sentiment_score'].values
predicted_labels = []
for sample in tqdm(val_df['text']):
    predicted_sentiment = SentiWordNet_sentiment(sample)
    predicted_labels.append(predicted_sentiment)

predicted_labels = np.array(predicted_labels)  
print(classification_report(target_labels, predicted_labels))

100%|██████████| 1292/1292 [00:16<00:00, 77.36it/s] 

              precision    recall  f1-score   support

         0.0       0.48      0.42      0.45       405
         1.0       0.75      0.80      0.77       887

    accuracy                           0.68      1292
   macro avg       0.62      0.61      0.61      1292
weighted avg       0.67      0.68      0.67      1292






## VADER Sentiment

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
def VADER_sentiment_classifier(utterance):
    analyser = SentimentIntensityAnalyzer()

    score = analyser.polarity_scores(utterance)['compound']
  
    if score >= 0:
        return 1
    elif score < 0:
        return 0


### VADER validation

In [13]:
target_labels = val_df['sentiment_score'].values
predicted_labels = []
for sample in tqdm(val_df['text']):
    predicted_sentiment = VADER_sentiment_classifier(sample)
    predicted_labels.append(predicted_sentiment)

predicted_labels = np.array(predicted_labels)  
print(classification_report(target_labels, predicted_labels))

100%|██████████| 1292/1292 [00:16<00:00, 76.21it/s]

              precision    recall  f1-score   support

         0.0       0.68      0.26      0.38       405
         1.0       0.74      0.94      0.83       887

    accuracy                           0.73      1292
   macro avg       0.71      0.60      0.60      1292
weighted avg       0.72      0.73      0.69      1292




