# Subtitle Sentiment Analysis

Using CoreNLP and a StanfordNLP server to perform sentiment analysis

In [7]:
from nltk import tokenize
import pandas as pd
import numpy as np
import pickle
import copy
from pycorenlp import StanfordCoreNLP
import logging
import json
from datetime import time

from os import listdir, environ
import pickle

from subtitle_parsing import denoiseSubtitleSpecificProcessing,subtitlePreprocess
import string

from google.cloud import translate_v2 as translate

In [8]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file.
# NOTE(review): presumably this is what translate.Client() uses to authenticate
# later in the notebook; the relative path assumes the JSON file sits in the
# working directory — confirm.
environ["GOOGLE_APPLICATION_CREDENTIALS"]="Smell Of Fear-f8e3e23fa7d3.json"

### Helper Functions

In [3]:
#lines - raw lines of one .srt file, runtime - maximum number of segments to return
def divideSubsIntoSegments(lines, runtime, segmentSeconds=30):
    """Group the lines of an SRT file into consecutive fixed-length time windows.

    A line counts as a cue timing line when it is longer than 15 characters
    and has ``-->`` at columns 13-15 (``HH:MM:SS,mmm --> HH:MM:SS,mmm``).
    Only the whole-second part of the cue's *start* time is used. Every line
    that follows, up to the next timing line, is assigned to the window that
    contains that start time. This includes the blank separator and the
    numeric index of the next cue, which are kept as-is.

    Windows are half-open, ``[k * segmentSeconds, (k + 1) * segmentSeconds)``,
    so a cue that starts exactly on a boundary lands in the later window only.
    A window without any cue produces an empty list, which keeps list
    positions aligned with time. Lines that precede the first timing line,
    and lines whose cue starts before the window currently being filled
    (out-of-order timestamps), are dropped.

    Args:
        lines: Lines of the subtitle file; each one is converted with ``str``.
        runtime: Maximum number of segments to return.
        segmentSeconds: Window length in seconds (defaults to 30).

    Returns:
        A list of segments ordered by time; each segment is the list of lines
        that fall into that window. The list stops at the window holding the
        last cue, or after ``runtime`` segments, whichever comes first.

    Raises:
        ValueError: If ``segmentSeconds`` is not positive, or a timing line
            does not carry digits in the expected columns.
    """
    if segmentSeconds <= 0:
        raise ValueError("segmentSeconds must be positive")

    subtitleIntervals = []
    if runtime <= 0:
        return subtitleIntervals

    currentSegment = []   # lines collected for the window being filled
    windowIndex = 0       # index of the window being filled
    lineWindow = None     # window of the most recent cue; None before the first cue

    for rawLine in lines:
        line = str(rawLine)

        if len(line) > 15 and line[13:16] == '-->':
            #timing information detected - parse the start time to whole seconds
            startSeconds = (int(line[0:2]) * 3600
                            + int(line[3:5]) * 60
                            + int(line[6:8]))
            lineWindow = startSeconds // segmentSeconds

            # Close every window that ends before this cue starts. Windows
            # skipped over by a gap in the dialogue are emitted empty.
            while windowIndex < lineWindow:
                subtitleIntervals.append(currentSegment)
                if len(subtitleIntervals) >= runtime:
                    return subtitleIntervals
                currentSegment = []
                windowIndex += 1
            continue

        if lineWindow == windowIndex:
            currentSegment.append(line)

    # The file ended inside a window: keep what was collected for it.
    if currentSegment:
        subtitleIntervals.append(currentSegment)

    return subtitleIntervals

In [19]:
#subtitle translation
def translation_processing(translatedSubs):
    """Decode HTML character references in translated subtitle text.

    Every string in ``translatedSubs`` has its HTML entities (for example
    ``&#39;``, ``&quot;``, ``&amp;``) replaced by the characters they stand
    for. Items that are not strings are passed through unchanged.

    Args:
        translatedSubs: Iterable of translated dialog strings, one per segment.

    Returns:
        A new list with the decoded dialogs, in the same order.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import html

    # html.unescape resolves each reference where it occurs, so an unrelated
    # ';' earlier in the text cannot prevent later entities from being decoded.
    return [
        html.unescape(dialog) if isinstance(dialog, str) else dialog
        for dialog in translatedSubs
    ]

### Parsing

In [20]:
#import srt files
from io import StringIO

movieRuntimePath = 'data/mounted/Numerical Data/movieRuntimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimePath, usecols=['movie', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

# Maps movie title -> in-memory text stream holding the content of its .srt
# file. A stream (rather than a list of lines) is stored so that consumers can
# keep calling .readlines() on it, while the file on disk is closed right away.
rawSubtitles = dict()
for movie in movieList:
    subPath = 'data/mounted/Features/Subtitles SRT/' + movie + '.srt'
    # The try is scoped to a single file so that one missing subtitle does not
    # stop the remaining movies from being loaded.
    try:
        # utf-8-sig drops a leading byte-order mark if the file has one
        with open(subPath, mode='r', encoding='utf-8-sig') as subs:
            rawSubtitles[movie] = StringIO(subs.read())
    except FileNotFoundError:
        logging.warning('Subtitle file not found, skipping: %s', subPath)

In [None]:
#preprocess subtitles
# Titles whose subtitle files are in German and must be translated to English.
germanMovies = ('Buddy', 'Suck Me Shakespeer', 'Help, I Shrunk My Teacher')
translate_client = None  # created on first use and then shared by all German movies

# Maps movie title -> list of passages (one string per time segment).
movieSubtitles = dict()

# Titles and runtimes are paired row by row; looking the title up with
# movieList.index() would return the first match for a repeated title.
for movie, effectiveRuntime in zip(movieList, movieRuntimeDf['effective runtime']):

    if movie not in rawSubtitles:
        # No subtitle stream was loaded for this title; it gets no entry in
        # movieSubtitles instead of aborting the whole run with a KeyError.
        logging.warning('No subtitles loaded for %s, skipping', movie)
        continue

    #read in subtitles
    lines = rawSubtitles[movie].readlines() #contains each line within the document
    #divide into 30 seconds segment
    segmentList = divideSubsIntoSegments(lines, effectiveRuntime)

    #denoise and normalise the subtitles
    # NOTE(review): the translate call below is given one element of this
    # result at a time, so each element is presumably a string — confirm
    # against subtitle_parsing.
    parsedSegments = denoiseSubtitleSpecificProcessing(segmentList)

    #translate subtitles if required to
    if movie in germanMovies:
        #movie has german subtitles and thus must be translated segment by segment
        if translate_client is None:
            translate_client = translate.Client()
        translatedSegments = list()
        for segment in parsedSegments:
            # Only the target language is given; no source language is passed.
            translation = translate_client.translate(segment, target_language='en')
            translatedSegments.append(translation['translatedText'])

        #process the translation and remove any weird chars
        parsedSegments = translation_processing(translatedSegments)

    #process subtitles
    processedSegments = subtitlePreprocess(parsedSegments)

    #convert lemmized segments to sentences again
    passages = [" ".join(segment).strip() for segment in processedSegments]

    movieSubtitles[movie] = passages

In [41]:
# Show each title followed by its list of processed passages, leaving two
# blank lines between movies.
for movie in movieList:
    print(movie)
    print(movieSubtitles[movie], end='\n\n\n')

Hobbit 2
['', '', '', 'die thirst come come thank kindly', 'oh watch sorry darling thank master stadle', 'mind join introduce name gandalf gandalf grey', 'gandalf grey know well fine chance bring thorin oakenshield bree receive word father see wander wilds near dunland go look find sign thorin long time since anything rumor hear', 'thrain still live sure father come see go miss say urge march upon erebor rally seven armies dwarves destroy dragon take back lonely mountain would say take back homeland', 'chance meet gandalf lonely mountain trouble thorin dragon sit long enough sooner later darker mind turn towards erebor run unsavory character whilst travel greenway mistake vagabond', 'imagine regret one carry message black speech promise payment head someone want dead thorin wait longer heir throne durin unite armies dwarves', 'together might power retake erebor summon meet seven dwarf families demand stand oath seven armies swear oath one wield king jewel arkenstone thing unite case fo

# Run NLP Server

Navigate to the directory that contains Stanford CoreNLP, then run one of the following commands in a terminal:

## Run the server in English
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "sentiment" -port 9000 -timeout 30000

## Run the server in German for Buddy, Suck Me Shakespeer and Help, I Shrunk My Teacher
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "sentiment" -port 9000 -timeout 30000

### Note
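You may need to change the port number (both in the command above and in the `StanfordCoreNLP('http://localhost:9000')` URL used below) if port 9000 is already in use.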





In [None]:
#For movies with english subtitle scripts
# Maps movie title -> list (one entry per segment) of lists of
# {'sentiment', 'sentimentValue'} dicts, one dict per sentence in the response.
subtitleSentiment = dict()

nlp = StanfordCoreNLP('http://localhost:9000')
annotateProperties = {'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 30000}

for movie in movieList:
    if movie in ('Buddy', 'Suck Me Shakespeer', 'Help, I Shrunk My Teacher'):
        continue
    if movie not in movieSubtitles:
        logging.warning('No processed subtitles for %s, skipping', movie)
        continue

    sentimentMovie = list()
    for segment in movieSubtitles[movie]:
        sentimentSegmentList = list()

        # Each segment is one passage string and is annotated as a whole;
        # looping over the string would send one character per request.
        # Blank passages (windows without dialogue) are not sent at all.
        if segment.strip():
            res = nlp.annotate(segment, properties=annotateProperties)
            try:
                # A fresh dict per sentence, so entries do not alias each other.
                sentimentSegmentList = [
                    {'sentiment': s['sentiment'], 'sentimentValue': s['sentimentValue']}
                    for s in res['sentences']
                ]
            except (KeyError, TypeError):
                # NOTE(review): presumably the server returned an error or
                # timeout message rather than JSON — confirm with pycorenlp.
                logging.warning('Unexpected CoreNLP response for %s: %r', movie, res)
                sentimentSegmentList = list()

        # An entry is appended for every segment, including empty ones, so
        # positions in sentimentMovie match positions in movieSubtitles[movie].
        sentimentMovie.append(sentimentSegmentList)

    subtitleSentiment[movie] = sentimentMovie

In [49]:
# Persist the sentiment results. The context manager closes the file as soon
# as the dump finishes, so the pickle is fully written to disk.
with open('subtitleSentiment.p', 'wb') as sentimentFile:
    pickle.dump(subtitleSentiment, sentimentFile)

In [None]:
# Display the stored per-segment sentiment results for one movie.
subtitleSentiment['Hobbit 2']

In [None]:
# NOTE(review): 'Buddy' is excluded by the loop that fills subtitleSentiment,
# so this lookup raises KeyError unless another run added the entry.
subtitleSentiment['Buddy']