# Subtitle Sentiment Analysis

Uses Stanford CoreNLP, accessed through a locally running CoreNLP server, to perform sentiment analysis on subtitles.

In [2]:
from nltk import tokenize
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import copy
from pycorenlp import StanfordCoreNLP
import logging
import json
from datetime import time

In [3]:
#import srt files
import io

movieRuntimePath = 'Numerical Data//movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimePath, usecols = ['movie', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

# Maps movie title -> in-memory text stream holding the content of its .srt file.
# The content is read eagerly so the underlying file handle can be closed right
# away; the stored stream still supports .readlines() like an open file would.
rawSubtitles = dict()
for movie in movieList:
    subPath = 'Features//Subtitle SRT//' + movie + '.srt'
    # Handle each file on its own: a missing subtitle file must only skip that
    # movie, not abort loading of every movie that follows it in the list.
    try:
        with open(subPath, mode = 'r', encoding='utf-8-sig') as subs:
            rawSubtitles[movie] = io.StringIO(subs.read())
    except FileNotFoundError:
        logging.warning('No subtitle file found for %s (%s); skipping', movie, subPath)

In [4]:
#lines - raw lines of one .srt file, runtime is the effective runtime
def divideSubsIntoSegments(lines, runtime, segmentSeconds=30):
    """Group subtitle text lines into consecutive fixed-length time windows.

    A line is treated as a cue timing line when it is longer than 15 characters
    and has '-->' at character positions 13-15 (e.g.
    '00:00:01,000 --> 00:00:04,000'). Only the HH:MM:SS part of the cue's start
    time is used; the fractional part is ignored. Every non-timing line that
    follows a cue (dialogue, blank lines, the next cue's index line) is stored
    in the window containing that cue's start time.

    Windows are half-open, [k * segmentSeconds, (k + 1) * segmentSeconds), so a
    cue that starts exactly on a boundary belongs to the later window only.
    Windows without any cue are kept as empty lists so that list position
    still corresponds to elapsed time. Cues are expected in chronological
    order; a cue that starts before the current window is skipped.

    Parameters:
        lines: iterable of raw subtitle lines (each is passed through str()).
        runtime: maximum number of segments to return.
        segmentSeconds: window length in seconds (default 30).

    Returns:
        A list of segments; each segment is the list of lines in that window.

    Raises:
        ValueError: if segmentSeconds is not positive, or if a detected timing
            line does not carry digits in its HH:MM:SS positions.
    """
    if segmentSeconds <= 0:
        raise ValueError('segmentSeconds must be positive')

    subtitleIntervals = list()
    currentSegment = list()
    windowStart = 0      #start (in seconds) of the window being filled
    cueInWindow = False  #True while the latest cue belongs to that window

    for rawLine in lines:
        if len(subtitleIntervals) >= runtime:
            #segment budget used up, ignore the rest of the file
            break

        line = str(rawLine)

        if len(line) > 15 and line[13:16] == '-->':
            #timing information detected, parse the start time in seconds
            cueStart = int(line[0:2]) * 3600 + int(line[3:5]) * 60 + int(line[6:8])

            #close every window that ends at or before this cue; windows
            #skipped by a long silence are emitted as empty segments
            while (cueStart >= windowStart + segmentSeconds
                   and len(subtitleIntervals) < runtime):
                subtitleIntervals.append(currentSegment)
                currentSegment = list()
                windowStart = windowStart + segmentSeconds

            cueInWindow = cueStart >= windowStart
            continue

        if cueInWindow:
            currentSegment.append(line)

    #flush the last window: no later cue exists to close it
    if currentSegment and len(subtitleIntervals) < runtime:
        subtitleIntervals.append(currentSegment)

    return subtitleIntervals

In [5]:
#remove any uncessary lines and unecessary characters within dialog lines
def editSubtitleData(subtitleIntervals):
    
    parsedSubtitleIntervals = list()
    htmlFlag = False
    
    for index in range(0,len(subtitleIntervals)):
        subtitleSegment = subtitleIntervals[index]
        modifiedSegment = list()
        if len(subtitleSegment) != 0:
            #parse the segment for any uncessary characters line by line
            for rowIndex in range(0, len(subtitleSegment)):
                line = subtitleSegment[rowIndex]
                parsedLine = str()
                for char in line:
                    #if the character is not a digit then continue to process
                    if not(char.isdigit()) and char != '\n':
                        #remove all html elements e.g. <i>, <b>
                        if char == '<': 
                            htmlFlag = True
                        if char == '>':
                            htmlFlag = False
                        if not(htmlFlag) and char != '>' and char != '\'':
                            parsedLine = parsedLine + char
                if len(parsedLine) != 0:
                    modifiedSegment.append(parsedLine)        
            parsedSubtitleIntervals.append(modifiedSegment)
        else:
            parsedSubtitleIntervals.append(subtitleSegment)
        
    return parsedSubtitleIntervals

In [6]:
# Maps movie title -> list of cleaned subtitle segments for that movie.
movieSubtitles = dict()
# movieList was built from the same dataframe, so zipping pairs every title
# with the runtime on its own row (no per-movie list search, and duplicate
# titles no longer all resolve to the first matching row).
for movie, runtime in zip(movieList, movieRuntimeDf['effective runtime']):
    subs = rawSubtitles.get(movie)
    if subs is None:
        # No subtitle file was loaded for this movie. Checking explicitly keeps
        # a KeyError raised during processing from being silently swallowed.
        continue
    lines = subs.readlines() #contains each line within the document
    segmentList = divideSubsIntoSegments(lines, runtime)
    movieSubtitles[movie] = editSubtitleData(segmentList)

In [14]:
#save untranslated Buddy
# Look the entry up first so a missing key fails before the output file is
# created or truncated; the context manager guarantees the file is flushed
# and closed once the dump finishes.
buddyUntranslated = movieSubtitles['Buddy']
with open('BuddySubtitles.p', 'wb') as buddyOutFile:
    pickle.dump(buddyUntranslated, buddyOutFile)

In [7]:
#buddy was translated locally, load buddy pickle object
buddySubtitlePath = 'Pickle Objects/buddyEngTranslated.p'
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself. The context manager closes the file handle after loading.
with open(buddySubtitlePath, "rb") as buddyInFile:
    buddySubtitle = pickle.load(buddyInFile)
movieSubtitles['Buddy'] = buddySubtitle

# Run NLP Server

Navigate to the directory that contains Stanford CoreNLP, then run the following command in a terminal:

## Run the server in English
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "sentiment" -port 9000 -timeout 30000

### Note
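You may need to change the port number if port 9000 is already in use; if you do, use the same port in the `StanfordCoreNLP('http://localhost:9000')` URL below.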





In [None]:
#For movies with english subtitle scripts
# Maps movie title -> list of segments -> list of per-line sentiment dicts
# with the keys 'sentiment' and 'sentimentValue'.
subtitleSentiment = dict()

nlp = StanfordCoreNLP('http://localhost:9000')
annotateProperties = {'annotators': 'sentiment','outputFormat': 'json','timeout': 30000,}

for movie in movieList:
    # 'Buddy' is excluded here; movies without parsed subtitles are skipped
    # explicitly so that a KeyError from a malformed server response is not
    # mistaken for a missing movie and silently swallowed.
    if movie == 'Buddy' or movie not in movieSubtitles:
        continue

    sentimentMovie = list()
    for segment in movieSubtitles[movie]:
        sentimentSegmentList = list()
        for line in segment:
            res = nlp.annotate(line, properties=annotateProperties)
            # annotate() hands back the raw response text instead of a dict
            # when the reply is not valid JSON (e.g. a server error/timeout).
            sentences = res.get('sentences') if isinstance(res, dict) else None
            if not sentences:
                logging.warning('No sentiment returned for %s line %r; skipping', movie, line)
                continue
            # Only the first sentence of the line is used. A fresh dict is
            # built per line; reusing one dict for the whole segment would
            # make every list entry show the last line's sentiment.
            sentimentSegmentList.append({
                'sentiment': sentences[0]['sentiment'],
                'sentimentValue': sentences[0]['sentimentValue'],
            })
        sentimentMovie.append(sentimentSegmentList)
    subtitleSentiment[movie] = sentimentMovie

In [49]:
pickle.dump(subtitleSentiment, open('subtitleSentiment.p', 'wb'))

In [None]:
# Display the stored sentiment results (one list per segment) for 'Hobbit 2'.
subtitleSentiment['Hobbit 2']

In [None]:
# Display the stored sentiment results for 'Buddy'.
# NOTE(review): the sentiment loop in this notebook skips 'Buddy', so unless a
# cell not visible here fills in that key, this lookup raises KeyError - confirm.
subtitleSentiment['Buddy']