In [74]:
import pandas as pd
import numpy as np

import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics


import copy

In [75]:
#frames were collected at 1/3fps so for a 30 second period there are 10 frames. This function just groups the 
#dominant frame colour or shade components to within their respective intervals
def grouping(visualList):
    movieVisuals = list()
    for index in range(0, int(len(visualList)/10)):
        segment = visualList[index*10:index*10+10]
        movieVisuals.append(segment)
    return movieVisuals

In [76]:
def processVisuals(movieVisualData, runtime, isColour):
    visualDataIntervals = grouping(movieVisualData)
    #the visual data also has the credits accounted for so remove them
    visualDataIntervals = visualDataIntervals[:runtime]
    #create a dataframe 
    if isColour: 
        #create a dominant colour dataframe
        framesPerInterval = 10
        header = list();
        for i in range(1,framesPerInterval+1):
            header = header + ['R'+str(i), 'G' + str(i),  'B'+str(i)]
    else: #shade object to be parsed
        framesPerInterval = 10
        header = ['S' + str(x) for x in range(1,framesPerInterval+1)]
    
    visualDf = pd.DataFrame(columns=header)
    #assemble the dataframe
    for segment in visualDataIntervals:
        index = visualDataIntervals.index(segment)
        colourRow = list()
        for colour in segment:
            if isColour:
                colourRow = colourRow + [colour[0], colour[1], colour[2]]
            else:
                colourRow = colourRow + [colour[0]]
        #assign that colour row to the dataframe
        visualDf.loc[index] = colourRow
    
        
            
    return visualDf

In [77]:
def processAudio(runtime, audio):
    audioFeatures = list(audio.keys())

    audioDf = pd.DataFrame(columns=[])        
    for key in audioFeatures:
        audio[key] = audio[key][:runtime]

        #assemble df 
        #create header
        if key != 'tempo':
            header = [key + str(x) for x in range(1, len(audio[key][0])+1)]
        else:
            header = ['tempo']

        audioFeatureDf = pd.DataFrame(columns=header)
        for index in range(0, len(audio[key])):
            feature = audio[key][index]
            audioFeatureDf.loc[index] = np.float16(feature)

        #concatenate featureDf to audioDf
        audioDf = pd.concat([audioDf,audioFeatureDf], axis=1)
    
    return audioDf

In [84]:
def processSubtitles(subs, effectiveRuntime):
    
    header = ['sentiment value']
    subSentimentDf = pd.DataFrame(columns=header)
    for sentimentIndex in range(0, len(subs)):
        sentiment = subs[sentimentIndex]
        if len(sentiment) != 0:
            if sentiment['sentimentValue'] == np.NaN:
                print('YES')
            else:         
                subSentimentDf.loc[sentimentIndex] = [sentiment['sentimentValue']]
        else:
            subSentimentDf.loc[sentimentIndex] = [-1] #indicates no dialog occurred during the scene
        
        if len(subSentimentDf) != effectiveRuntime:
            #no dialog at the end thus need to fill the rest with -1
            for index in range(0, effectiveRuntime-len(subSentimentDf)+1):
                 subSentimentDf.loc[index] = [-1]
    
    return subSentimentDf

In [None]:
#import vocs
vocDict = pickle.load(open("Pickle Objects/normalisedScreeningsDict.p", "rb" ))
vocDictWindow = pickle.load(open("Pickle Objects/normalisedWindowedScreeningsDict.p", "rb" ))

#import movie runtimes
movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

movieFeatureDict = dict() #dict contains the movie film features with the keys being the movies
#import pickle objects for movies and then assemble the dataframes  
for movie in movieList:
    try:
        #load pickle feauture objects
        featurePath = 'Pickle Objects/Audio Feature Pickle Objects/' + movie + '.p'
        audio = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Colour Pickle Objects/' + movie + '.p'
        colour = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Shade Pickle Objects/' + movie + '.p'
        shade = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Subtitle Sentiment Pickle Objects/' + movie + '.p'
        sentiment = pickle.load(open(featurePath, "rb" )) 

        runtime = movieRuntimeDf.loc[movieList.index(movie)]['effective runtime']
        colourDf = processVisuals(colour, runtime, True)
        shadeDf = processVisuals(shade, runtime, False)
        audioDf = processAudio(runtime, audio)
        sentimentDf = processSubtitles(sentiment,runtime)

        inputDf = pd.concat([colourDf,shadeDf,audioDf,sentimentDf], axis = 1)
        movieFeatureDict[movie] = inputDf
    except FileNotFoundError:
        print(movie)


In [None]:
#overall feature and labels df
featureDf = pd.DataFrame([]) #film feature dataframe
labelDf = pd.DataFrame([]) #voc dataframe

#user macros
windowedVOCs = False
lengthOfWindow = 10

In [None]:
#remove all screenings of im off then and help i shrunk the teacher as at the current time do not have the movies
screenings = list()
matchedMovies = list()
for movieIndex in range(0, len(vocDict['matchedMovies'])):
    movie = vocDict['matchedMovies'][movieIndex]
    if movie != "Help, I Shrunk My Teacher" and movie != "I'm Off Then":
        #add good screenings to a modified screening list
        matchedMovies.append(movie)
        screenings.append(vocDict['screenings'][movieIndex])
#replace
vocDict = dict()
vocDict['matchedMovies'] = matchedMovies
vocDict['screenings'] = screenings

In [None]:
movieFeatureDict['Star Wars-The Force Awakens']['sentiment value']

In [55]:
#create label and feature df
for i in range(0, len(vocDict['screenings'])): 
    matchedMovie = vocDict['matchedMovies'][i]
    featureDf = pd.concat([featureDf, movieFeatureDict[matchedMovie]])
    if not(windowedVOCs):
        screening = vocDict['screenings'][i]
        labelDf = pd.concat([labelDf, screening['CO2']])
    else:
        screening = vocDictWindow['screenings'][i]
        #using windowedVOCsed VOCs
        header = ['VOC' + str(x) for x in range(1,lengthOfWindow+1)]
        vocWindowDf = pd.DataFrame(columns = header)
        for index in range(0, len(screening)):
            vocWindow = screening[index]['CO2'].values
            vocWindowDf.loc[index] = vocWindow 
        labelDf = pd.concat([labelDf, vocWindowDf])

#relabel column title 
if not(windowedVOCs):
    labelDf.columns = ['VOC']

In [56]:
#create training and test datasets
featuresTrain, featuresTest, labelsTrain, labelsTest = train_test_split(featureDf, labelDf, test_size= 0.20) #80 20 train test split

In [57]:
featuresTrain['sentiment value']

165      2
171      2
260      2
133      2
51      -1
207      3
109      2
251    NaN
14       2
216    NaN
33       1
135      3
156      3
207      1
166      1
142     -1
33       1
53       1
117     -1
227      2
47       1
165      3
52       2
129     -1
49       3
34       2
159      2
32       2
63       1
12       2
      ... 
203      2
22       2
110      2
217      2
131     -1
65       1
208     -1
102      2
100      2
71       2
265      3
8        1
162      2
162     -1
239      3
176      2
34       2
121      2
28       3
24      -1
190      1
32       2
124      1
252    NaN
230     -1
236      2
60       3
182      3
205      2
21       2
Name: sentiment value, Length: 16766, dtype: object

In [None]:
regressor = RandomForestRegressor(n_estimators=10000, random_state=0)
regressor.fit(featuresTrain, labelsTrain)
labelsPred = regressor.predict(featuresTest)

In [20]:
print('Mean Absolute Error:', metrics.mean_absolute_error(labels_test, labels_pred))
print('Mean Squared Error:', metrics.mean_squared_error(labels_test, labels_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(labels_test, labels_pred)))

Mean Absolute Error: 301.590556257022
Mean Squared Error: 138759.64619962405
Root Mean Squared Error: 372.5045586293194
