### SGD Implementation Example

Test the new data then train on that new data

In [9]:
import numpy as np
from sklearn import linear_model
from sklearn import metrics

In [10]:
n_samples, n_features = 10, 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)

In [30]:
clf = linear_model.SGDRegressor()
rmseList = list()
for instanceIndex in range(0, len(X)):
    instance = X[instanceIndex].reshape(1, -1)
    label = y[instanceIndex].reshape(-1, 1)
    if instanceIndex == 0:
        #train the model
        clf.fit(instance,label)
    else:
        #test the model
        predicted = clf.predict(instance)
        #get rmse
        rmse = np.sqrt(metrics.mean_squared_error(label, predicted))
        rmseList.append(rmse)

  y = column_or_1d(y, warn=True)


In [31]:
rmseList

[0.22059874602133628,
 0.7705616994897968,
 2.138493820098493,
 1.834517235547889,
 1.211007783645852,
 1.1151303563431092,
 0.1128506255106105,
 0.16181025119858136,
 0.3347571060085341]

### SGD Implementation

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import DataPipeline
from sklearn import linear_model
import pickle
from math import trunc
import copy

In [2]:
def createRandomisedVOCScreenings(vocRandomised, runtimeList, movieList ,matchedMovies):
    startIndex = 0
    screeningList = list()
    for movie in matchedMovies:
        try:
            runtime = runtimeList[movieList.index(movie)]
        except ValueError:
            continue
        endIndex = startIndex + runtime
        screening = vocRandomised[startIndex:endIndex]
        screeningList.append(screening)
        startIndex = endIndex
    return screeningList

In [3]:
def generateVOCScreenings(vocDf2013,vocDf2015, sliceDf, matchedMovies):
    screeningList = list()
    prevStartIndex = 0
    startIndex = 0
    vocDf = vocDf2013
    for index in range(0, len(matchedMovies)):
        
        startIndex = sliceDf.loc[index]['start']
        endIndex = sliceDf.loc[index]['end']
        if startIndex == 371: #the 2015 df starts at this index
            vocDf = vocDf2015
        screening = pd.DataFrame(vocDf.iloc[startIndex:endIndex+1,0])
        screeningList.append(screening)
        
        prevStartIndex = startIndex
        
    return screeningList

In [4]:
def normalisation(vocScreenings, voc):
    normalisedVOCList = list()
    for screening in vocScreenings:
        normalisedVOCFrame = copy.deepcopy(screening)
        normalisedVOCFrame = normalisedVOCFrame.values/max(screening.values)
        normalisedVOCFrame = normalisedVOCFrame.flatten()
        normalisedScreening= pd.DataFrame.from_dict({voc:normalisedVOCFrame})
        normalisedVOCList.append(normalisedScreening)
    return normalisedVOCList

In [5]:
#some vocs have NaN measurements during the decided screening times. Ignore these screenings
#also remove empty screenings
def removeNaNScreenings(screenings, randomisedScreenings, matchedMovies):
    screeningList = list()
    randomScreeningList = list()
    movieList = list()
    for screeningIndex in range(0, len(screenings)):
        if not(np.isnan(screenings[screeningIndex].values).any()) and len(screenings[screeningIndex].values) != 0:
            screeningList.append(screenings[screeningIndex])
            randomScreeningList.append(randomisedScreenings[screeningIndex])
            movieList.append(matchedMovies[screeningIndex])
    return screeningList,randomScreeningList,movieList

In [6]:
#column header matching issue between 2013 and 2015 
#e.g. in 2015 column is m356.0711 vs in 2013 is it m356.0714
#assumption being made is that they are the same column so round to 2dp and match
def vocRounding(vocDf):
    vocList = list()
    for index in range(0, len(vocDf.columns)):
        if vocDf.columns[index] == 'Time' or vocDf.columns[index] == 'ocs' or vocDf.columns[index] == 'co' or vocDf.columns[index] == 'CO2':
            vocList.append(vocDf.columns[index])    
        else:
            #string slice to get the molar mass
            voc = vocDf.columns[index]
            mass = (trunc(float(voc[1:])*1000))/1000 #TRUNCATE TO 3DP
            vocList.append(mass)
    return vocList

In [7]:
#frames were collected at 1/3fps so for a 30 second period there are 10 frames. This function just groups the 
#dominant frame colour or shade components to within their respective intervals
def grouping(visualList):
    movieVisuals = list()
    for index in range(0, int(len(visualList)/10)):
        segment = visualList[index*10:index*10+10]
        movieVisuals.append(segment)
    return movieVisuals

In [8]:
def processVisuals(movieVisualData, runtime, isColour):
    visualDataIntervals = grouping(movieVisualData)
    #the visual data also has the credits accounted for so remove them
    visualDataIntervals = visualDataIntervals[:runtime]
    #create a dataframe 
    if isColour: 
        #create a dominant colour dataframe
        framesPerInterval = 10
        header = list();
        for i in range(1,framesPerInterval+1):
            header = header + ['R'+str(i), 'G' + str(i),  'B'+str(i)]
    else: #shade object to be parsed
        framesPerInterval = 10
        header = ['S' + str(x) for x in range(1,framesPerInterval+1)]
    
    visualDf = pd.DataFrame(columns=header)
    #assemble the dataframe
    for segment in visualDataIntervals:
        index = visualDataIntervals.index(segment)
        colourRow = list()
        for colour in segment:
            if isColour:
                colourRow = colourRow + [colour[0], colour[1], colour[2]]
            else:
                colourRow = colourRow + [colour[0]]
        #assign that colour row to the dataframe
        visualDf.loc[index] = colourRow
            
    return visualDf

In [9]:
def processAudio(runtime, audio):
    audioFeatures = list(audio.keys())

    audioDf = pd.DataFrame(columns=[])        
    for key in audioFeatures:
        audio[key] = audio[key][:runtime]

        #assemble df 
        #create header
        if key != 'tempo':
            header = [key + str(x) for x in range(1, len(audio[key][0])+1)]
        else:
            header = ['tempo']

        audioFeatureDf = pd.DataFrame(columns=header)
        for index in range(0, len(audio[key])):
            feature = audio[key][index]
            audioFeatureDf.loc[index] = feature

        #concatenate featureDf to audioDf
        audioDf = pd.concat([audioDf,audioFeatureDf], axis=1)
    
    return audioDf

In [10]:
def processSubtitles(subs, effectiveRuntime):
    
    header = ['sentiment value']
    subSentimentDf = pd.DataFrame(columns=header)
    for sentimentIndex in range(0, len(subs)):
        sentiment = subs[sentimentIndex]
        if len(sentiment) != 0:
            if sentiment['sentimentValue'] == np.NaN:
                print('YES')
            else:         
                subSentimentDf.loc[sentimentIndex] = [sentiment['sentimentValue']]
        else:
            subSentimentDf.loc[sentimentIndex] = [-1] #indicates no dialog occurred during the scene
        
        #enforce no dialog until the credit scene if there is in fact no dialog
        if len(subSentimentDf) != effectiveRuntime:
            #no dialog at the end thus need to fill the rest with -1
            for index in range(0, effectiveRuntime-len(subSentimentDf)+1):
                 subSentimentDf.loc[index] = [-1]
    
    return subSentimentDf

In [11]:
def processASL(asl, effectiveRuntime):
    
    header = ['average shot length']
    aslDf = pd.DataFrame(columns=header)
    for index in range(0, effectiveRuntime): 
        aslValue = asl[index]
        aslDf.loc[index] = aslValue
    
    return aslDf

In [12]:
def removeMovies(vocDict):

    #remove all screenings of im off then and help i shrunk the teacher as at the current time do not have the movies
    screenings = list()
    matchedMovies = list()
    for movieIndex in range(0, len(vocDict['matchedMovies'])):
        movie = vocDict['matchedMovies'][movieIndex]
        if movie != "Help, I Shrunk My Teacher" and movie != "I'm Off Then":
            #add good screenings to a modified screening list
            matchedMovies.append(movie)
            screenings.append(vocDict['screenings'][movieIndex])
    #replace
    vocDict = dict()
    vocDict['matchedMovies'] = matchedMovies
    vocDict['screenings'] = screenings
    
    return vocDict

In [13]:
#user macros
featureDf = pd.DataFrame([]) #film feature dataframe
labelDf = pd.DataFrame([]) #voc dataframe
deltaVOCs = False
windowedVOCs = False
lengthOfWindow = 10

In [14]:
#read in the various csvs
#2013 Dataset
vocPath = 'Numerical Data/2013VOCData.csv'
voc2013DfAll = pd.read_csv(vocPath, header = 0, nrows = 74208, low_memory=False)
movieScreeningsPath = 'Numerical Data/screening_times.csv'
movingScreeningsDf = pd.read_csv(movieScreeningsPath, usecols = ['scheduled','movie','filled %'])
movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
#2015 Dataset
starWarsPath = 'Numerical Data/Star Wars-The Force Awakens.csv'
starWarsScreeningDf = pd.read_csv(starWarsPath)
imOffThenPath = 'Numerical Data/I\'m Off Then.csv'
imOffThenScreeningDf = pd.read_csv(imOffThenPath)
helpIShrunkTheTeacherPath = 'Numerical Data/Help, I Shrunk My Teacher.csv'
helpIShrunkTheTeacherScreeningDf = pd.read_csv(helpIShrunkTheTeacherPath)
vocPath = 'Numerical Data/2015VOCData.csv'
voc2015DfAll = pd.read_csv(vocPath)
#remove first column of 2015 voc df as its not used
voc2015DfAll.drop("Unnamed: 0", axis=1, inplace=True)

#import co2Slice pickle objects
slicePath = 'Pickle Objects/CO2SliceDict.p'
sliceDict = pickle.load(open(slicePath, "rb" )) #contains df of co2 slice indices and matched movie list

In [15]:
#overall feature and labels df
featureDf = pd.DataFrame([]) #film feature dataframe
labelDf = pd.DataFrame([]) #voc dataframe

#import movie runtimes
movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

movieFeatureDict = dict() #dict contains the movie film features with the keys being the movies
#import pickle objects for movies and then assemble the dataframes  
for movie in movieList:
    try:
        #load pickle feauture objects
        featurePath = 'Pickle Objects/Audio Feature Pickle Objects/' + movie + '.p'
        audio = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Colour Pickle Objects/' + movie + '.p'
        colour = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Shade Pickle Objects/' + movie + '.p'
        shade = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/Subtitle Sentiment Pickle Objects/' + movie + '.p'
        sentiment = pickle.load(open(featurePath, "rb" )) 
        featurePath = 'Pickle Objects/ASL Pickle Objects/' + movie + '.p'
        asl = pickle.load(open(featurePath, "rb" )) 

        runtime = int(movieRuntimeDf.loc[movieList.index(movie)]['effective runtime'])
        colourDf = processVisuals(colour, runtime, True)
        shadeDf = processVisuals(shade, runtime, False)
        audioDf = processAudio(runtime, audio)
        sentimentDf = processSubtitles(sentiment,runtime)
        aslDf = processASL(asl, runtime)

        inputDf = pd.concat([colourDf,shadeDf,audioDf,sentimentDf,aslDf], axis = 1)
        movieFeatureDict[movie] = inputDf

    except FileNotFoundError:
        pass

In [16]:
voc2015Col = vocRounding(voc2015DfAll)
voc2013Col = vocRounding(voc2013DfAll)
voc2013Df = copy.deepcopy(voc2013DfAll)
voc2015Df = copy.deepcopy(voc2015DfAll)
voc2013Df.columns = voc2013Col
voc2015Df.columns = voc2015Col

vocUseList = list()

for vocIndex in range(0, len(voc2015Df.columns)):
    voc = voc2015Df.columns[vocIndex]
    if voc == 'Time':
        continue
    else:
        try:
            indexMask = list(voc2013Df.columns).index(voc)
        except ValueError: #the voc isnt within the 2013 VOC dataset
            continue 
            
        print(voc)
        #create normal voc screening list
        vocDf2013 = voc2013Df.iloc[:,[indexMask]]
        vocDf2015 = voc2015Df.iloc[:,[vocIndex]]   
        
        #generate screenings
        screeningList = generateVOCScreenings(vocDf2013,vocDf2015, sliceDict['sliceDf'], sliceDict['matchedMovies'])
        matchedMovies = copy.deepcopy(sliceDict['matchedMovies'])
        #remove normal screenings with NaN values in the screenings
        screeningList, randomisedScreeningList, matchedMovies = removeNaNScreenings(screeningList, screeningList, matchedMovies)
        #normalise both screenings 
        screeningList = normalisation(screeningList, voc)
        #create randomised and unrandomised list
        vocDict = {'screenings':screeningList, 'matchedMovies':matchedMovies}
        #remove all screenings of im off then and help i shrunk the teacher as at the current time do not have the movies
        vocDict = removeMovies(vocDict)
        
        #create overall label and feature df
        for i in range(0, len(vocDict['screenings'])): 

            matchedMovie = vocDict['matchedMovies'][i]

            if not(deltaVOCs):
                featureDf = pd.concat([featureDf, movieFeatureDict[matchedMovie]])

            if not(windowedVOCs):
                screening = vocDict['screenings'][i]
                labelDf = pd.concat([labelDf, screening['CO2']])

        #relabel column title 
        if not(windowedVOCs):
            labelDf.columns = ['VOC']

        
        break

CO2


In [17]:
print(featureDf.shape[0])
print(labelDf.shape)

21085
(21085, 1)


In [None]:
clf = linear_model.SGDRegressor()
rmseList = list()
for instanceIndex in range(0, labelDf.shape[0]):
    instance = (featureDf.iloc[instanceIndex].values).reshape(1, -1)
    label = (labelDf.iloc[instanceIndex].values).reshape(-1, 1)
    if instanceIndex == 0:
        #train the model
        clf.fit(instance,label)
    else:
        #test the model
        predicted = clf.predict(instance)
        print(predicted)
        #get rmse
        rmse = np.sqrt(metrics.mean_squared_error(label, predicted))
        rmseList.append(rmse)

In [94]:
resultsDf = pd.DataFrame(rmseList,columns=['RMSE'])
resultsPath = str(voc) + '_SGD.csv'
resultsDf.to_csv(resultsPath, sep=',', encoding='utf-8')