In [1]:
import pandas as pd
import numpy as np
import pickle
import os

## Feature processing functions

In [2]:
#import pickle objects for movies and then assemble the dataframes
def _loadMoviePickle(featureFolder, movie):
    """Load the pickled feature object for `movie` from one feature folder."""
    #NOTE(review): pickle.load can execute arbitrary code; only point this at
    #trusted files.
    featurePath = 'data/mounted/Pickle Objects/' + featureFolder + '/' + movie + '.p'
    #context manager so the file handle is closed even if unpickling fails
    with open(featurePath, "rb") as featureFile:
        return pickle.load(featureFile)


def generateFeatures(movieList, movieRuntimeDf):
    """Build one combined feature DataFrame per movie.

    For every movie the audio, colour, shade, subtitle-sentiment and ASL
    pickle objects are loaded, converted to DataFrames and concatenated
    column-wise. Rows containing any missing value are dropped.

    Parameters
    ----------
    movieList : list of str
        Movie names; the position of a name in this list is used as the row
        label into `movieRuntimeDf`.
    movieRuntimeDf : pandas.DataFrame
        Must contain an 'effective runtime' column.

    Returns
    -------
    dict
        Maps each movie name to its feature DataFrame.
    """
    #local result dict: the function no longer depends on (or mutates) a
    #module-level variable that has to be defined by another cell first
    movieFeatureDict = dict()

    for movie in movieList:
        runtime = movieRuntimeDf['effective runtime'].loc[movieList.index(movie)]

        #load pickle feature objects
        audio = _loadMoviePickle('Audio Feature Pickle Objects', movie)
        colour = _loadMoviePickle('Colour Pickle Objects', movie)
        shade = _loadMoviePickle('Shade Pickle Objects', movie)
        sentiment = _loadMoviePickle('Subtitle Sentiment Pickle Objects', movie)
        asl = _loadMoviePickle('ASL Pickle Objects', movie)

        audioDf = processAudio(runtime, audio)
        redDf, greenDf, blueDf = processVisuals(colour, movieRuntimeDf, movieList, movie)
        shadeDf = processShade(shade, movieRuntimeDf, movieList, movie)
        sentimentDf = processSubtitles(sentiment)

        #combine features into a singular dataframe
        #30 columns per visual feature: processVisuals/processShade group the
        #values in chunks of 30
        redDf = pd.DataFrame(redDf, columns=['Red ' + str(num) for num in range(1, 31)])
        greenDf = pd.DataFrame(greenDf, columns=['Green ' + str(num) for num in range(1, 31)])
        blueDf = pd.DataFrame(blueDf, columns=['Blue ' + str(num) for num in range(1, 31)])
        shadeDf = pd.DataFrame(shadeDf, columns=['Shade ' + str(num) for num in range(1, 31)])
        aslDf = pd.DataFrame(asl, columns=['ASL'])
        sentimentDf = pd.DataFrame(sentimentDf, columns=['Sentiment'])

        #column-wise join on the row index; dropna removes rows where any
        #feature is missing
        featureDf = pd.concat(
            [redDf, greenDf, blueDf, shadeDf, aslDf, sentimentDf, audioDf], axis=1
        ).dropna()
        movieFeatureDict[movie] = featureDf

    return movieFeatureDict

In [3]:
def processAudio(runtime, audio):
    """Assemble the audio features into one DataFrame.

    Parameters
    ----------
    runtime : int
        Number of leading entries to keep from every feature sequence.
    audio : dict
        Maps a feature name to a sequence of per-row values. Every feature
        except 'tempo' is expected to hold equally long vectors; 'tempo' is
        stored in a single column. The dict is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per kept entry. Columns are '<name>1'..'<name>k' for vector
        features and 'tempo' for the tempo feature, in dict order.
    """
    featureFrames = []
    for key, values in audio.items():
        #slice into a new object instead of overwriting the caller's dict entry
        trimmed = values[:runtime]

        #create header
        if key != 'tempo':
            #guard against an empty slice, where there is no first row to measure
            width = len(trimmed[0]) if len(trimmed) > 0 else 0
            header = [key + str(x) for x in range(1, width + 1)]
        else:
            header = ['tempo']

        #build the frame in one call; appending row by row with .loc is quadratic
        featureFrames.append(pd.DataFrame(list(trimmed), columns=header))

    if not featureFrames:
        return pd.DataFrame(columns=[])

    #concatenate the per-feature frames column-wise
    return pd.concat(featureFrames, axis=1)

def processSubtitles(sentiment):
    """Turn the subtitle sentiment entries into a list of integer codes.

    Code meaning:
        0 = no dialog
        1 = negative
        2 = neutral
        3 = positive

    An empty entry is mapped to 0; otherwise the entry's 'sentimentValue'
    is converted to int.
    """
    def _toCode(entry):
        #an empty entry means there was no dialog
        if len(entry) == 0:
            return 0
        return int(entry['sentimentValue'])

    return [_toCode(sentiment[position]) for position in range(len(sentiment))]

def processVisuals(colour,movieRuntimeDf,movieList,movie):
    """Split the colour values into red/green/blue and group them in chunks.

    Each element of `colour` is indexed at positions 0, 1 and 2 for the red,
    green and blue value. Every channel is cut into consecutive chunks of 30
    values; the number of chunks is the movie's 'effective runtime' taken
    from `movieRuntimeDf` at the row labelled with the movie's position in
    `movieList`.

    Returns a tuple (red chunks, green chunks, blue chunks), each a list of
    lists.
    """
    framesPerInterval = 30

    #split into RGB layers
    channels = [[pixel[layer] for pixel in colour] for layer in range(3)]

    intervalCount = movieRuntimeDf.loc[movieList.index(movie)]['effective runtime']

    #group colour values as instances
    grouped = []
    for values in channels:
        chunks = []
        for interval in range(0, intervalCount):
            start = framesPerInterval * interval
            chunks.append(values[start:start + framesPerInterval])
        grouped.append(chunks)

    return grouped[0], grouped[1], grouped[2]

def processShade(shade,movieRuntimeDf,movieList,movie):
    """Group the shade values into consecutive chunks of 30.

    The first item of every element in `shade` is taken as the shade value.
    The number of chunks is the movie's 'effective runtime' taken from
    `movieRuntimeDf` at the row labelled with the movie's position in
    `movieList`. Returns a list of lists.
    """
    framesPerInterval = 30

    #extract shade values
    shadeValues = [entry[0] for entry in shade]

    intervalCount = movieRuntimeDf.loc[movieList.index(movie)]['effective runtime']

    #group shade values as instances
    shadeDf = []
    for interval in range(0, intervalCount):
        start = framesPerInterval * interval
        shadeDf.append(shadeValues[start:start + framesPerInterval])

    return shadeDf

In [4]:
#window the features
def createWindows(df, window_size=10):
    """Flatten every run of `window_size` consecutive rows into one row.

    Output row i holds the values of input rows i .. i+window_size-1
    (row-major), i.e. the window that ends just before input row
    i+window_size. The final input row is therefore never part of a window.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to window; slicing is positional.
    window_size : int, default 10
        Number of consecutive rows per window.

    Returns
    -------
    pandas.DataFrame
        max(len(df) - window_size, 0) rows with window_size * df.shape[1]
        integer-labelled columns. Empty when `df` has too few rows.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    #explicit positional slicing; flatten each window to get one row
    windowed_rows = [
        df.iloc[end - window_size:end].values.ravel()
        for end in range(window_size, df.shape[0])
    ]

    #convert list to dataframe
    return pd.DataFrame(windowed_rows)

### Load movie runtimes and build the features

In [5]:
#read the movie runtime table; only the three columns listed are loaded
movieRuntimesPath = 'data/mounted/Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(
    movieRuntimesPath,
    usecols=['movie', 'runtime (mins)', 'effective runtime'],
)
#movie names in file order
movieList = movieRuntimeDf['movie'].tolist()
#movie name -> feature dataframe, filled in by generateFeatures
movieFeatureDict = {}

In [6]:
#load the pickled feature objects of every movie in movieList and assemble one
#feature dataframe per movie; the result is a dict keyed by movie name
movieFeatureDict = generateFeatures(movieList, movieRuntimeDf)

### Single (unwindowed) features joined to each VOC screening

In [7]:
#directory that holds the processed VOC screening files
base_path = 'data/mounted/Processed VOC Screenings/'
#the returned listing is not used; the call only fails if the directory is missing
os.listdir(base_path)

#output directory prefix for the single (unwindowed) feature + label csvs
save_url = "data//mounted//Single Features & Label Csvs//"

In [16]:
#go through every voc screening file, attach the features of its movie and
#write the combined table out as a csv
voc_screenings = os.listdir(r'data/mounted/Processed VOC Screenings')
for screening_name in voc_screenings:
    #the movie name is everything before the last three '-' separated parts
    name_parts = screening_name.split("-")
    movie = "-".join(name_parts[:-3])

    features = movieFeatureDict[movie]

    #read in voc screenings, leaving out the saved index column and the time column
    screening = pd.read_csv("data//mounted//Processed VOC Screenings//" + screening_name)
    screening = screening.drop('Unnamed: 0', axis=1)
    screening = screening.drop('time', axis=1)

    #connect features to vocs (column-wise, aligned on the row index)
    instance_df = pd.concat([features, screening], axis=1)

    #write out the csv
    output_path = save_url + screening_name + ".csv"
    instance_df.to_csv(output_path)

    #progress indicator
    print(screening_name)

Star Wars-The Force Awakens-26-12-2015 14:00
Star Wars-The Force Awakens-27-12-2015 14:00
Buddy-25-12-2013 23:10
Walter Mitty-02-01-2014 17:15
Star Wars-The Force Awakens-22-12-2015 14:00
Walter Mitty-03-01-2014 20:15
Star Wars-The Force Awakens-25-12-2015 22:30
I'm Off Then-26-12-2015 20:00
Buddy-28-12-2013 22:25
Star Wars-The Force Awakens-30-12-2015 22:30
I'm Off Then-30-12-2015 17:30
Star Wars-The Force Awakens-02-01-2016 14:00
I'm Off Then-30-12-2015 20:00
I'm Off Then-09-01-2016 17:30
Walter Mitty-03-01-2014 17:15
Walter Mitty-05-01-2014 17:15
Buddy-26-12-2013 22:25
I'm Off Then-27-12-2015 17:30
Star Wars-The Force Awakens-20-12-2015 14:00
I'm Off Then-27-12-2015 20:00
Hobbit 2-22-12-2013 20:10
The Hunger Games-Catching Fire-31-12-2013 13:15
Machete Kills-23-12-2013 22:30
Walter Mitty-06-01-2014 20:15
Star Wars-The Force Awakens-08-01-2016 14:00
Help, I Shrunk My Teacher-07-01-2016 11:30
I'm Off Then-07-01-2016 17:30
I'm Off Then-03-01-2016 20:00
I'm Off Then-02-01-2016 17:30
I'm

### Windowed Features and Labels

In [18]:
#directory that holds the processed VOC screening files
base_path = 'data/mounted/Processed VOC Screenings/'
#the returned listing is not used; the call only fails if the directory is missing
os.listdir(base_path)

#output directory prefix for the windowed feature + label csvs
save_url = "data//mounted//Windowed Features & Label Csvs//"

In [None]:
#import all voc's, window the movie features, connect each window to its voc
#row and write the result out as a csv
voc_screenings = os.listdir(r'data/mounted/Processed VOC Screenings')
#number of leading voc rows that have no complete feature window in front of
#them; must equal the window size used by createWindows (10)
label_offset = 10
for screening_name in voc_screenings:
    #what movie is being processed: everything before the last three '-' parts
    movie = "-".join(screening_name.split("-")[:-3])

    features = movieFeatureDict[movie]

    #read in voc screenings, without the saved index column and the time column
    screening = pd.read_csv("data//mounted//Processed VOC Screenings//" + screening_name)
    screening = screening.drop(columns=['Unnamed: 0', 'time'])

    #window features
    windowed_features_df = createWindows(features)

    #connect features to vocs
    #concat with axis=1 aligns on the ROW index and ignore_index only renumbers
    #the columns, so the voc rows must be re-labelled from 0; otherwise window i
    #is paired with voc row i instead of voc row i + label_offset and NaN-padded
    #rows appear at both ends
    labels = screening.iloc[label_offset:].reset_index(drop=True)
    instance_df = pd.concat([windowed_features_df, labels], ignore_index=True, axis=1)

    header = ['Feature' + str(no) for no in range(0, windowed_features_df.shape[1])] + list(screening.columns)
    instance_df.columns = header

    #write out the csv
    instance_df.to_csv(save_url + screening_name + ".csv")