In [1]:
import numpy as np
import pandas as pd
from math import trunc
import pickle
import copy
import random
import xgboost as xgb
from sklearn import metrics
from sklearn import model_selection
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
%matplotlib inline

### Helper Functions

In [37]:
#load and save the feature/label dataframes and save into a dictionary
def loadFeaturesLabels():
    
    #read all the feature-label csvs
    #read in files and sort into dictionary
    #each element of dict is a movie that contains a list of featurelabel screenings
    files = np.sort(os.listdir("disk/Features & Label Csvs/"))
    prevMovieName = files[1][:files[1].find("_")]
    screeningDict = dict()
    screenings = list()

    for file in files:

        if file == '.ipynb_checkpoints':
            continue 

        movieName = file[:file.find("_")]
        if movieName != prevMovieName:
            screeningDict[prevMovieName] = screenings
            screenings = list()
            prevMovieName = movieName
        else:
            infoDf = pd.read_csv("disk/Features & Label Csvs/" + file)
            infoDf.drop("Unnamed: 0", axis=1, inplace=True)
            infoDf.rename(columns={"0": "VOC"}, inplace=True)
            screenings.append(infoDf)
            prevMovieName = movieName
    
    screeningDict[movieName] = screenings
        
    return screeningDict


#singular features 
def assembleFeatureLabelDf(screeningDict, movieList):
    #assemble entire feature label dataframe 
    infoDf = pd.DataFrame([])
    for movie in movieList:
        #access movie 
        screeningList = screeningDict[movie]
        for screening in screeningList:
            infoDf = pd.concat([infoDf,screening], axis=0, ignore_index=True)
            
    return infoDf


#window size = 5 mins - 10 instances 
def createWindowedFeatures(screeningDict, movieList):
    #create header for the windowed dataframe
    #naming convention Red 1_2 
    #Red - colour Red
    #1 - position within 
    singleFeatureHeader = list(screeningDict['Buddy'][0].drop(["Delta", "VOC"], axis=1).columns)
    windowedFeatureHeader = list()
    for iteration in range(1,11):
        header = [label + "_" + str(iteration) for label in singleFeatureHeader]
        windowedFeatureHeader += header
        
    windowedFeatureHeader += ["Delta"]

    #create windowed features 
    for movie in movieList:

        for screeningIndex in range(0, len(screeningDict[movie])):

            screening = screeningDict[movie][screeningIndex]
            windowedScreening = pd.DataFrame([])
            #create windowed features
            for index in range(10, screening.shape[0]):

                window = screening[index-10:index].drop(["Delta", "VOC"], axis=1) #extract window
                window = window.values.ravel() #flattened window 

                #create windowed dataframe
                windowedScreening = pd.concat([windowedScreening,pd.DataFrame(np.expand_dims(window, axis=1).T)], axis=0) 
            
            #reindex the windowed screening
            windowedScreening.index = range(0, windowedScreening.shape[0])
            #add the delta field 
            delta = pd.DataFrame(np.expand_dims(screening["Delta"][10:].values, axis=1), columns=['Delta'])
            windowedScreening = pd.concat([windowedScreening, delta], axis=1, ignore_index=True)   

            #add header to dataframe
            windowedScreening.columns = windowedFeatureHeader
            #assign screening to dict
            screeningDict[movie][screeningIndex] = windowedScreening

    return screeningDict

def dropDuplicateRows(screeningDict):
    
    #create indices to check repeated rows
    movieList = list(screeningDict.keys())

    for movie in movieList:

        for index in range(0, len(screeningDict[movie])):

            screening = screeningDict[movie][index]
            
            #drop duplicates
            screening = screening.drop_duplicates()
            #reindex rows
            screening.index = range(0, screening.shape[0])

            #construct the list 
            screeningDict[movie][index] = screening
            
    return screeningDict

def calculateDelta(screeningDict):
    
    movieList = list(screeningDict.keys())
    
    for movie in movieList:
        
        for screeningIndex in range(0, len(screeningDict[movie])):
            
            screening = screeningDict[movie][screeningIndex]
    
            vocLabels = screening['VOC']
            deltaList = list()

            delta = None

            for index in range(1, vocLabels.shape[0]): 
                delta = vocLabels.loc[index] - vocLabels.loc[index-1]
                deltaList.append(delta)

            
            #add delta field to dataframe and then assign screening to overall dict
            screening.drop(0, axis=0, inplace=True) #drop the first row 
            screening.index = range(0, screening.shape[0]) #reindex so the join is proper
            screening = pd.concat([pd.DataFrame(deltaList, columns=['Delta']),screening], axis=1)
            screeningDict[movie][screeningIndex] = screening


    return screeningDict

def scaleDeltas(screeningDict):
    
    #scale delta between 0 and 1
    movieList = list(screeningDict.keys())

    for movie in movieList:
        for screeningIndex in range(0, len(screeningDict[movie])):
            #access screening
            screening = screeningDict[movie][screeningIndex]

            #drop na rows
            screening.dropna(inplace=True)

            #scale
            sc = MinMaxScaler()
            delta = sc.fit_transform(screening['Delta'].values.reshape(-1,1))
            screeningDict[movie][screeningIndex]['Delta'] = delta #assign scaled voc 
            
    return screeningDict

### Main Code

In [38]:
#load the screenings
screeningDict = loadFeaturesLabels()

In [39]:
#drop all duplicated rows 
screeningDict = dropDuplicateRows(screeningDict)

In [40]:
#calculate deltas
screeningDict = calculateDelta(screeningDict)

In [41]:
#scale deltas
screeningDict = scaleDeltas(screeningDict)

In [42]:
#create windowed features
screeningDict = createWindowedFeatures(screeningDict, list(screeningDict.keys()))

In [None]:
screeningDict['Buddy'][1]

In [214]:
R2_score = list()
RMSE_score = list()
testingMovies = list()
Random_R2_score = list()
Random_RMSE_score = list()

movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

for movie in movieList: 
    
    #train test split 
    trainingMovies = list(movieRuntimeDf['movie'])
    testMovie = [movie]
    trainingMovies.pop(trainingMovies.index(testMovie[0]))
    
    #assemble the training and test feature label dataframes
    trainingDf = assembleSingularFeatureLabelDf(screeningDict,trainingMovies)
    testingDf = assembleSingularFeatureLabelDf(screeningDict,testMovie)
    
    featureColName = "Delta"
    
    #NORMAL
    #split into labels and features 
    trainingLabels = trainingDf[featureColName].values
    trainingFeatures = trainingDf.drop(featureColName, axis=1).values
    testingLabels = testingDf[featureColName].values
    testingFeatures = testingDf.drop(featureColName, axis=1).values
    #normal 
    regressor = xgb.XGBRegressor(n_estimators=1000, n_jobs=-1)
    regressor.fit(trainingFeatures, trainingLabels.ravel())
    #predict
    predictions = regressor.predict(testingFeatures)
    r2_score = metrics.r2_score(testingLabels, predictions)
    rmse = np.sqrt(metrics.mean_squared_error(testingLabels,predictions))
    #print and save
    print("R2 score:", r2_score)
    print("RMSE: ", rmse)
    R2_score.append(r2_score)
    RMSE_score.append(rmse)
    testingMovies.append(movie)
    
    #RANDOM
    #randomize
    
#     randomTrainingVOC = copy.deepcopy(trainingDf[featureColName])
#     np.random.shuffle(randomTrainingVOC)
#     randomTrainingDf = trainingDf.drop(featureColName, axis=1)
#     randomTrainingDf = pd.concat([randomTrainingDf,randomTrainingVOC], axis=1)
    
#     randomTestingVOC = copy.deepcopy(testingDf[featureColName])
#     np.random.shuffle(randomTestingVOC)
#     randomTestingDf = testingDf.drop(featureColName, axis=1)
#     randomTestingDf = pd.concat([randomTestingDf,randomTestingVOC], axis=1)
    
#     #split into labels and features 
#     trainingLabels = randomTrainingDf[featureColName].values
#     trainingFeatures = randomTrainingDf.drop(featureColName, axis=1).values
#     testingLabels = randomTestingDf[featureColName].values
#     testingFeatures = randomTestingDf.drop(featureColName, axis=1).values
    
#     #normal 
#     regressor = xgb.XGBRegressor(n_estimators=1000, n_jobs=-1)
#     regressor.fit(trainingFeatures, trainingLabels.ravel())
#     #predict
#     predictions = regressor.predict(testingFeatures)
#     r2_score = metrics.r2_score(testingLabels, predictions)
#     rmse = np.sqrt(metrics.mean_squared_error(testingLabels,predictions))
#     #print and save
#     print("Random R2 score:", r2_score)
#     print("Random RMSE: ", rmse)
#     Random_R2_score.append(r2_score)
#     Random_RMSE_score.append(rmse)

    

KeyError: 'Delta'

In [89]:
#write to dataframe
pd.DataFrame({'RMSE':RMSE_score, 
              'R2 Score':R2_score, 
              'Random RMSE': Random_RMSE_score,
              'Random R2 Score':Random_R2_score,
              'Test Movie': testingMovies}).to_csv("XGBoost Movie and AR Features Randomisation Results.csv")