<h2> 2013 and 2015 Randomisation </h2>

In [2]:
import pandas as pd
import numpy as np
import pickle
import random
import copy
from math import trunc
from RegressionRF import RegressionModel
np.seterr(divide='ignore', invalid='ignore')
import openpyxl
from sklearn import preprocessing

In [3]:
#read in the various csvs
#2013 Dataset
vocPath = 'Numerical Data/2013VOCData.csv'
voc2013DfAll = pd.read_csv(vocPath, header = 0, nrows = 74208, low_memory=False)
movieScreeningsPath = 'Numerical Data/screening_times.csv'
movingScreeningsDf = pd.read_csv(movieScreeningsPath, usecols = ['scheduled','movie','filled %'])
movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
#2015 Dataset
starWarsPath = 'Numerical Data/Star Wars-The Force Awakens.csv'
starWarsScreeningDf = pd.read_csv(starWarsPath)
imOffThenPath = 'Numerical Data/I\'m Off Then.csv'
imOffThenScreeningDf = pd.read_csv(imOffThenPath)
helpIShrunkTheTeacherPath = 'Numerical Data/Help, I Shrunk My Teacher.csv'
helpIShrunkTheTeacherScreeningDf = pd.read_csv(helpIShrunkTheTeacherPath)
vocPath = 'Numerical Data/2015VOCData.csv'
voc2015DfAll = pd.read_csv(vocPath)
#remove first column of 2015 voc df as its not used
voc2015DfAll.drop("Unnamed: 0", axis=1, inplace=True) 

#import co2Slice pickle objects
slicePath = 'Pickle Objects/CO2SliceDict.p'
sliceDict = pickle.load(open(slicePath, "rb" )) #contains df of co2 slice indices and matched movie list

In [14]:
def normalisationStandardScaler(vocScreenings, voc):
    normalisedVOCList = list()
    for screening in vocScreenings:
        scaler = preprocessing.StandardScaler()
        scaler.fit(screening)
        normalisedVOCFrame = scaler.transform(screening)
        normalisedVOCFrame = normalisedVOCFrame.flatten()
        normalisedScreening= pd.DataFrame.from_dict({voc:normalisedVOCFrame})
        normalisedVOCList.append(normalisedScreening)
    return normalisedVOCList  

In [5]:
def generateVOCScreenings(vocDf2013,vocDf2015, sliceDf, matchedMovies):
    screeningList = list()
    prevStartIndex = 0
    startIndex = 0
    vocDf = vocDf2013
    for index in range(0, len(matchedMovies)):
        
        startIndex = sliceDf.loc[index]['start']
        endIndex = sliceDf.loc[index]['end']
        if startIndex == 371: #the 2015 df starts at this index
            vocDf = vocDf2015
        screening = pd.DataFrame(vocDf.iloc[startIndex:endIndex+1,0])
        screeningList.append(screening)
        
        prevStartIndex = startIndex
        
    return screeningList

In [6]:
def normalisation(vocScreenings, voc):
    normalisedVOCList = list()
    for screening in vocScreenings:
        normalisedVOCFrame = copy.deepcopy(screening)
        normalisedVOCFrame = normalisedVOCFrame.values/max(screening.values)
        normalisedVOCFrame = normalisedVOCFrame.flatten()
        normalisedScreening= pd.DataFrame.from_dict({voc:normalisedVOCFrame})
        normalisedVOCList.append(normalisedScreening)
    return normalisedVOCList

In [7]:
#some vocs have NaN measurements during the decided screening times. Ignore these screenings
#also remove empty screenings
def removeNaNScreenings(screenings, randomisedScreenings, matchedMovies):
    screeningList = list()
    randomScreeningList = list()
    movieList = list()
    for screeningIndex in range(0, len(screenings)):
        if not(np.isnan(screenings[screeningIndex].values).any()) and len(screenings[screeningIndex].values) != 0:
            screeningList.append(screenings[screeningIndex])
            randomScreeningList.append(randomisedScreenings[screeningIndex])
            movieList.append(matchedMovies[screeningIndex])
    return screeningList,randomScreeningList,movieList

In [8]:
#column header matching issue between 2013 and 2015 
#e.g. in 2015 column is m356.0711 vs in 2013 is it m356.0714
#assumption being made is that they are the same column so round to 2dp and match
def vocRounding(vocDf):
    vocList = list()
    for index in range(0, len(vocDf.columns)):
        if vocDf.columns[index] == 'Time' or vocDf.columns[index] == 'ocs' or vocDf.columns[index] == 'co' or vocDf.columns[index] == 'CO2':
            vocList.append(vocDf.columns[index])    
        else:
            #string slice to get the molar mass
            voc = vocDf.columns[index]
            mass = (trunc(float(voc[1:])*1000))/1000 #TRUNCATE TO 3DP
            vocList.append(mass)
    return vocList

In [9]:
def createRandomisedVOCScreenings(vocRandomised, runtimeList, movieList ,matchedMovies):
    startIndex = 0
    screeningList = list()
    for movie in matchedMovies:
        try:
            runtime = runtimeList[movieList.index(movie)]
        except ValueError:
            continue
        endIndex = startIndex + runtime
        screening = vocRandomised[startIndex:endIndex]
        screeningList.append(screening)
        startIndex = endIndex
    return screeningList

In [10]:
#user macros
vocSave = False
modelSave = False

#results df
resultsHeader = ['RandomState','VOC','RMSE', 'MAE', 'R2']
resultsList = list()

In [15]:
voc2015Col = vocRounding(voc2015DfAll)
voc2013Col = vocRounding(voc2013DfAll)
voc2013Df = copy.deepcopy(voc2013DfAll)
voc2015Df = copy.deepcopy(voc2015DfAll)
voc2013Df.columns = voc2013Col
voc2015Df.columns = voc2015Col

vocUseList = list()

for vocIndex in range(0, len(voc2015Df.columns)):
    voc = voc2015Df.columns[vocIndex]
    if voc == 'Time':
        continue
    else:
        try:
            indexMask = list(voc2013Df.columns).index(voc)
        except ValueError: #the voc isnt within the 2013 VOC dataset
            continue 
            
        
        #create normal voc screening list
        vocDf2013 = voc2013Df.iloc[:,[indexMask]]
        vocDf2015 = voc2015Df.iloc[:,[vocIndex]]   
        
        #generate screenings
        screeningList = generateVOCScreenings(vocDf2013,vocDf2015, sliceDict['sliceDf'], sliceDict['matchedMovies'])
        matchedMovies = copy.deepcopy(sliceDict['matchedMovies'])
        #use logical vectors to remove all NaNs and create randomised voc lists
        voc2013RandomisedList= vocDf2013.values[np.logical_not(np.isnan(vocDf2013.values))]
        voc2015RandomisedList = vocDf2015.values[np.logical_not(np.isnan(vocDf2015.values))]
        vocRandomised = np.append(voc2013RandomisedList,voc2015RandomisedList, axis=0)
        random.shuffle(vocRandomised)
        #generate randomised screenings
        randomisedScreenings = createRandomisedVOCScreenings(vocRandomised, list(movieRuntimeDf['effective runtime']), list(movieRuntimeDf['movie']) ,matchedMovies)
        randomisedScreeningList = list()
        list(map(lambda screening : randomisedScreeningList.append(pd.DataFrame.from_dict({voc:screening})), randomisedScreenings))
        #remove normal screenings with NaN values in the screenings
        screeningList, randomisedScreeningList, matchedMovies = removeNaNScreenings(screeningList, randomisedScreeningList, matchedMovies)
        #normalise both screenings 
        screeningList = normalisationStandardScaler(screeningList, voc)
        randomisedScreeningList = normalisationStandardScaler(randomisedScreeningList, voc)
        #create randomised and unrandomised list
        vocScreeningDict = {'screenings':screeningList, 'matchedMovies':matchedMovies}
        vocRandomisedScreeningDict = {'screenings':randomisedScreeningList, 'matchedMovies':matchedMovies}
        break
#         RMSE,MAE,R2 = RegressionModel(vocScreeningDict, modelSave,False,False, voc)
#         resultsList.append([False, voc, RMSE,MAE,R2])
#         RMSE,MAE,R2 = RegressionModel(vocRandomisedScreeningDict, modelSave,False,False, voc)
#         resultsList.append([True, voc, RMSE,MAE,R2])

#     #create results Df
#     resultsDf = pd.DataFrame(resultsList,columns=resultsHeader)
#     #write df to output file
#     resultsDf.to_excel("results.xlsx") 
#     resultsDf.to_csv('results.csv', sep=',', encoding='utf-8')

UnboundLocalError: local variable 'normalisedVOCFrame' referenced before assignment

In [18]:
voc2015Col = vocRounding(voc2015DfAll)
voc2013Col = vocRounding(voc2013DfAll)
voc2013Df = copy.deepcopy(voc2013DfAll)
voc2015Df = copy.deepcopy(voc2015DfAll)
voc2013Df.columns = voc2013Col
voc2015Df.columns = voc2015Col

vocUseList = list()
for vocIndex in range(0, len(voc2015Df.columns)):
    voc = voc2015Df.columns[vocIndex]
    if voc == 'Time':
        continue
    else:
        try:
            indexMask = list(voc2013Df.columns).index(voc)
        except ValueError: #the voc isnt within the 2013 VOC dataset
            continue 
            
        for i in range(0,3):
            #create normal voc screening list
            vocDf2013 = voc2013Df.iloc[:,[indexMask]]
            vocDf2015 = voc2015Df.iloc[:,[vocIndex]]

            screeningList = generateVOCScreenings(vocDf2013,vocDf2015, sliceDict['sliceDf'], sliceDict['matchedMovies'])

            matchedMovies = copy.deepcopy(sliceDict['matchedMovies'])
            screeningList = normalisation(screeningList, voc)
            #create randomised voc list
            voc2013RandomisedList = copy.deepcopy(list(vocDf2013[voc]))
            voc2015RandomisedList = copy.deepcopy(list(vocDf2015[voc]))
            random.shuffle(voc2013RandomisedList)
            random.shuffle(voc2015RandomisedList)
            vocDf2013Randomised = pd.DataFrame.from_dict({voc:voc2013RandomisedList})
            vocDf2015Randomised = pd.DataFrame.from_dict({voc:voc2015RandomisedList})
            randomisedScreeningList = generateVOCScreenings(vocDf2013Randomised, vocDf2015Randomised, sliceDict['sliceDf'], sliceDict['matchedMovies'])
            screeningList, randomisedScreeningList, matchedMovies = removeNaNScreenings(screeningList, randomisedScreeningList, matchedMovies)
            entireVocList = np.append(vocDf2013.values, vocDf2015.values)
            randomisedScreeningList = replaceNaNInRandomisedScreenings(randomisedScreeningList,entireVocList)
            randomisedScreeningList = normalisation(randomisedScreeningList, voc)     
            print(randomisedScreeningList[0])
            vocScreeningDict = {'screenings':screeningList, 'matchedMovies':matchedMovies}
            vocRandomisedScreeningDict = {'screenings':randomisedScreeningList, 'matchedMovies':matchedMovies}
            vocUseList.append(voc)


    #         RMSE,MAE,R2 = RegressionModel(vocScreeningDict, modelSave,False,False, voc)
    #         resultsList.append([False, voc, RMSE,MAE,R2])
    #         RMSE,MAE,R2 = RegressionModel(vocRandomisedScreeningDict, modelSave,False,False, voc)
    #         resultsList.append([True, voc, RMSE,MAE,R2])
        break

    # #create results Df
    # resultsDf = pd.DataFrame(resultsList,columns=resultsHeader)
    # #write df to output file
    # resultsDf.to_excel("results.xlsx") 
    # resultsDf.to_csv('results.csv', sep=',', encoding='utf-8')

[          CO2
0    0.473731
1    0.922431
2    0.410175
3    0.214400
4    0.442543
5    0.335381
6    0.419510
7    0.680923
8    0.208591
9    0.234973
10   0.261513
11   0.281921
12   0.425355
13   0.641515
14   0.252878
15   0.882048
16   0.382092
17   0.708657
18   0.700978
19   0.232942
20   0.207222
21   0.342104
22   0.219113
23   0.475610
24   0.226137
25   0.222045
26   0.537328
27   0.279005
28   0.228354
29   0.224408
..        ...
272  0.217417
273  0.304031
274  0.217222
275  0.254003
276  0.301808
277  0.197587
278  0.289815
279  0.232462
280  0.231983
281  0.264171
282  0.526998
283  0.269297
284  0.250159
285  0.628720
286  0.220881
287  0.468950
288  0.223466
289  0.473399
290  0.348043
291  0.234559
292  0.201477
293  0.845173
294  0.245038
295  0.271684
296  0.198952
297  0.480654
298  0.199906
299  0.215562
300  0.235205
301  0.259194

[302 rows x 1 columns],           CO2
0    0.229294
1    0.580255
2    0.198717
3    0.334953
4    0.763464
5    0.199993
6    0.2

[          CO2
0    0.217727
1    0.311664
2    0.225230
3    0.405615
4    0.200667
5    0.290806
6    0.388347
7    0.304856
8    0.219493
9    0.239212
10   0.411700
11   0.918866
12   0.317125
13   0.287378
14   0.409918
15   0.519461
16   0.213034
17   0.203146
18   0.332147
19   0.200697
20   0.568288
21   0.216663
22   0.512408
23   0.224443
24   0.420732
25   0.391540
26   0.212112
27   0.412735
28   0.217404
29   0.217034
..        ...
272  0.246834
273  0.255297
274  0.195650
275  0.214291
276  0.191496
277  0.282798
278  0.248751
279  0.717671
280  0.203626
281  0.254688
282  0.363297
283  0.291872
284  0.366080
285  0.233765
286  0.465974
287  0.376920
288  0.456472
289  0.217350
290  0.288421
291  0.875393
292  0.496079
293  0.200829
294  0.209810
295  0.635581
296  0.782792
297  0.241129
298  0.253848
299  0.270893
300  0.226719
301  0.655338

[302 rows x 1 columns],           CO2
0    0.535920
1    0.192510
2    0.253183
3    0.196972
4    0.497921
5    0.216152
6    0.4

[          CO2
0    0.213425
1    0.307356
2    0.194485
3    0.330590
4    0.271064
5    0.237275
6    0.202375
7    0.789607
8    0.358397
9    0.244877
10   0.439363
11   0.195992
12   0.188998
13   0.195511
14   0.367533
15   0.377725
16   0.575969
17   0.372007
18   0.214523
19   0.462319
20   0.213299
21   0.584095
22   0.205256
23   0.195415
24   0.388777
25   0.225796
26   0.190144
27   0.417804
28   0.202899
29   0.225481
..        ...
272  0.279561
273  0.200045
274  0.220588
275  0.435672
276  0.176748
277  0.283580
278  0.261893
279  0.341180
280  0.260341
281  0.193123
282  0.222213
283  0.206446
284  0.214715
285  0.581058
286  0.328773
287  0.418331
288  0.273275
289  0.234233
290  0.302108
291  0.213287
292  0.567371
293  0.265491
294  0.380664
295  0.341675
296  0.220741
297  0.219155
298  0.297786
299  0.176774
300  0.207867
301  0.685304

[302 rows x 1 columns],           CO2
0    0.212524
1    0.230935
2    0.387024
3    0.328807
4    0.291220
5    0.211382
6    0.2

<h2> 2013 Randomisation Only</h2>

In [9]:
def generateVOCScreenings(vocDf2013, sliceDf, matchedMovies):
    screeningList = list()
    prevStartIndex = 0
    startIndex = 0
    vocDf = vocDf2013
    for index in range(0, len(matchedMovies)):
        
        startIndex = sliceDf.loc[index]['start']
        endIndex = sliceDf.loc[index]['end']
        
        if startIndex == 371: #the 2015 df starts at this index
            return screeningList #return as there is no 2015 data being considered
        print(vocDf)
        screening = pd.DataFrame(vocDf.iloc[startIndex:endIndex+1])
        screeningList.append(screening)
        prevStartIndex = startIndex
        
#normalise the screenings by the max value
def normalisation(vocScreenings, voc):
    normalisedVOCList = list()
    for screening in vocScreenings:
        normalisedVOCFrame = copy.deepcopy(screening)
        normalisedVOCFrame = normalisedVOCFrame.values/max(screening.values)
        normalisedVOCFrame = normalisedVOCFrame.flatten()
        normalisedScreening= pd.DataFrame.from_dict({voc:normalisedVOCFrame})
        normalisedVOCList.append(normalisedScreening)
    return normalisedVOCList

#some vocs dont have the recorded screenings so remove then
#then remove those same screenings from the randomisedScreeningList
def removeNaNScreenings(screenings, randomisedScreenings, matchedMovies):
    screeningList = list()
    randomScreeningList = list()
    movieList = list()
    for screeningIndex in range(0, len(screenings)):
        if not(np.isnan(screenings[screeningIndex].values).any()):
            screeningList.append(screenings[screeningIndex])
            randomScreeningList.append(randomisedScreenings[screeningIndex])
            movieList.append(matchedMovies[screeningIndex])
    return screeningList,randomScreeningList,movieList

#the randomisedScreenings have NaN instances within the 
def replaceNaNInRandomisedScreenings(randomisedScreeningList,entireVocList):
    for screeningIndex in range(0, len(randomisedScreeningList)):
        if (np.isnan(randomisedScreeningList[screeningIndex].values)).any():
            for vocIndex in range(0, len(randomisedScreeningList[screeningIndex].values)):
                voc = randomisedScreeningList[screeningIndex].values[vocIndex]
                if np.isnan(voc[0]):
                    randomIndex = random.randint(0,len(entireVocList)-1)
                    while np.isnan(entireVocList[randomIndex]):
                            #continue generating random numbers if NaN was returned 
                            randomIndex = random.randint(0,len(entireVocList)-1)
                    randomisedScreeningList[screeningIndex].values[vocIndex] = entireVocList[randomIndex]
    return randomisedScreeningList

In [10]:
#read in the various csvs
#2013 Dataset
vocPath = 'Numerical Data/2013VOCData.csv'
voc2013Df = pd.read_csv(vocPath, header = 0, nrows = 74208, low_memory=False)
#note that the 2013 data has 430 measured vocs
movieScreeningsPath = 'Numerical Data/screening_times.csv'
movingScreeningsDf = pd.read_csv(movieScreeningsPath, usecols = ['scheduled','movie','filled %'])
movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
#import co2Slice pickle objects
slicePath = 'Pickle Objects/CO2SliceDict.p'
sliceDict = pickle.load(open(slicePath, "rb" )) #contains df of co2 slice indices and matched movie list

#user macros
vocSave = False
modelSave = False
randomisationIterations = 100

#results df
resultsHeader = ['RandomState','VOC','RMSE', 'MAE', 'R2']
resultsList = list()

In [11]:
for vocIndex in range(0,54): #allows for running specific vocs through the randomisation process
    voc = voc2013Df.columns[vocIndex]
    if voc == 'Time':
        continue
    else:
        print(voc)
        resultsList = list()

        for i in range(0,randomisationIterations):
            #create normal voc screening list
            vocDf = voc2013Df.loc[:,voc]
            #generate voc screenings
            screeningList = generateVOCScreenings(vocDf, sliceDict['sliceDf'], sliceDict['matchedMovies'])
            matchedMovies = copy.deepcopy(sliceDict['matchedMovies'])
            #normalise the screenings
            screeningList = normalisation(screeningList, voc)
            #create randomised voc list
            voc2013RandomisedList = copy.deepcopy(list(voc2013Df[voc]))
            random.shuffle(voc2013RandomisedList)
            vocDf2013Randomised = pd.DataFrame.from_dict({voc:voc2013RandomisedList})
            randomisedScreeningList = generateVOCScreenings(vocDf2013Randomised, sliceDict['sliceDf'], sliceDict['matchedMovies'])
            #remove screenings from the normal voc screening list with any NaNs within them
            screeningList, randomisedScreeningList, matchedMovies = removeNaNScreenings(screeningList, randomisedScreeningList, matchedMovies)
            #replace any of the remaining NaN's within the randomised screening list (sampling without replacement)
            randomisedScreeningList = replaceNaNInRandomisedScreenings(randomisedScreeningList,vocDf.values)
            #normalise the randomised screening list
            randomisedScreeningList = normalisation(randomisedScreeningList, voc)     

            #create randomised and unrandomised voc dictionary 
            vocScreeningDict = {'screenings':screeningList, 'matchedMovies':matchedMovies}
            vocRandomisedScreeningDict = {'screenings':randomisedScreeningList, 'matchedMovies':matchedMovies}

            #run RF regression on randomised and unrandomised vocs 
            RMSE,MAE,R2 = RegressionModel(vocScreeningDict, modelSave,False,False, voc)
            resultsList.append([False, voc, RMSE,MAE,R2])
            RMSE,MAE,R2 = RegressionModel(vocRandomisedScreeningDict, modelSave,False,False, voc)
            resultsList.append([True, voc, RMSE,MAE,R2])

    #create results Df
    resultsDf = pd.DataFrame(resultsList,columns=resultsHeader)
    #write df to output file
    resultsPath = str(voc) + '.csv'
    resultsDf.to_csv(resultsPath, sep=',', encoding='utf-8')

CO2
0        610.208
1        609.304
2        608.475
3        607.667
4        606.660
5        605.900
6        605.213
7        604.763
8        604.280
9        603.878
10       603.477
11       603.094
12       602.834
13       602.454
14       601.932
15       601.703
16       601.434
17       601.378
18       600.968
19       600.696
20       600.447
21       600.320
22       600.358
23       600.469
24       600.299
25       600.101
26       600.176
27       600.318
28       600.371
29       600.341
          ...   
74178    439.557
74179    439.536
74180    439.516
74181    439.536
74182    439.539
74183    439.475
74184    439.465
74185    439.498
74186    439.540
74187    439.599
74188    439.599
74189    439.623
74190    439.726
74191    439.804
74192    439.794
74193    439.821
74194    439.807
74195    439.773
74196    439.751
74197    439.620
74198    439.381
74199    439.116
74200    438.877
74201    438.728
74202    438.638
74203    438.603
74204    438.632
74205    4

IndexingError: Too many indexers

In [16]:
vocDf.iloc[2:20]

2     608.475
3     607.667
4     606.660
5     605.900
6     605.213
7     604.763
8     604.280
9     603.878
10    603.477
11    603.094
12    602.834
13    602.454
14    601.932
15    601.703
16    601.434
17    601.378
18    600.968
19    600.696
Name: CO2, dtype: float64