In [1]:
import numpy as np
import pandas as pd
import math
from scipy.stats import expon

## Load Dataset

In [2]:
'''
COLUMN NAME - INDEX
gross           8
movie_title    11
language       19
country        20
'''
def loadDataset(filename):
    """Read a comma-separated file and return its rows as a NumPy array.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file handed to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        One row per CSV record, columns in file order (header row excluded).
    """
    return np.array(pd.read_csv(filename, sep=','))

In [50]:
# DEBUG: load the raw CSV into a NumPy array and display it.
datasetTest = loadDataset('movie_metadata.csv')
datasetTest

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

## Cleaning Dataset

In [4]:
# TODO: Remove the default row limit (pass stop=None to process every row).
def clearingDatasetQuery1(dataset, start=0, stop=100):
    """Select the query-1 columns and drop rows whose gross is NaN.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of movie records; column 8 is read as the gross and
        column 11 as the movie title.
    start, stop : int or None, optional
        Row window ``dataset[start:stop]`` to consider. The defaults
        reproduce the previously hard-coded ``0:100`` window.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[gross, movie_title]`` containing only the rows
        whose gross is not NaN. The result is always 2-D, even when every row
        in the window is filtered out.
    """
    subset = dataset[start:stop, [8, 11]]
    # Boolean mask instead of rebuilding from a list: keeps the (rows, 2)
    # shape when nothing survives, so downstream `[:, 0]` indexing still works.
    hasGross = np.array([not math.isnan(gross) for gross in subset[:, 0]], dtype=bool)
    return subset[hasGross]

In [86]:
# TODO: Remove the default row limit (pass start=0, stop=None to process every row).
def clearingDatasetQuery2(dataset, start=300, stop=400):
    """Select the query-2 columns and drop rows whose gross is NaN.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of movie records; column 8 is read as the gross, column 11
        as the movie title and column 19 as the language.
    start, stop : int or None, optional
        Row window ``dataset[start:stop]`` to consider. The defaults
        reproduce the previously hard-coded ``300:400`` window.

    Returns
    -------
    numpy.ndarray
        Array with columns ``[gross, movie_title, language]`` containing only
        the rows whose gross is not NaN. The result is always 2-D, even when
        every row in the window is filtered out.
    """
    subset = dataset[start:stop, [8, 11, 19]]
    # Boolean mask instead of rebuilding from a list: keeps the (rows, 3)
    # shape when nothing survives, so downstream column indexing still works.
    hasGross = np.array([not math.isnan(gross) for gross in subset[:, 0]], dtype=bool)
    return subset[hasGross]

In [6]:
def possibleValues(dataset, query):
    """Clean the dataset for a query and keep one record per movie title.

    Parameters
    ----------
    dataset : numpy.ndarray
        Raw movie records, passed unchanged to the query-specific cleaner.
    query : int
        Query identifier (1, 2 or 3) selecting the cleaning function.

    Returns
    -------
    numpy.ndarray or None
        The cleaned records with later rows that repeat an already seen title
        (column 1) removed, in their original order. ``None`` is returned,
        after printing a message, when ``query`` is not recognised.
    """
    if query == 1:
        cleanDataset = clearingDatasetQuery1(dataset)
    elif query == 2:
        cleanDataset = clearingDatasetQuery2(dataset)
    elif query == 3:
        # NOTE(review): clearingDatasetQuery3 is not defined in the notebook
        # cells visible here; confirm it exists before using query 3.
        cleanDataset = clearingDatasetQuery3(dataset)
    else:
        print('Consulta inválida')
        return None

    # Mark the first occurrence of each title. A set lookup plus one boolean
    # mask replaces the per-row list scan and repeated np.delete calls.
    seenTitles = set()
    keepRow = np.zeros(len(cleanDataset), dtype=bool)
    for position, record in enumerate(cleanDataset):
        title = record[1]
        if title not in seenTitles:
            seenTitles.add(title)
            keepRow[position] = True

    return cleanDataset[keepRow]

In [7]:
# DEBUG: cleaned, title-deduplicated records used by query 1.
datasetQuery1 = possibleValues(datasetTest, 1)

In [87]:
# DEBUG: cleaned, title-deduplicated records used by query 2.
datasetQuery2 = possibleValues(datasetTest, 2)

In [23]:
def splitDataset(dataset):
    """Group records by the value stored in their third field.

    Parameters
    ----------
    dataset : iterable of indexable records
        Each record is grouped under ``record[2]`` (the language column in
        the query-2 layout).

    Returns
    -------
    dict
        Maps each distinct ``record[2]`` value to the list of records that
        carry it, in the order they were encountered.
    """
    movieDict = dict()
    for record in dataset:
        # setdefault creates the bucket on first sight of a key.
        movieDict.setdefault(record[2], []).append(record)
    return movieDict

## Queries

In [9]:
def query1(dataset):
    """Return the first record holding the largest value in column 0.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose column 0 holds the gross of each movie.

    Returns
    -------
    numpy.ndarray
        The first row whose column-0 value equals the column maximum.
    """
    grossColumn = dataset[:, 0]
    isTop = grossColumn == grossColumn.max()
    return dataset[isTop][0]

In [10]:
def query2(dataset):
    """Find, for every language, the movie with the highest gross.

    Parameters
    ----------
    dataset : iterable of indexable records
        Each record is read as ``(gross, movie_title, language, ...)``.

    Returns
    -------
    dict
        Maps each language to a ``(gross, movie_title)`` tuple for its
        highest-grossing record. On ties the earliest record wins. A language
        with no gross greater than ``-inf`` maps to ``(-inf, None)``.
    """
    bestPerLanguage = dict()

    for row in dataset:
        language = row[2]
        if language not in bestPerLanguage:
            # Sentinel entry, replaced by the first comparable gross.
            bestPerLanguage[language] = (float('-inf'), None)
        if row[0] > bestPerLanguage[language][0]:
            bestPerLanguage[language] = (row[0], row[1])

    return bestPerLanguage

In [11]:
# DEBUG: record with the highest gross among the query-1 records.
query1(datasetQuery1)

array([760505847.0, 'Avatar\xa0'], dtype=object)

In [88]:
# DEBUG: highest-grossing (gross, title) pair per language among the query-2 records.
query2(datasetQuery2)

{'English': (380838870.0, 'Finding Nemo\xa0'),
 'Mandarin': (9213.0, 'The Flowers of War\xa0'),
 'Aboriginal': (72515360.0, 'The Interpreter\xa0')}

## Score Functions

In [13]:
def scoreFunctionQuery1(dataset, output):
    """Score every record against a candidate output title.

    Parameters
    ----------
    dataset : iterable of indexable records
        Each record is read as ``(gross, movie_title, ...)``.
    output : object
        Candidate title compared with ``record[1]``.

    Returns
    -------
    list
        One entry per record: the record's gross when its title equals
        ``output``, otherwise ``0``.
    """
    return [row[0] if row[1] == output else 0 for row in dataset]

In [62]:
def scoreFunctionQuery2(dataset, output):
    """Score every record against a candidate output title.

    Parameters
    ----------
    dataset : iterable of indexable records
        Each record is read as ``(gross, movie_title, ...)``.
    output : object
        Candidate title compared with ``record[1]``.

    Returns
    -------
    list
        One entry per record: the record's gross when its title equals
        ``output``, otherwise ``0``.
    """
    scores = []
    for row in dataset:
        matchesOutput = row[1] == output
        scores.append(row[0] if matchesOutput else 0)
    return scores

## Sensitivities

In [15]:
def sensitivityQuery1(dataset):
    """Estimate the sensitivity of the query-1 score function.

    For every record in ``dataset`` the function compares the maximum score
    of that record's title on the full dataset with minimum scores computed
    on progressively shortened "neighbor" datasets, and keeps the largest
    absolute difference observed.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose rows are read as ``(gross, movie_title)``.

    Returns
    -------
    float
        The largest ``abs(maxScoreQ1 - minScoreQ1)`` found, or ``-inf`` when
        no difference was ever computed.
    """
    # Starts as the whole dataset; one leading row is dropped per outer iteration.
    datasetWithoutCurrentOutput = dataset[:,:]
    
    maxScore = float('-inf')
    # NOTE(review): indexOutput is incremented below but never read.
    indexOutput = 0
    
    for output in dataset:
        lenDataset = len(datasetWithoutCurrentOutput)
        # Keep row indices 1..lenDataset-1, i.e. drop the first remaining row.
        datasetWithoutCurrentOutput = datasetWithoutCurrentOutput[np.delete(np.array(range(lenDataset)), 0),:]
        neighborDataset = datasetWithoutCurrentOutput[:,:]
        
        # Scores of the current title are computed on the full dataset.
        scoreQ1 = scoreFunctionQuery1(dataset, output[1])
        maxScoreQ1 = np.max(scoreQ1)
        minScoreQ1 = float('inf')
    
        for index in range(lenDataset - 1):
            lenNeighborDataset = len(neighborDataset)
            # Shrink the neighbor dataset by dropping its first row.
            neighborDataset = neighborDataset[np.delete(np.array(range(lenNeighborDataset)), 0),:]
            
            # NOTE(review): minScoreNeighbor is overwritten on every pass, so
            # only the value for the last row of neighborDataset survives.
            # When neighborDataset is empty the loop body never runs and
            # minScoreNeighbor keeps its previous value (or is unbound if it
            # was never assigned) — confirm this is intended.
            for newOutput in neighborDataset:
                newScoreQ1 = scoreFunctionQuery1(neighborDataset, newOutput[1])
                minScoreNeighbor = np.min(newScoreQ1)

            # The difference is only re-evaluated when a smaller minimum is found.
            if minScoreQ1 > minScoreNeighbor:
                minScoreQ1 = minScoreNeighbor 
                ScoreDifference = abs(maxScoreQ1 - minScoreQ1)

                if maxScore < ScoreDifference:
                    maxScore = ScoreDifference
        
        indexOutput += 1
    
    return maxScore

In [80]:
# TODO: This function should compute the sensitivity of each output against the dataset of its own language.
# Outputs from other languages are not part of that dataset, so the dataset must be split
# by language and the sensitivity computed from that split.

# TODO: Revisit the case of a language with a single movie.
# The neighbor dataset is empty and therefore the sensitivity value is -inf.

def sensitivityQuery2(dataset):   
    """Estimate the sensitivity of the query-2 score function per language.

    The dataset is split by language with ``splitDataset``; for each language
    the function compares the maximum score of every title on that language's
    records with minimum scores computed on progressively shortened
    "neighbor" datasets, and keeps the largest absolute difference observed.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose rows are read as ``(gross, movie_title, language)``.

    Returns
    -------
    dict
        Maps each language to the largest ``abs(maxScoreQ2 - minScoreQ2)``
        found for it. When no difference was computed for a language, the
        last ``maxScoreQ2`` value is stored instead.
    """
    datasetPerLanguage = splitDataset(dataset)
    maxScorePerLanguage = dict()
    
    for language in datasetPerLanguage.keys():
        maxScore = float('-inf')
        # NOTE(review): indexOutput is incremented below but never read.
        indexOutput = 0
        newDataset = list()
        
        for record in datasetPerLanguage[language]:
            newDataset.append(record)
        
        # Records of this language as a 2-D array.
        newDataset = np.array(newDataset)
        # Starts as the whole language dataset; one leading row is dropped per iteration.
        datasetWithoutCurrentOutput = newDataset[:,:]

        for output in newDataset:
            lenDataset = len(datasetWithoutCurrentOutput)
            # Keep row indices 1..lenDataset-1, i.e. drop the first remaining row.
            datasetWithoutCurrentOutput = datasetWithoutCurrentOutput[np.delete(np.array(range(lenDataset)), 0),:]
            neighborDataset = datasetWithoutCurrentOutput[:,:]

            # Scores of the current title are computed on the full language dataset.
            scoreQ2 = scoreFunctionQuery2(newDataset, output[1])

            maxScoreQ2 = np.max(scoreQ2)
            minScoreQ2 = float('inf')

            for index in range(lenDataset - 1):
                lenNeighborDataset = len(neighborDataset)
                # Shrink the neighbor dataset by dropping its first row.
                neighborDataset = neighborDataset[np.delete(np.array(range(lenNeighborDataset)), 0),:]

                # NOTE(review): minScoreNeighbor is overwritten on every pass,
                # so only the value for the last row of neighborDataset
                # survives. When neighborDataset is empty the loop body never
                # runs and minScoreNeighbor keeps its previous value, possibly
                # from another language (or is unbound if never assigned) —
                # confirm this is intended.
                for newOutput in neighborDataset:
                    newScoreQ2 = scoreFunctionQuery2(neighborDataset, newOutput[1])
                    minScoreNeighbor = np.min(newScoreQ2)

                # The difference is only re-evaluated when a smaller minimum is found.
                if minScoreQ2 > minScoreNeighbor:
                    minScoreQ2 = minScoreNeighbor 
                    ScoreDifference = abs(maxScoreQ2 - minScoreQ2)

                    if maxScore < ScoreDifference:
                        maxScore = ScoreDifference
            
            indexOutput += 1
        # Fallback when no difference was computed (e.g. a language with one
        # movie): use maxScoreQ2 from the last record processed above.
        if maxScore == float('-inf'):
            maxScore = maxScoreQ2
            
        maxScorePerLanguage[language] = maxScore
    
    return maxScorePerLanguage

In [52]:
# DEBUG: sensitivity computed on the query-1 records; reused by the mechanism cell below.
sensQ1 = sensitivityQuery1(datasetQuery1)
sensQ1

760505847.0

In [89]:
# DEBUG: per-language sensitivity computed on the query-2 records.
sensQ2 = sensitivityQuery2(datasetQuery2)
sensQ2

{'English': 380838870.0, 'Mandarin': 9213.0, 'Aboriginal': 72515360.0}

## Mechanism

In [18]:
def outputRandomized(budget, sensitivity):
    """Draw one exponentially distributed noise value.

    Parameters
    ----------
    budget : float
        Divisor of the noise scale.
    sensitivity : float
        Numerator of the noise scale.

    Returns
    -------
    float
        A single sample from ``scipy.stats.expon`` with ``loc=0`` and
        ``scale=sensitivity / budget``.
    """
    noiseScale = sensitivity / budget
    return expon.rvs(loc=0, scale=noiseScale)

In [19]:
def randomizedQuery1(dataset, budget, sensitivity):
    """Pick a record by comparing each gross with freshly drawn noise.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose column 0 holds the gross of each movie.
    budget : float
        Forwarded to ``outputRandomized``.
    sensitivity : float
        Forwarded to ``outputRandomized``.

    Returns
    -------
    numpy.ndarray
        The row whose gross has the smallest absolute distance to its own
        noise draw (one independent draw per row).
    """
    distances = [
        abs(gross - outputRandomized(budget, sensitivity))
        for gross in dataset[:, 0]
    ]
    return dataset[np.argmin(distances)]

In [20]:
# DEBUG: one randomized answer for query 1 with budget 0.1 and the sensitivity computed above.
randomizedQuery1(datasetQuery1, 0.1, sensQ1)

array([200074175.0, 'Spectre\xa0'], dtype=object)

In [21]:
def main(filename, budget):
    """Entry point: currently only loads the dataset.

    Parameters
    ----------
    filename : str
        CSV file passed to ``loadDataset``.
    budget : object
        Not used yet; reserved for the disabled pipeline below.

    Returns
    -------
    None
    """
    rawDataset = loadDataset(filename)
    # TODO: the rest of the pipeline is still disabled:
    #     cleanDataset = possibleValues(datasetTest)
    #     sensitivityQ1 = sensitivityQuery1(cleanDataset)
    #     result = randomizedQuery1(cleanDataset, budget, sensitivityQ1)
    #     print(result)

In [22]:
# NOTE(review): an empty list is passed as the budget; main() does not currently read it.
main('movie_metadata.csv', [])