In [1]:
import numpy as np
import pandas as pd
import math
from scipy.stats import rv_discrete
from collections import Counter
import operator
from random import shuffle

## Load Dataset

In [2]:
'''
COLUMN NAME - INDEX
gross           8
movie_title    11
language       19
country        20
'''
def loadDataset(filename):
    """Read a comma-separated file and return its rows as a NumPy array.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        One row per CSV record, one column per CSV field (header excluded).
    """
    return np.array(pd.read_csv(filename, delimiter=','))

## Clearing Dataset

In [4]:
# TODO: Remove the row limit
def clearingDatasetQuery1(dataset, rowStart=0, rowStop=100):
    """Select (gross, movie_title) for query 1 and drop rows without a gross.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray
        Full movie table; column 8 is read as the gross and column 11 as the
        movie title.
    rowStart, rowStop : int or None, optional
        Bounds of the row slice that is examined. The defaults (0, 100)
        reproduce the original hard-coded window; pass ``rowStop=None`` to
        process every row from ``rowStart`` onwards.

    Returns
    -------
    numpy.ndarray
        The kept ``[gross, movie_title]`` rows, in their original order.
    """
    subset = dataset[rowStart:rowStop, [8, 11]]
    # math.isnan is only applied to the gross value (first selected column).
    keptRecords = [record for record in subset if not math.isnan(record[0])]
    return np.array(keptRecords)

In [5]:
# TODO: Remove the row limit
def clearingDatasetQuery2(dataset, rowStart=300, rowStop=400):
    """Select (gross, movie_title, language) for query 2 and drop rows without a gross.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray
        Full movie table; column 8 is read as the gross, column 11 as the
        movie title and column 19 as the language.
    rowStart, rowStop : int or None, optional
        Bounds of the row slice that is examined. The defaults (300, 400)
        reproduce the original hard-coded window; pass ``rowStop=None`` to
        process every row from ``rowStart`` onwards.

    Returns
    -------
    numpy.ndarray
        The kept ``[gross, movie_title, language]`` rows, in original order.
    """
    subset = dataset[rowStart:rowStop, [8, 11, 19]]
    # math.isnan is only applied to the gross value (first selected column).
    keptRecords = [record for record in subset if not math.isnan(record[0])]
    return np.array(keptRecords)

In [6]:
# TODO: Remove the row limit
def clearingDatasetQuery3(dataset, rowStart=300, rowStop=400):
    """Count movies per country for query 3, most frequent country first.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray
        Full movie table; column 20 is read as the country and column 11 as
        the movie title.
    rowStart, rowStop : int or None, optional
        Bounds of the row slice that is examined. The defaults (300, 400)
        reproduce the original hard-coded window; pass ``rowStop=None`` to
        process every row from ``rowStart`` onwards.

    Returns
    -------
    numpy.ndarray
        Array of ``[country, count]`` rows sorted by (count, country) in
        descending order. Has zero rows when no usable record exists.
    """
    def isMissing(value):
        # Missing cells arrive as float NaN / None rather than the text 'NaN';
        # the literal 'NaN' string check is kept for backward compatibility.
        return pd.isnull(value) or value == 'NaN'

    subset = dataset[rowStart:rowStop, [20, 11]]

    # A record is only counted when both its country and its title are known.
    # Dropping missing countries also keeps the sort below from having to
    # compare a float NaN against a string.
    countries = [country for country, title in subset
                 if not (isMissing(country) or isMissing(title))]

    countOccurrences = Counter(countries)
    countOccurrencesSorted = sorted(countOccurrences.items(),
                                    key=lambda kv: (kv[1], kv[0]),
                                    reverse=True)

    names = [name for name, _ in countOccurrencesSorted]
    counts = [count for _, count in countOccurrencesSorted]

    # The column labels are discarded by np.array; only the values are returned.
    return np.array(pd.DataFrame({'language': names, 'count': counts}))

In [7]:
def possibleValues(dataset, query):
    """Build the candidate-output set for query 1 or 2, one row per movie title.

    The dataset is first cleaned with the matching ``clearingDatasetQuery*``
    function, then rows whose title (element 1) was already seen are removed,
    keeping the first occurrence of each title.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray
        Full movie table as returned by ``loadDataset``.
    query : int
        1 or 2. Any other value prints 'Consulta inválida' ("invalid query")
        and yields ``None``.

    Returns
    -------
    numpy.ndarray or None
        The cleaned rows without repeated titles, or ``None`` for an
        unsupported query number.
    """
    if query == 1:
        cleanDataset = clearingDatasetQuery1(dataset)
    elif query == 2:
        cleanDataset = clearingDatasetQuery2(dataset)
    else:
        print('Consulta inválida')
        return None

    # Single pass with a set and one boolean mask instead of calling
    # np.delete (a full array copy) for every duplicate found.
    seenTitles = set()
    keepMask = np.ones(len(cleanDataset), dtype=bool)

    for position, record in enumerate(cleanDataset):
        title = record[1]
        if title in seenTitles:
            keepMask[position] = False
        else:
            seenTitles.add(title)

    # Boolean indexing returns a new array, so the cleaned input is untouched.
    return cleanDataset[keepMask]

In [8]:
def splitDataset(dataset):
    """Group records by the value stored at position 2 of each record.

    Returns a dict mapping that value to the list of records carrying it,
    with both keys and records kept in first-seen order.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[2], []).append(row)
    return grouped

## Queries

In [9]:
def query1(dataset):
    """Return the first row whose column-0 value equals the column maximum."""
    firstColumn = dataset[:, 0]
    highest = np.max(firstColumn, axis=0)
    isHighest = firstColumn == highest
    # Several rows may tie for the maximum; the earliest one is returned.
    return dataset[isHighest][0]

In [10]:
def query2(dataset):
    """Find, for each key at record position 2, the record with the largest value at position 0.

    Returns a dict mapping each key to ``(largest value, record[1])``. A
    record only replaces the current best when its value is strictly
    greater, so the earliest record wins ties; a group whose values never
    exceed ``-inf`` keeps ``(-inf, None)``.
    """
    bestPerKey = {}

    for row in dataset:
        groupKey = row[2]
        # Every group starts from the same sentinel as a per-group scan would.
        currentBest = bestPerKey.setdefault(groupKey, (float('-inf'), None))
        if row[0] > currentBest[0]:
            bestPerKey[groupKey] = (row[0], row[1])

    return bestPerKey

In [11]:
def query3(dataset):
    """Map the first element to the second for the first three rows of ``dataset``."""
    return {name: occurrences for name, occurrences in dataset[:3]}

## Score Functions

In [12]:
def scoreFunctionQuery1(dataset, output):
    """Score every record against a candidate output for query 1.

    A record contributes its element 0 when its element 1 equals ``output``
    and 0 otherwise. Returns one score per record, in order.
    """
    return [row[0] if row[1] == output else 0 for row in dataset]

In [13]:
def scoreFunctionQuery2(dataset, output):
    """Score every record against a candidate output for query 2.

    A record contributes its element 0 when its element 1 equals ``output``
    and 0 otherwise. Returns one score per record, in order.
    """
    return [row[0] if row[1] == output else 0 for row in dataset]

In [14]:
def scoreFunctionQuery3(dataset, output):
    """Score every record against a candidate output for query 3.

    A record contributes its element 1 when its element 0 equals ``output``
    and 0 otherwise. Returns one score per record, in order.
    """
    return [row[1] if row[0] == output else 0 for row in dataset]

## Sensitivities

In [15]:
def sensitivityQuery1(dataset):
    """Compute the sensitivity value used by the randomized version of query 1.

    For each record, the maximum of its scoreFunctionQuery1 scores on the full
    dataset is compared with a running minimum of the scores computed on
    progressively shorter suffixes of the dataset ("neighbor" datasets). The
    largest absolute difference seen across all records is returned.

    Parameters
    ----------
    dataset : 2-D array indexed as ``dataset[:, :]``; element 1 of each row is
        passed to scoreFunctionQuery1 as the candidate output.

    Returns
    -------
    The largest score difference found. Stays at ``float('-inf')`` when the
    innermost comparison never updates it.
    """
    # Shrinking view of the dataset: loses its first row on every outer iteration.
    datasetWithoutCurrentOutput = dataset[:,:]
    
    maxScore = float('-inf')
    # NOTE: indexOutput is incremented below but never read in this function.
    indexOutput = 0
    
    for output in dataset:
        # Length is taken BEFORE the first row is dropped; it bounds the inner loop.
        lenDataset = len(datasetWithoutCurrentOutput)
        # Keep every row except the one at position 0.
        datasetWithoutCurrentOutput = datasetWithoutCurrentOutput[np.delete(np.array(range(lenDataset)), 0),:]
        neighborDataset = datasetWithoutCurrentOutput[:,:]
        
        # Best score of the current candidate, evaluated on the full dataset.
        scoreQ1 = scoreFunctionQuery1(dataset, output[1])
        maxScoreQ1 = np.max(scoreQ1)
        # Running minimum is reset for every candidate output.
        minScoreQ1 = float('inf')
    
        for index in range(lenDataset - 1):
            # Each pass removes one more leading row from the neighbor dataset.
            lenNeighborDataset = len(neighborDataset)
            neighborDataset = neighborDataset[np.delete(np.array(range(lenNeighborDataset)), 0),:]
            
            for newOutput in neighborDataset:
                newScoreQ1 = scoreFunctionQuery1(neighborDataset, newOutput[1])
                minScoreNeighbor = np.min(newScoreQ1)

                # The difference is only recomputed when the running minimum drops.
                if minScoreQ1 > minScoreNeighbor:
                    minScoreQ1 = minScoreNeighbor 
                    ScoreDifference = abs(maxScoreQ1 - minScoreQ1)

                    if maxScore < ScoreDifference:
                        maxScore = ScoreDifference
        
        indexOutput += 1
    
    return maxScore

In [16]:
def sensitivityQuery2(dataset):   
    """Compute one sensitivity value per language for the randomized query 2.

    The dataset is grouped with splitDataset. Within each group, every
    record's maximum scoreFunctionQuery2 score on the whole group is compared
    with a running minimum of the scores computed on progressively shorter
    suffixes of the group; the largest absolute difference is kept.

    Parameters
    ----------
    dataset : iterable of records; element 1 of each record is passed to
        scoreFunctionQuery2 as the candidate output, and splitDataset groups
        the records by element 2.

    Returns
    -------
    dict mapping each group key to its sensitivity value. When no difference
    was computed for a group, the maximum score of the last record processed
    in that group is used instead.
    """
    datasetPerLanguage = splitDataset(dataset)
    maxScorePerLanguage = dict()
    
    for language in datasetPerLanguage.keys():
        maxScore = float('-inf')
        # NOTE: indexOutput is incremented below but never read in this function.
        indexOutput = 0
        newDataset = list()
        
        # Copy the group's records so they can be turned into a 2-D array.
        for record in datasetPerLanguage[language]:
            newDataset.append(record)
        
        newDataset = np.array(newDataset)
        # Shrinking view of the group: loses its first row on every iteration below.
        datasetWithoutCurrentOutput = newDataset[:,:]

        for output in newDataset:
            # Length is taken BEFORE the first row is dropped; it bounds the inner loop.
            lenDataset = len(datasetWithoutCurrentOutput)
            datasetWithoutCurrentOutput = datasetWithoutCurrentOutput[np.delete(np.array(range(lenDataset)), 0),:]
            neighborDataset = datasetWithoutCurrentOutput[:,:]

            # Best score of the current candidate, evaluated on the whole group.
            scoreQ2 = scoreFunctionQuery2(newDataset, output[1])

            maxScoreQ2 = np.max(scoreQ2)
            # Running minimum is reset for every candidate output.
            minScoreQ2 = float('inf')

            for index in range(lenDataset - 1):
                # Each pass removes one more leading row from the neighbor dataset.
                lenNeighborDataset = len(neighborDataset)
                neighborDataset = neighborDataset[np.delete(np.array(range(lenNeighborDataset)), 0),:]

                for newOutput in neighborDataset:
                    newScoreQ2 = scoreFunctionQuery2(neighborDataset, newOutput[1])
                    minScoreNeighbor = np.min(newScoreQ2)

                    # The difference is only recomputed when the running minimum drops.
                    if minScoreQ2 > minScoreNeighbor:
                        minScoreQ2 = minScoreNeighbor 
                        ScoreDifference = abs(maxScoreQ2 - minScoreQ2)

                        if maxScore < ScoreDifference:
                            maxScore = ScoreDifference
            
            indexOutput += 1
        # Fallback when no difference was ever computed for this group:
        # use the max score of the last candidate processed above.
        if maxScore == float('-inf'):
            maxScore = maxScoreQ2
            
        maxScorePerLanguage[language] = maxScore
    
    return maxScorePerLanguage

In [17]:
def sensitivityQuery3(dataset):
    """Compute the sensitivity value used by the randomized version of query 3.

    For each record, the maximum of its scoreFunctionQuery3 scores on the full
    dataset is compared with a running minimum of the scores computed on
    progressively shorter suffixes of the dataset ("neighbor" datasets). The
    largest absolute difference seen across all records is returned.

    Parameters
    ----------
    dataset : 2-D array indexed as ``dataset[:, :]``; element 0 of each row is
        passed to scoreFunctionQuery3 as the candidate output.

    Returns
    -------
    The largest score difference found. Stays at ``float('-inf')`` when the
    innermost comparison never updates it.
    """
    # Shrinking view of the dataset: loses its first row on every outer iteration.
    datasetWithoutCurrentOutput = dataset[:,:]
    
    maxScore = float('-inf')
    # NOTE: indexOutput is incremented below but never read in this function.
    indexOutput = 0
    
    for output in dataset:
        # Length is taken BEFORE the first row is dropped; it bounds the inner loop.
        lenDataset = len(datasetWithoutCurrentOutput)
        # Keep every row except the one at position 0.
        datasetWithoutCurrentOutput = datasetWithoutCurrentOutput[np.delete(np.array(range(lenDataset)), 0),:]
        neighborDataset = datasetWithoutCurrentOutput[:,:]
        
        # Best score of the current candidate, evaluated on the full dataset.
        scoreQ3 = scoreFunctionQuery3(dataset, output[0])
        maxScoreQ3 = np.max(scoreQ3)
        
        # Running minimum is reset for every candidate output.
        minScoreQ3 = float('inf')
    
        for index in range(lenDataset - 1):
            # Each pass removes one more leading row from the neighbor dataset.
            lenNeighborDataset = len(neighborDataset)
            neighborDataset = neighborDataset[np.delete(np.array(range(lenNeighborDataset)), 0),:]
            
            for newOutput in neighborDataset:
                newScoreQ3 = scoreFunctionQuery3(neighborDataset, newOutput[0])
                minScoreNeighbor = np.min(newScoreQ3)

                # The difference is only recomputed when the running minimum drops.
                if minScoreQ3 > minScoreNeighbor:
                    minScoreQ3 = minScoreNeighbor 
                    ScoreDifference = abs(maxScoreQ3 - minScoreQ3)

                    if maxScore < ScoreDifference:
                        maxScore = ScoreDifference
        
        indexOutput += 1
    
    return maxScore

## Mechanism

In [18]:
def calculateExp(score, budget, sensitivity):
    """Return exp(budget * score / (2 * sensitivity)), the unnormalized weight of an output."""
    exponent = (budget * score) / (2 * sensitivity)
    return math.exp(exponent)

In [19]:
def calculateOutputProbabilityQuery1(dataset, budget, sensitivity):
    """Turn each record of ``dataset`` into a selection probability for query 1.

    Each record's weight is ``calculateExp`` applied to the maximum of its
    ``scoreFunctionQuery1`` scores; the weights are then divided by their sum.
    Returns one probability per record, in dataset order.
    """
    weights = [
        calculateExp(np.max(scoreFunctionQuery1(dataset, candidate[1])),
                     budget, sensitivity)
        for candidate in dataset
    ]
    normalizer = np.sum(weights)
    return [weight / normalizer for weight in weights]

In [20]:
def calculateOutputProbabilityQuery2(dataset, language, budget, sensitivity):
    """Turn each record of ``dataset`` into a selection probability for query 2.

    ``sensitivity`` is indexed by ``language`` to obtain the value passed to
    ``calculateExp``. Each record's weight uses the maximum of its
    ``scoreFunctionQuery2`` scores; the weights are then divided by their sum.
    Returns one probability per record, in dataset order.
    """
    weights = [
        # The per-language lookup stays inside the loop so that an empty
        # dataset never touches ``sensitivity``.
        calculateExp(np.max(scoreFunctionQuery2(dataset, candidate[1])),
                     budget, sensitivity[language])
        for candidate in dataset
    ]
    normalizer = np.sum(weights)
    return [weight / normalizer for weight in weights]

In [21]:
def calculateOutputProbabilityQuery3(dataset, budget, sensitivity):
    """Turn each record of ``dataset`` into a selection probability for query 3.

    Each record's weight is ``calculateExp`` applied to the maximum of its
    ``scoreFunctionQuery3`` scores; the weights are then divided by their sum.
    Returns one probability per record, in dataset order.
    """
    weights = [
        calculateExp(np.max(scoreFunctionQuery3(dataset, candidate[0])),
                     budget, sensitivity)
        for candidate in dataset
    ]
    normalizer = np.sum(weights)
    return [weight / normalizer for weight in weights]

In [22]:
def randomizedQuery1(dataset, budget, sensitivity):
    """Draw one row of ``dataset`` at random for query 1.

    Row indices are sampled from a discrete distribution whose probabilities
    come from ``calculateOutputProbabilityQuery1``. Returns the selected row
    as a 2-D slice (one row) of ``dataset``.
    """
    probabilities = calculateOutputProbabilityQuery1(dataset, budget, sensitivity)
    candidateIndexes = np.array(range(len(dataset)))
    sampler = rv_discrete(values=(candidateIndexes, probabilities))
    chosen = sampler.rvs(size=1)
    return dataset[chosen, :]

In [23]:
def randomizedQuery2(dataset, budget, sensitivity):
    """Draw one random row per language group for query 2.

    The dataset is grouped with ``splitDataset``; within each group a row
    index is sampled from a discrete distribution whose probabilities come
    from ``calculateOutputProbabilityQuery2``. Returns a dict mapping each
    group key to the selected row (as a one-row 2-D array).
    """
    sampledPerLanguage = {}

    for language, records in splitDataset(dataset).items():
        languageRows = np.array(list(records))

        candidateIndexes = np.array(range(len(languageRows)))
        probabilities = calculateOutputProbabilityQuery2(
            languageRows, language, budget, sensitivity)
        sampler = rv_discrete(values=(candidateIndexes, probabilities))
        sampledPerLanguage[language] = languageRows[sampler.rvs(size=1), :]

    return sampledPerLanguage

In [24]:
def randomizedQuery3(dataset, budget, sensitivity):
    """Draw up to three distinct rows of ``dataset`` at random for query 3.

    The budget is divided by three and the same share is used for every
    draw. After each draw the selected row is removed from the candidate set,
    so a row cannot be picked twice.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray of candidate rows.
    budget : number, total budget shared by the three draws.
    sensitivity : number passed through to calculateOutputProbabilityQuery3.

    Returns
    -------
    list of one-row 2-D arrays, in draw order. Holds three entries, or fewer
    when ``dataset`` has fewer than three rows.
    """
    budget = budget / 3

    rankedList = list()
    # Never attempt more draws than there are candidates: sampling from an
    # empty candidate set makes rv_discrete raise.
    draws = min(3, len(dataset))

    for rank in range(draws):
        lenDataset = len(dataset)
        listProbabilities = calculateOutputProbabilityQuery3(dataset, budget, sensitivity)
        distribution = rv_discrete(values=(np.array(range(lenDataset)), listProbabilities))
        indexOutput = distribution.rvs(size=1)
        rankedList.append(dataset[indexOutput, :])

        # Rebinds the local name only; the caller's array is not modified.
        dataset = dataset[np.delete(np.array(range(lenDataset)), indexOutput), :]

    return rankedList

In [25]:
def main(filename, budgets, seed=None):
    """Run the three exact queries and their randomized versions, printing the results.

    Parameters
    ----------
    filename : path of the CSV file handed to loadDataset.
    budgets : iterable of budget values; the randomized queries are re-run
        and printed once per budget.
    seed : int or None, optional
        When given, NumPy's global random generator is seeded before any
        sampling so the randomized outputs are reproducible. The default
        (None) leaves the generator untouched, as before.

    Returns
    -------
    None. Results are written to standard output.
    """
    if seed is not None:
        # rv_discrete.rvs draws from NumPy's global RandomState by default.
        np.random.seed(seed)

    dataset = loadDataset(filename)

    datasetQuery1 = possibleValues(dataset, 1)
    datasetQuery2 = possibleValues(dataset, 2)
    datasetQuery3 = clearingDatasetQuery3(dataset)

    # Exact answers and sensitivities do not depend on the budget, so they
    # are computed once up front.
    q1 = query1(datasetQuery1)
    q2 = query2(datasetQuery2)
    q3 = query3(datasetQuery3)

    sensQ1 = sensitivityQuery1(datasetQuery1)
    sensQ2 = sensitivityQuery2(datasetQuery2)
    sensQ3 = sensitivityQuery3(datasetQuery3)

    for budget in budgets:

        randomized1 = randomizedQuery1(datasetQuery1, budget, sensQ1)
        randomized2 = randomizedQuery2(datasetQuery2, budget, sensQ2)
        randomized3 = randomizedQuery3(datasetQuery3, budget, sensQ3)

        print('Budget {} \n'.format(budget))

        print('Original query 1 = {} '.format(q1))
        print('Randomized query 1 = {} '.format(randomized1))
        print('Sensitivity query 1 = {} '.format(sensQ1))
        print('\n')

        print('Original query 2 = {} '.format(q2))
        print('Randomized query 2 = {} '.format(randomized2))
        print('Sensitivity query 2 = {} '.format(sensQ2))
        print('\n')

        print('Original query 3 = {} '.format(q3))
        print('Randomized query 3 = {} '.format(randomized3))
        print('Sensitivity query 3 = {} '.format(sensQ3))
        print('\n')

In [26]:
# Run the whole pipeline on the movie metadata CSV for budgets 0.1, 1 and 10.
main('movie_metadata.csv', [0.1, 1, 10])

Budget 0.1 

Original query 1 = [760505847.0 'Avatar\xa0'] 
Randomized query 1 = [[234903076.0 'Oz the Great and Powerful\xa0']] 
Sensitivity query 1 = 760505847.0 


Original query 2 = {'English': (380838870.0, 'Finding Nemo\xa0'), 'Mandarin': (9213.0, 'The Flowers of War\xa0'), 'Aboriginal': (72515360.0, 'The Interpreter\xa0')} 
Randomized query 2 = {'English': array([[34964818.0, 'Pan\xa0', 'English']], dtype=object), 'Mandarin': array([[9213.0, 'The Flowers of War\xa0', 'Mandarin']], dtype=object), 'Aboriginal': array([[72515360.0, 'The Interpreter\xa0', 'Aboriginal']], dtype=object)} 
Sensitivity query 2 = {'English': 380838870.0, 'Mandarin': 9213.0, 'Aboriginal': 72515360.0} 


Original query 3 = {'USA': 88, 'UK': 5, 'Germany': 4} 
Randomized query 3 = [array([['UK', 5]], dtype=object), array([['France', 2]], dtype=object), array([['Germany', 4]], dtype=object)] 
Sensitivity query 3 = 88 


Budget 1 

Original query 1 = [760505847.0 'Avatar\xa0'] 
Randomized query 1 = [[90755643.