In [14]:
import numpy as np
import pandas as pd
import math
from scipy.stats import expon

## Load Dataset

In [2]:
'''
COLUMN NAME - INDEX
gross           8
movie_title    11
language       19
country        20
'''
def loadDataset(filename):
    """Read a comma-separated file and return its contents as a NumPy array.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        The parsed table converted with ``np.array``.
    """
    frame = pd.read_csv(filename, sep=',')
    rows = np.array(frame)
    return rows

In [3]:
# DEBUG: load the raw CSV into a NumPy array and display it.
# datasetTest is reused by the cells further down.
datasetTest = loadDataset('movie_metadata.csv')
datasetTest

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

## Clearing Dataset

In [4]:
# TODO: Remove the row limit (only the first 200 rows are used).
def clearingDatasetQuery1(dataset, limit=200):
    """Select the (gross, movie_title) columns and drop rows with a missing gross.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array with gross in column 8 and movie_title in column 11.
    limit : int or None, optional
        Number of leading rows to consider. Defaults to 200, the value that was
        previously hard-coded; pass ``None`` to use every row.

    Returns
    -------
    numpy.ndarray
        Array with two columns, ``[gross, movie_title]``, containing only the
        rows whose gross is not missing. The result is always 2-D, so an input
        without any valid row yields shape ``(0, 2)``.
    """
    subset = dataset[:limit, [8, 11]]
    # pd.isna works element-wise on object arrays and also treats None as missing.
    hasGross = ~pd.isna(subset[:, 0])
    return subset[hasGross]

In [5]:
# Keep only the (gross, movie_title) columns of the rows whose gross is not NaN.
cleanDataset = clearingDatasetQuery1(datasetTest)

## Possible values

In [6]:
def possibleValues(dataset):
    """Return the cleaned (gross, movie_title) rows with duplicate titles removed.

    The raw dataset is first passed through ``clearingDatasetQuery1``; for each
    title only its first occurrence is kept, and the original row order is
    preserved.

    Parameters
    ----------
    dataset : numpy.ndarray
        Raw dataset as accepted by ``clearingDatasetQuery1``.

    Returns
    -------
    numpy.ndarray
        New array holding the first row seen for every distinct title.
    """
    cleanDataset = clearingDatasetQuery1(dataset)

    # A set plus a boolean mask gives one pass over the rows, instead of a list
    # membership test and an np.delete copy of the whole array per duplicate.
    seenTitles = set()
    keep = np.zeros(len(cleanDataset), dtype=bool)
    for position, record in enumerate(cleanDataset):
        title = record[1]
        if title not in seenTitles:
            seenTitles.add(title)
            keep[position] = True

    return cleanDataset[keep]

## Queries

In [7]:
def query1(dataset):
    """Return the first record whose gross (column 0) equals the maximum gross."""
    grossColumn = dataset[:, 0]
    highestGross = grossColumn.max(axis=0)
    matchingRows = np.where(grossColumn == highestGross)
    winners = dataset[matchingRows]
    return winners[0]

## Score Functions

In [8]:
def cartesianProduct(attributeA, attributeB):
    """Pair every element of attributeA with every element of attributeB.

    Returns a 2-column array in which attributeA cycles fastest: the first
    column is attributeA tiled once per element of attributeB, the second is
    each element of attributeB repeated ``len(attributeA)`` times.
    """
    firstColumn = np.tile(attributeA, len(attributeB))
    secondColumn = np.repeat(attributeB, len(attributeA))
    pairs = [firstColumn, secondColumn]
    return np.transpose(pairs)

In [9]:
def scoreFunctionQuery1(dataset, output):
    """Score every record against a candidate output title.

    A record scores its own gross (element 0) when its title (element 1)
    equals ``output`` and 0 otherwise. The scores are returned as a list in
    record order.
    """
    return [entry[0] if entry[1] == output else 0 for entry in dataset]

## Sensitivities

In [10]:
def sensitivityQuery1(dataset):
    """Estimate the sensitivity of the query-1 score function.

    Every record is taken in turn as the candidate output. Its highest score
    over the full dataset is compared with the lowest score observed on
    progressively shorter "neighbor" datasets built from the records that
    follow it, and the largest absolute difference found is returned.

    Parameters
    ----------
    dataset : numpy.ndarray
        Rows of ``[gross, movie_title]``.

    Returns
    -------
    float
        Largest ``abs(max score - min neighbor score)`` found, or ``-inf`` when
        no non-empty neighbor dataset exists (fewer than three records).

    Notes
    -----
    Empty neighbor datasets are now skipped. Previously the comparison ran with
    a ``minScoreNeighbor`` left over from an earlier iteration, or raised
    ``UnboundLocalError`` when none had been computed yet (e.g. two records).
    """
    maxScore = float('-inf')
    remaining = dataset

    for output in dataset:
        # Records that come after the current candidate output.
        remaining = remaining[1:]

        maxScoreQ1 = np.max(scoreFunctionQuery1(dataset, output[1]))
        minScoreQ1 = float('inf')

        # Neighbor datasets are the successively shorter, non-empty suffixes.
        for start in range(1, len(remaining)):
            neighborDataset = remaining[start:]

            # NOTE(review): as in the previous implementation, only the scores
            # for the LAST neighbor record's title are used; confirm whether
            # the minimum over every candidate title was intended instead.
            lastTitle = neighborDataset[-1][1]
            minScoreNeighbor = np.min(scoreFunctionQuery1(neighborDataset, lastTitle))

            if minScoreQ1 > minScoreNeighbor:
                minScoreQ1 = minScoreNeighbor
                scoreDifference = abs(maxScoreQ1 - minScoreQ1)

                if maxScore < scoreDifference:
                    maxScore = scoreDifference

    return maxScore

In [11]:
# Rebuild the cleaned (gross, movie_title) array and compute the query-1 sensitivity.
cleanDataset = clearingDatasetQuery1(datasetTest)
# len(cleanDataset)
sensQ1 = sensitivityQuery1(cleanDataset)

In [12]:
# Display the sensitivity computed in the previous cell.
sensQ1

760505847.0

## Mechanism

In [15]:
# Same value as the sensQ1 output shown above, entered here as a literal.
sensitivity = 760505847.0
budget = 0.1
# Fixed seed so the draw below is repeatable.
np.random.seed(7)
# Single exponential draw with scale = sensitivity / budget.
expon.rvs(loc = 0, scale = sensitivity/budget)

603666038.0784853

In [44]:
def outputRandomized(budget, sensitivity, seed=7):
    """Draw one exponentially distributed value with scale ``sensitivity / budget``.

    Parameters
    ----------
    budget : float
        Must be strictly positive.
    sensitivity : float
        Numerator of the scale of the exponential distribution.
    seed : int, numpy.random.RandomState, numpy.random.Generator or None, optional
        Passed to ``expon.rvs`` as ``random_state``. The default ``7`` yields
        the same value as seeding the global NumPy generator with 7 before the
        draw, but without overwriting the caller's global random state. Pass
        ``None`` to draw from the global generator.

    Returns
    -------
    float
        The sampled value.

    Raises
    ------
    ValueError
        If ``budget`` is not strictly positive.
    """
    if budget <= 0:
        raise ValueError("budget must be strictly positive")
    return expon.rvs(loc=0, scale=sensitivity / budget, random_state=seed)

In [45]:
def randomizedQuery1(dataset, budget=0.1, sensitivity=760505847.0):
    """Return the record whose gross is closest to a randomized output.

    For every record, ``outputRandomized(budget, sensitivity)`` is called and
    the absolute distance between the record's gross (column 0) and that value
    is computed; the record with the smallest distance is returned.

    Parameters
    ----------
    dataset : numpy.ndarray
        Rows of ``[gross, movie_title]``.
    budget : float, optional
        Forwarded to ``outputRandomized``. Defaults to 0.1, the value that was
        previously hard-coded.
    sensitivity : float, optional
        Forwarded to ``outputRandomized``. Defaults to 760505847.0, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The selected ``[gross, movie_title]`` row.

    Raises
    ------
    ValueError
        If ``dataset`` contains no records.
    """
    if len(dataset) == 0:
        raise ValueError("dataset must contain at least one record")

    distances = [abs(gross - outputRandomized(budget, sensitivity))
                 for gross in dataset[:, 0]]
    return dataset[np.argmin(distances)]

In [46]:
# Run the randomized query on the cleaned (gross, movie_title) array.
randomizedQuery1(cleanDataset)

array([623279547.0, 'The Avengers\xa0'], dtype=object)

In [None]:
def main(filename, budgets):
    """Load a dataset, de-duplicate it and print the randomized query-1 result.

    Parameters
    ----------
    filename : str
        Path of the CSV file to load.
    budgets : list
        Accepted for interface compatibility.
        NOTE(review): not forwarded, because ``randomizedQuery1`` is called
        with the dataset only; confirm how budgets should be applied.
    """
    dataset = loadDataset(filename)
    # Use the dataset loaded above; the previous code read the notebook-level
    # ``datasetTest`` variable and ignored ``filename``.
    cleanDataset = possibleValues(dataset)

    # The previous code called ``sensitivity(cleanDataset, output)``: at
    # notebook level ``sensitivity`` is a float and ``output`` is undefined, and
    # the result was passed to ``randomizedQuery1`` in place of the dataset.
    result = randomizedQuery1(cleanDataset)
    print(result)

In [None]:
# Run the full pipeline on the movie metadata CSV with an empty list of budgets.
main('movie_metadata.csv', [])