In [1]:
import numpy as np
import pandas as pd
import math
from scipy.stats import rv_discrete
from collections import Counter
import operator
from random import shuffle

## Load Dataset

In [2]:
'''
COLUMN NAME - INDEX
gross           8
movie_title    11
language       19
country        20
'''
def loadDataset(filename):
    """Read a comma-separated file and return its rows as a NumPy array.

    Args:
        filename: Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        numpy.ndarray: One row per parsed CSV record, columns in file order.
    """
    frame = pd.read_csv(filename, sep=',')
    return np.array(frame)

## Clearing Dataset

In [3]:
def clearingDatasetQuery1(dataset):
    """Keep the gross and movie-title columns of rows whose gross is present.

    Args:
        dataset: 2-D array of records; column 8 holds the gross and column 11
            the movie title.

    Returns:
        numpy.ndarray: 2-D array with columns ``[gross, title]`` containing
        only the rows whose gross is not missing. The result stays 2-D
        (shape ``(0, 2)``) even when no row survives, so callers can keep
        indexing with ``[:, 0]``.
    """
    subset = dataset[:, [8, 11]]
    # pd.isnull copes with object-dtype columns and treats None like NaN,
    # where math.isnan would raise on anything that is not a float.
    hasGross = ~pd.isnull(subset[:, 0])
    return subset[hasGross]

In [4]:
# TODO: Remove line limit
def clearingDatasetQuery2(dataset):
    """Keep gross, movie title and language of rows whose gross is present.

    Args:
        dataset: 2-D array of records; column 8 holds the gross, column 11
            the movie title and column 19 the language.

    Returns:
        numpy.ndarray: 2-D array with columns ``[gross, title, language]``
        containing only the rows whose gross is not missing. The result stays
        2-D (shape ``(0, 3)``) even when no row survives.
    """
    subset = dataset[:, [8, 11, 19]]
    # pd.isnull copes with object-dtype columns and treats None like NaN,
    # where math.isnan would raise on anything that is not a float.
    hasGross = ~pd.isnull(subset[:, 0])
    return subset[hasGross]

In [5]:
def clearingDatasetQuery3(dataset):
    """Count how many records each country has.

    Rows missing either the country (column 20) or the movie title
    (column 11) are discarded before counting.

    Args:
        dataset: 2-D array of records.

    Returns:
        numpy.ndarray: 2-D array with one row per distinct country, laid out
        as ``[country, count]`` in order of first appearance.
    """
    subset = dataset[:, [20, 11]]
    complete = pd.DataFrame({'country': subset[:, 0], 'title': subset[:, 1]}).dropna()

    occurrences = Counter(complete['country'])
    countries = list(occurrences.keys())
    counts = list(occurrences.values())

    # Callers index the result by position (0 = name, 1 = count), so the
    # column order is pinned explicitly instead of relying on dict ordering.
    table = pd.DataFrame({'language': countries, 'count': counts},
                         columns=['language', 'count'])
    return np.array(table)

In [6]:
def possibleValues(dataset, query):
    """Clean the dataset for a query and drop records with repeated titles.

    Args:
        dataset: Raw 2-D array of records.
        query: ``1`` to clean with ``clearingDatasetQuery1``, ``2`` to clean
            with ``clearingDatasetQuery2``.

    Returns:
        numpy.ndarray: The cleaned records with only the first occurrence of
        each title (index 1) kept, in their original order. ``None`` (after
        printing a message) when ``query`` is neither 1 nor 2.
    """
    if query == 1:
        cleanDataset = clearingDatasetQuery1(dataset)
    elif query == 2:
        cleanDataset = clearingDatasetQuery2(dataset)
    else:
        print('Consulta inválida')
        return None

    # One pass with a set and a boolean mask; avoids rebuilding the array
    # with np.delete for every duplicate and scanning a list for every row.
    seenTitles = set()
    keepMask = np.zeros(len(cleanDataset), dtype=bool)
    for position, record in enumerate(cleanDataset):
        title = record[1]
        if title not in seenTitles:
            seenTitles.add(title)
            keepMask[position] = True

    return cleanDataset[keepMask]

In [7]:
def splitDataset(dataset):
    """Group records by the value stored at index 2.

    Args:
        dataset: Iterable of records that can be indexed at position 2.

    Returns:
        dict: Maps each distinct ``record[2]`` value to the list of records
        carrying it, preserving the order in which they were encountered.
    """
    recordsByKey = {}
    for record in dataset:
        recordsByKey.setdefault(record[2], []).append(record)
    return recordsByKey

## Queries

In [8]:
def query1(dataset):
    """Return the first record whose column 0 equals the column maximum.

    Args:
        dataset: 2-D array whose column 0 holds the value to maximise.

    Returns:
        numpy.ndarray: The full row of the first record reaching the maximum.
    """
    grossColumn = dataset[:, 0]
    topGross = np.max(grossColumn, axis=0)
    winners = np.flatnonzero(grossColumn == topGross)
    return dataset[winners[0]]

In [9]:
def query2(dataset):
    """Find, for every group key at index 2, the record with the largest index-0 value.

    Args:
        dataset: Iterable of records laid out as ``[value, name, key]``.

    Returns:
        dict: Maps each key to a ``(maxValue, name)`` tuple. Ties keep the
        first record seen; a group in which no value exceeds negative
        infinity maps to ``(float('-inf'), None)``.
    """
    maxGrossPerLanguage = {}
    for record in dataset:
        gross, title, language = record[0], record[1], record[2]
        # Start every group from (-inf, None) so the first comparable value wins.
        currentBest = maxGrossPerLanguage.get(language, (float('-inf'), None))
        if gross > currentBest[0]:
            currentBest = (gross, title)
        maxGrossPerLanguage[language] = currentBest
    return maxGrossPerLanguage

In [10]:
def query3(dataset):
    """Return the three rows with the largest value at index 1.

    Args:
        dataset: Iterable of rows that can be indexed at position 1.

    Returns:
        list: Up to three rows in descending order of ``row[1]``; rows with
        equal values keep their original relative order.
    """
    ranked = list(dataset)
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[:3]

## Score Functions

In [11]:
def scoreFunctionQuery1(dataset, output):
    """Score every record against a candidate output for query 1.

    Args:
        dataset: Iterable of records laid out as ``[score, name, ...]``.
        output: Candidate name to compare with ``record[1]``.

    Returns:
        list: One entry per record: ``record[0]`` where the name matches
        ``output`` and ``0`` everywhere else.
    """
    return [record[0] if record[1] == output else 0 for record in dataset]

In [12]:
def scoreFunctionQuery2(dataset, output):
    """Score every record against a candidate output for query 2.

    Args:
        dataset: Iterable of records laid out as ``[score, name, ...]``.
        output: Candidate name to compare with ``record[1]``.

    Returns:
        list: One entry per record: ``record[0]`` where the name matches
        ``output`` and ``0`` everywhere else.
    """
    def scoreOf(record):
        # Only the record(s) carrying the candidate name contribute a score.
        if record[1] == output:
            return record[0]
        return 0

    return list(map(scoreOf, dataset))

In [13]:
def scoreFunctionQuery3(dataset, output):
    """Score every record against a candidate output for query 3.

    Args:
        dataset: Iterable of records laid out as ``[name, score]``.
        output: Candidate name to compare with ``record[0]``.

    Returns:
        list: One entry per record: ``record[1]`` where the name matches
        ``output`` and ``0`` everywhere else.
    """
    scores = []
    for record in dataset:
        matches = record[0] == output
        scores.append(record[1] if matches else 0)
    return scores

## Sensitivities

In [14]:
def sensitivityQuery1(dataset):
    """Return the sensitivity used for query 1: the largest value in column 0.

    Args:
        dataset: 2-D array whose column 0 holds the scores.

    Returns:
        The maximum of column 0.
    """
    scoreColumn = dataset[:, 0]
    return scoreColumn.max()

In [15]:
def sensitivityQuery2(dataset):
    """Return the per-group sensitivity used for query 2.

    The dataset is partitioned with ``splitDataset`` and, for every group,
    the largest value in column 0 is taken.

    Args:
        dataset: Iterable of records accepted by ``splitDataset``.

    Returns:
        dict: Maps each group key to the maximum column-0 value of that group.
    """
    return {
        language: np.max(np.array(records)[:, 0])
        for language, records in splitDataset(dataset).items()
    }

In [16]:
def sensitivityQuery3(dataset):
    """Return the sensitivity used for query 3: the largest value in column 1.

    Args:
        dataset: 2-D array whose column 1 holds the scores.

    Returns:
        The maximum of column 1.
    """
    scoreColumn = dataset[:, 1]
    return scoreColumn.max()

## Mechanism

In [17]:
def calculateExp(score, budget, sensitivity):
    """Compute the unnormalised weight ``exp(budget * score / (2 * sensitivity))``.

    Args:
        score: Score of the candidate output.
        budget: Budget multiplying the score.
        sensitivity: Sensitivity dividing the score; must be non-zero.

    Returns:
        float: The exponential weight of the candidate.
    """
    numerator = budget * score
    denominator = 2 * sensitivity
    return math.exp(numerator / denominator)

In [18]:
def calculateOutputProbabilityQuery1(dataset, budget, sensitivity):
    """Turn the query-1 scores of every candidate into selection probabilities.

    Each record is treated as a candidate output. Its score is the maximum of
    ``scoreFunctionQuery1`` for its name (index 1), weighted through
    ``calculateExp`` and normalised by the sum of all weights.

    Args:
        dataset: Records laid out as ``[score, name, ...]``.
        budget: Budget forwarded to ``calculateExp``.
        sensitivity: Sensitivity forwarded to ``calculateExp``.

    Returns:
        list: One probability per record, in dataset order.
    """
    weights = [
        calculateExp(np.max(scoreFunctionQuery1(dataset, candidate[1])), budget, sensitivity)
        for candidate in dataset
    ]
    normalizer = np.sum(weights)
    return [weight / normalizer for weight in weights]

In [19]:
def calculateOutputProbabilityQuery2(dataset, language, budget, sensitivity):
    """Turn the query-2 scores of every candidate into selection probabilities.

    Each record is treated as a candidate output. Its score is the maximum of
    ``scoreFunctionQuery2`` for its name (index 1), weighted through
    ``calculateExp`` with the sensitivity of the given language and
    normalised by the sum of all weights.

    Args:
        dataset: Records laid out as ``[score, name, ...]``.
        language: Key used to look up the sensitivity.
        budget: Budget forwarded to ``calculateExp``.
        sensitivity: Mapping from language to its sensitivity.

    Returns:
        list: One probability per record, in dataset order.
    """
    weights = []
    for candidate in dataset:
        bestScore = np.max(scoreFunctionQuery2(dataset, candidate[1]))
        weights.append(calculateExp(bestScore, budget, sensitivity[language]))

    total = np.sum(weights)
    return [weight / total for weight in weights]

In [20]:
def calculateOutputProbabilityQuery3(dataset, budget, sensitivity):
    """Turn the query-3 scores of every candidate into selection probabilities.

    Each record is treated as a candidate output. Its score is the maximum of
    ``scoreFunctionQuery3`` for its name (index 0), weighted through
    ``calculateExp`` and normalised by the sum of all weights.

    Args:
        dataset: Records laid out as ``[name, score]``.
        budget: Budget forwarded to ``calculateExp``.
        sensitivity: Sensitivity forwarded to ``calculateExp``.

    Returns:
        list: One probability per record, in dataset order.
    """
    def weightOf(candidate):
        bestScore = np.max(scoreFunctionQuery3(dataset, candidate[0]))
        return calculateExp(bestScore, budget, sensitivity)

    weights = [weightOf(candidate) for candidate in dataset]
    total = np.sum(weights)
    return list(map(lambda weight: weight / total, weights))

In [21]:
def randomizedQuery1(dataset, budget, sensitivity):
    """Draw one record at random using the query-1 output probabilities.

    Args:
        dataset: 2-D array of candidate records.
        budget: Budget forwarded to ``calculateOutputProbabilityQuery1``.
        sensitivity: Sensitivity forwarded to ``calculateOutputProbabilityQuery1``.

    Returns:
        numpy.ndarray: 2-D array with a single row, the sampled record.
    """
    probabilities = calculateOutputProbabilityQuery1(dataset, budget, sensitivity)
    candidateIndices = np.array(range(len(dataset)))
    sampler = rv_discrete(values=(candidateIndices, probabilities))
    drawn = sampler.rvs(size=1)
    return dataset[drawn, :]

In [22]:
def randomizedQuery2(dataset, budget, sensitivity):
    """Draw one record at random for every group returned by ``splitDataset``.

    Args:
        dataset: Records accepted by ``splitDataset``.
        budget: Budget forwarded to ``calculateOutputProbabilityQuery2``.
        sensitivity: Per-group sensitivities forwarded to
            ``calculateOutputProbabilityQuery2``.

    Returns:
        dict: Maps each group key to a 2-D single-row array holding the
        record sampled for that group.
    """
    grossPerLanguage = {}

    for language, records in splitDataset(dataset).items():
        languageRows = np.array(list(records))
        probabilities = calculateOutputProbabilityQuery2(
            languageRows, language, budget, sensitivity
        )
        candidateIndices = np.array(range(len(languageRows)))
        sampler = rv_discrete(values=(candidateIndices, probabilities))
        grossPerLanguage[language] = languageRows[sampler.rvs(size=1), :]

    return grossPerLanguage

In [23]:
def randomizedQuery3(dataset, budget, sensitivity, topK=3):
    """Randomly draw a ranked list of records without replacement.

    The budget is divided evenly between the draws. After each draw the
    selected record is removed, so it cannot be picked again.

    Args:
        dataset: 2-D array of candidate records.
        budget: Total budget, split into ``topK`` equal shares.
        sensitivity: Sensitivity forwarded to ``calculateOutputProbabilityQuery3``.
        topK: Number of records to draw; defaults to 3. Fewer records are
            returned when the dataset has fewer than ``topK`` rows.

    Returns:
        list: The sampled records in draw order, each a 2-D single-row array.

    Raises:
        ValueError: If ``topK`` is smaller than 1.
    """
    if topK < 1:
        raise ValueError('topK must be a positive integer')

    # The draw count and the budget share come from the same parameter so
    # they can never drift apart.
    budgetPerDraw = budget / topK

    rankedList = []
    remaining = dataset

    # Never attempt more draws than there are candidates left to sample.
    for _ in range(min(topK, len(remaining))):
        probabilities = calculateOutputProbabilityQuery3(remaining, budgetPerDraw, sensitivity)
        sampler = rv_discrete(values=(np.arange(len(remaining)), probabilities))
        chosen = sampler.rvs(size=1)
        rankedList.append(remaining[chosen, :])

        # Sampling is without replacement: drop the chosen row before the next draw.
        remaining = np.delete(remaining, chosen, axis=0)

    return rankedList

In [24]:
import csv

# Module-level lists meant to accumulate per-budget results; main() currently
# refers to them only from commented-out code.
listabudget = list()
listaR1 = list()
listaR2 = list()
listaR3 = list()
listaS1 = list()
listaS2 = list()
listaS3 = list()


def main(filename, budgets):
    """Run the three queries and their randomized versions for each budget.

    The exact answers and sensitivities are computed once. For every budget
    the randomized queries are sampled, a report is printed, and one
    semicolon-separated line is appended to ``result.csv`` (the file is
    overwritten at the start of each call).

    Args:
        filename: Path of the CSV dataset handed to ``loadDataset``.
        budgets: Iterable of budget values to evaluate.
    """
    # The context manager closes result.csv even if a query raises midway,
    # so the handle is never leaked and buffered lines are flushed.
    with open("result.csv", 'w') as resultado:
        dataset = loadDataset(filename)

        datasetQuery1 = possibleValues(dataset, 1)
        datasetQuery2 = possibleValues(dataset, 2)
        datasetQuery3 = clearingDatasetQuery3(dataset)

        q1 = query1(datasetQuery1)
        q2 = query2(datasetQuery2)
        q3 = query3(datasetQuery3)

        sensQ1 = sensitivityQuery1(datasetQuery1)
        sensQ2 = sensitivityQuery2(datasetQuery2)
        sensQ3 = sensitivityQuery3(datasetQuery3)

        for budget in budgets:

            randomized1 = randomizedQuery1(datasetQuery1, budget, sensQ1)
            randomized2 = randomizedQuery2(datasetQuery2, budget, sensQ2)
            randomized3 = randomizedQuery3(datasetQuery3, budget, sensQ3)

            print('Budget {} \n'.format(budget))

            print('Original query 1 = {} '.format(q1[0]))
            print('Randomized query 1 = {} '.format(randomized1[0][1]))
            print('Sensitivity query 1 = {} '.format(sensQ1))
            print('\n')

            print('Original query 2 = {} '.format(q2))
            print('Randomized query 2 = {} '.format(randomized2))
            print('Sensitivity query 2 = {} '.format(sensQ2))
            print('\n')

            print('Original query 3 = {} '.format(q3))
            print('Randomized query 3 = {} '.format(randomized3))
            print('Sensitivity query 3 = {} '.format(sensQ3))
            print('\n')

            # Flatten the per-language samples into one comma-terminated string.
            resp2String = "".join(str(sample) + "," for sample in randomized2.values())

            print(randomized3)
            # Each ranked entry is a single-row 2-D array; keep just that row.
            resp3String = "".join(str(entry[0]) + "," for entry in randomized3)

            resultado.write("{};{};{};{};{};{};{}\n".format(budget,randomized1,resp2String,resp3String,sensQ1,sensQ2,sensQ3))

In [25]:
# Evaluate every query on the movie dataset for three budget values.
main(filename='movie_metadata.csv', budgets=[0.1, 1, 10])

Budget 0.1 

Original query 1 = 760505847.0 
Randomized query 1 = The Musketeer  
Sensitivity query 1 = 760505847.0 


Original query 2 = {'English': (760505847.0, 'Avatar\xa0'), 'Mandarin': (128067808.0, 'Crouching Tiger, Hidden Dragon\xa0'), 'Aboriginal': (72515360.0, 'The Interpreter\xa0'), 'Spanish': (45356386.0, 'The Legend of Zorro\xa0'), 'French': (77413017.0, 'March of the Penguins\xa0'), 'Filipino': (10166502.0, 'The Great Raid\xa0'), 'Hindi': (13876974.0, 'Monsoon Wedding\xa0'), 'Maya': (50859889.0, 'Apocalypto\xa0'), 'Kazakh': (77231.0, 'Nomad: The Warrior\xa0'), 'Telugu': (6498000.0, 'Baahubali: The Beginning\xa0'), 'Cantonese': (32333860.0, 'Rumble in the Bronx\xa0'), 'Japanese': (15081783.0, 'Ponyo\xa0'), 'Aramaic': (499263.0, 'The Passion of the Christ\xa0'), 'Italian': (15091542.0, 'The Adventures of Pinocchio\xa0'), 'Dutch': (4398392.0, 'Black Book\xa0'), 'Dari': (15797907.0, 'The Kite Runner\xa0'), 'German': (11433134.0, 'Das Boot\xa0'), 'Hebrew': (2408553.0, 'The Gat

Budget 10 

Original query 1 = 760505847.0 
Randomized query 1 = Scary Movie 2  
Sensitivity query 1 = 760505847.0 


Original query 2 = {'English': (760505847.0, 'Avatar\xa0'), 'Mandarin': (128067808.0, 'Crouching Tiger, Hidden Dragon\xa0'), 'Aboriginal': (72515360.0, 'The Interpreter\xa0'), 'Spanish': (45356386.0, 'The Legend of Zorro\xa0'), 'French': (77413017.0, 'March of the Penguins\xa0'), 'Filipino': (10166502.0, 'The Great Raid\xa0'), 'Hindi': (13876974.0, 'Monsoon Wedding\xa0'), 'Maya': (50859889.0, 'Apocalypto\xa0'), 'Kazakh': (77231.0, 'Nomad: The Warrior\xa0'), 'Telugu': (6498000.0, 'Baahubali: The Beginning\xa0'), 'Cantonese': (32333860.0, 'Rumble in the Bronx\xa0'), 'Japanese': (15081783.0, 'Ponyo\xa0'), 'Aramaic': (499263.0, 'The Passion of the Christ\xa0'), 'Italian': (15091542.0, 'The Adventures of Pinocchio\xa0'), 'Dutch': (4398392.0, 'Black Book\xa0'), 'Dari': (15797907.0, 'The Kite Runner\xa0'), 'German': (11433134.0, 'Das Boot\xa0'), 'Hebrew': (2408553.0, 'The Gate