In [1]:
import pandas as pd

DBS_PATH = 'anon_data'

def save_solution(file_path, movies):
    """Write every entry of `movies` to `file_path`, one entry per line.

    The file is opened in write mode, so existing content is replaced.
    Entries are converted with `str()` and terminated by a newline.
    """
    with open(file_path, 'w') as out_file:
        out_file.writelines(str(title) + '\n' for title in movies)
            
def load_oracle(file_path):
    """Read the expected answer from `file_path`.

    Returns the lines of the file, each stripped of surrounding whitespace,
    as a list sorted in ascending order.
    """
    with open(file_path, 'r') as oracle_file:
        titles = [line.strip() for line in oracle_file]
    titles.sort()
    return titles

def check_and_save_solution(solution, oracle_file_path, save_file_path):
    """Verify `solution` against the oracle file and persist it on success.

    The solution is sorted and printed, then compared with the sorted lines
    of `oracle_file_path`. On a mismatch an AssertionError ('Incorrect
    solution') is raised and nothing is written; otherwise the sorted
    solution is saved to `save_file_path`.
    """
    ordered_solution = sorted(solution)
    print('Movies rated by the target:', ', '.join(ordered_solution))
    expected = load_oracle(oracle_file_path)
    print('\n')
    assert expected == ordered_solution, 'Incorrect solution'
    print('Solution found successfully!')
    save_solution(save_file_path, ordered_solution)

*Disclaimer: Variable names are reused across the solutions below, so run the cells of one solution in order, from top to bottom, without running cells of another solution in between!*

# Exercise 1.1

In [2]:
# Email address whose anonymized ratings are to be recovered.
target_email = 'donald.trump@whitehouse.gov'

# header=None: the first row of each file is read as data; column names are
# supplied explicitly.
local_db = pd.read_csv(
    f'{DBS_PATH}/com402-1.csv',
    header=None,
    names=['email_hash', 'movie_hash', 'date', 'rating'],
)
public_db = pd.read_csv(
    f'{DBS_PATH}/imdb-1.csv',
    header=None,
    names=['email', 'movie', 'date', 'rating'],
)

In [3]:
# Join the two databases on the columns they share: every pair of an
# anonymized row and a public row with the same date and rating is a
# candidate correspondence.
correlations = pd.merge(local_db, public_db, how='inner', on=['date', 'rating'])

movie_list = correlations['movie'].unique()
movie_hashes = {}

# For each public title keep the hash it co-occurs with most often.
# Counting is done once per title with grouped row counts instead of
# re-filtering the whole merged frame for every (title, hash) pair.
# sort=False keeps groups in order of first appearance and idxmax returns the
# first maximum, so ties resolve to the earliest-seen hash.
for movie, movie_rows in correlations.groupby('movie', sort=False):
    hash_counts = movie_rows.groupby('movie_hash', sort=False).size()
    # '' is the fallback when a title has no usable hash at all.
    movie_hashes[movie] = hash_counts.idxmax() if len(hash_counts) > 0 else ''

In [4]:
# Ratings the target published in the public database.
target_public_ratings = public_db[public_db['email'] == target_email]
# Anonymized rows that share a (date, rating) pair with those public ratings.
matching_local_rows = pd.merge(local_db, target_public_ratings, how='inner', on=['date', 'rating'])
target_hash_arr = matching_local_rows['email_hash'].unique()

# Exactly one anonymized user must match; that user's hash is the target's.
assert len(target_hash_arr) == 1
target_hash = target_hash_arr[0]

# Every movie hash rated by the target, translated back through the
# inverted movie -> hash dictionary.
rated_movies = local_db.loc[local_db['email_hash'] == target_hash, 'movie_hash'].unique().tolist()
reversed_movie_hashes = {movie_hash: movie for movie, movie_hash in movie_hashes.items()}

rated_movies_cleartext = [reversed_movie_hashes[movie_hash] for movie_hash in rated_movies]

check_and_save_solution(rated_movies_cleartext, 'real_data/user-1.csv', 'real_data/solution-1.csv')

Movies rated by the target: 12 Angry Men, Modern Times, North by Northwest, Once Upon a Time in the West, Pulp Fiction, Raiders of the Lost Ark, Seven Samurai, The Dark Knight, The Shawshank Redemption, Tokyo Story


Solution found successfully!


# Exercise 1.2

In [5]:
# header=None: the first row of each file is read as data; column names are
# supplied explicitly.
local_db = pd.read_csv(
    f'{DBS_PATH}/com402-2.csv',
    header=None,
    names=['email_hash', 'movie_hash', 'date', 'rating'],
)
public_db = pd.read_csv(
    f'{DBS_PATH}/imdb-2.csv',
    header=None,
    names=['email', 'movie', 'date', 'rating'],
)

In [6]:
def normalize(col):
    """Standardize `col`: subtract its mean, then divide by its standard deviation."""
    centered = col - col.mean()
    return centered / col.std()

def minmax_normalize(col):
    """Rescale `col` by subtracting its minimum and dividing by (max - min)."""
    lowest = col.min()
    value_range = col.max() - lowest
    return (col - lowest) / value_range

In [7]:
# Number of ratings per anonymized movie, ordered by ascending count, with the
# counts then standardized by `normalize`.
local_movie_freq = (
    local_db
    .groupby(['movie_hash'])
    .size()
    .reset_index(name='freq')
    .sort_values(by='freq')
    .assign(freq=lambda counts: normalize(counts['freq']))
)

# Same table for the public database, keyed by the cleartext movie title.
public_movie_freq = (
    public_db
    .groupby(['movie'])
    .size()
    .reset_index(name='freq')
    .sort_values(by='freq')
    .assign(freq=lambda counts: normalize(counts['freq']))
)

In [8]:
# Pair the i-th anonymized movie with the i-th public movie, both taken in the
# row order of their frequency tables; zip stops at the shorter of the two.
local_hashes_in_order = list(local_movie_freq['movie_hash'].unique())
public_movies_in_order = list(public_movie_freq['movie'].unique())
movie_hash_correlation = list(zip(local_hashes_in_order, public_movies_in_order))

# Lookup tables in both directions.
movie_to_hash_mapping = {movie: movie_hash for movie_hash, movie in movie_hash_correlation}
hash_to_movie_mapping = dict(movie_hash_correlation)

In [9]:
TARGET_EMAIL = 'donald.trump@whitehouse.gov'
# Distinct titles the target rated in the public database.
target_public_rows = public_db[public_db['email'] == TARGET_EMAIL]
movies_rated_by_target = target_public_rows['movie'].unique()

# The same titles expressed as anonymized movie hashes.
rated_movie_hashes = {movie_to_hash_mapping[title] for title in movies_rated_by_target}

In [10]:
# An anonymized user is a candidate when the set of movie hashes they rated
# contains every hash the target is known to have rated publicly.
email_hashes = local_db['email_hash'].unique()
candidates = []
for email_hash in email_hashes:
    rows_of_user = local_db['email_hash'] == email_hash
    rated_movies_by_email_hash = set(local_db.loc[rows_of_user, 'movie_hash'].unique())
    if rated_movie_hashes <= rated_movies_by_email_hash:
        candidates.append(email_hash)
assert len(candidates) == 1, 'No unique candidate or none at all'
target_email_hash = candidates[0]

# Translate everything the identified user rated back to cleartext titles.
rows_of_target = local_db['email_hash'] == target_email_hash
local_rated_movies_hashes = local_db.loc[rows_of_target, 'movie_hash'].unique()
local_rated_movies = [hash_to_movie_mapping[movie_hash] for movie_hash in local_rated_movies_hashes]
rated_movies_cleartext = sorted(local_rated_movies)

check_and_save_solution(rated_movies_cleartext, 'real_data/user-2.csv', 'real_data/solution-2.csv')

Movies rated by the target: 12 Angry Men, Casablanca, Citizen Kane, Double Indemnity, Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb, Lawrence of Arabia, Modern Times, Psycho, Raiders of the Lost Ark, Rashomon, Schindler's List, Seven Samurai, Singin' in the Rain, Spirited Away, Star Wars: Episode V - The Empire Strikes Back, Sunset Blvd., Taxi Driver, The Dark Knight, The Godfather, The Godfather: Part II


Solution found successfully!


# Exercise 1.3

In [11]:
TARGET_EMAIL = 'donald.trump@whitehouse.gov'
# header=None: the first row of each file is read as data; column names are
# supplied explicitly.
local_db = pd.read_csv(
    f'{DBS_PATH}/com402-3.csv',
    header=None,
    names=['email_hash', 'movie_hash', 'date', 'rating'],
)
public_db = pd.read_csv(
    f'{DBS_PATH}/imdb-3.csv',
    header=None,
    names=['email', 'movie', 'date', 'rating'],
)

In [12]:
# Abandoned first attempt, kept commented out for reference: pair each public
# email with an anonymized email hash by testing whether the multiset of the
# email's ratings is contained in the multiset of the hash's ratings.
# Only the two live assignments below (`emails`, `email_hashes`) are executed.

# import collections

# class Multiset(object):
#     def __init__(self, iterable):
#         self.counter = collections.Counter(iterable)

#     def __repr__(self):
#         return str(self.counter)

#     def issubmultiset(self, other):
#         for k in self.counter.keys():
#             if k not in other.counter or self.counter[k] > other.counter[k]:
#                 return False
#         return True

# Distinct cleartext emails present in the public database.
emails = public_db['email'].unique()
# email_ratings = {
#     email: Multiset(list(public_db.loc[public_db['email'] == email]['rating']))\
#     for email in emails
# }

# Distinct anonymized email hashes present in the local database.
email_hashes = local_db['email_hash'].unique()
# email_hash_ratings = {
#     email_hash: Multiset(list(local_db.loc[local_db['email_hash'] == email_hash]['rating']))\
#     for email_hash in email_hashes
# }

# Ineffective for mapping emails to hashes: this way there are around 100 to
# 150 possible solutions for each email.
#for e, r in email_ratings.items():
#    k = 0
#    for eh, hr in email_hash_ratings.items():
#        if r.issubmultiset(hr):
#            k += 1
#    print(k)
#list(public_db.loc[public_db['email'] == TARGET_EMAIL]['date'])

In [13]:
# from datetime import datetime

# d1 = datetime.strptime('31/03/15', '%d/%m/%y')
# d2 = datetime.strptime('25/03/15', '%d/%m/%y')
# (d2 - d1).days

In [14]:
# Importing datetime.
from datetime import datetime

def date_comparator(d1, d2):
    """Return the signed difference `d1 - d2`, expressed in whole days."""
    delta = d1 - d2
    return delta.days

class ConditionedDateSet:
    """Sorted list of dates that can be matched approximately against another set.

    `entries` are date strings in '%d/%m/%y' format; they are parsed and kept
    sorted in `self.data`. `comparator(a, b)` must return the signed distance
    from `b` to `a` (positive when `a` is later than `b`).
    """

    # Default tolerance used by `check_overlaps_left`, in comparator units.
    # The original author noted this value "might be +- 1 off".
    MAX_POSSIBLE_DISTANCE = 57

    def __init__(self, entries, comparator):
        self.data = sorted(datetime.strptime(entry, '%d/%m/%y') for entry in entries)
        self.comparator = comparator

    def __repr__(self):
        return str(self.data)

    def check_overlaps_left(self, other, max_distance=None):
        """Find, for every date in this set, the nearby dates in `other`.

        A date of `other` is "nearby" when it lies within `max_distance` of
        the date of this set in either direction, as measured by the
        comparator.

        Args:
            other: another ConditionedDateSet to match against.
            max_distance: tolerance to use; defaults to
                `MAX_POSSIBLE_DISTANCE` when omitted.

        Returns:
            A list with one entry per date of this set (in sorted order),
            each entry being the non-empty list of nearby dates from `other`;
            or None as soon as some date of this set has no nearby date.
            An empty set yields an empty list.
        """
        if max_distance is None:
            max_distance = self.MAX_POSSIBLE_DISTANCE

        candidates = other.data
        candidate_count = len(candidates)
        window_start = 0
        result = []
        for own_date in self.data:
            # Both lists are sorted, so the window start only moves forward:
            # skip candidates that are too far in the past for this date.
            while (window_start < candidate_count
                   and self.comparator(own_date, candidates[window_start]) > max_distance):
                window_start += 1

            # Collect candidates until one is too far in the future.
            overlaps = []
            cursor = window_start
            while (cursor < candidate_count
                   and self.comparator(candidates[cursor], own_date) <= max_distance):
                overlaps.append(candidates[cursor])
                cursor += 1

            if not overlaps:
                return None
            result.append(overlaps)
        return result

    
# Rating dates of every public email and of every anonymized email hash.
email_dates = {
    email: ConditionedDateSet(
        public_db.loc[public_db['email'] == email]['date'],
        date_comparator)
    for email in emails
}

email_hashes_dates = {
    email_hash: ConditionedDateSet(
        local_db.loc[local_db['email_hash'] == email_hash]['date'],
        date_comparator)
    for email_hash in email_hashes
}

pairs = []

# Iterative elimination: in each pass, every email whose dates are compatible
# with exactly one remaining hash is paired with it, and both are removed from
# further consideration. Passes repeat until no hash is left.
while len(email_hashes_dates) > 0:
    found = []
    for email, cdset in email_dates.items():
        matches = []
        for email_hash, hashes_cdset in email_hashes_dates.items():
            if cdset.check_overlaps_left(hashes_cdset) is not None:
                matches.append(email_hash)
        if len(matches) == 0:
            print('Oh no!')
            raise ValueError('Algorithm failed')
        elif len(matches) == 1:
            # Remove the hash right away so later emails in this pass cannot
            # match it any more.
            del email_hashes_dates[matches[0]]
            found.append(email)
            pairs.append((email, matches[0]))
    if not found:
        # A pass without any new pair cannot be followed by a successful one
        # (nothing changed), so stop instead of looping forever.
        raise ValueError('Algorithm failed: no remaining email has a unique matching hash')
    # Emails are removed after the pass because email_dates is being iterated
    # over inside it.
    for email in found:
        del email_dates[email]

email_to_hash_mapping = { k: v for k, v in pairs }
hash_to_email_mapping = { k: v for v, k in pairs }

In [15]:
# For every public movie: the set of emails that rated it.
movie_raters = {
    movie: set(public_db.loc[public_db['movie'] == movie]['email'].unique())
    for movie in public_db['movie'].unique()
}

# For every anonymized movie: the set of raters, with each email hash
# translated to its email through the mapping recovered earlier.
hash_movie_raters = {
    hash_movie: {
        hash_to_email_mapping[email_hash]
        for email_hash in local_db.loc[local_db['movie_hash'] == hash_movie]['email_hash'].unique()
    }
    for hash_movie in local_db['movie_hash'].unique()
}


# Iterative elimination: in each pass, every movie whose public raters are a
# subset of the raters of exactly one remaining movie hash is paired with that
# hash, and both are removed. Passes repeat until no hash is left.
movie_hash_pairs = []
while len(hash_movie_raters) > 0:
    found = []
    for movie, mv in movie_raters.items():
        matches = []
        for movie_hash, mhv in hash_movie_raters.items():
            if mv.issubset(mhv):
                matches.append(movie_hash)
        assert len(matches) > 0, 'Algorithm faulty'
        if len(matches) == 1:
            found.append(movie)
            movie_hash_pairs.append((movie, matches[0]))
            del hash_movie_raters[matches[0]]
    if not found:
        # A pass without any new pair cannot be followed by a successful one
        # (nothing changed), so stop instead of looping forever.
        raise ValueError('Algorithm failed: no remaining movie has a unique matching hash')
    # Movies are removed after the pass because movie_raters is being iterated
    # over inside it.
    for movie in found:
        del movie_raters[movie]

movie_to_hash_mapping = { k: v for k, v in movie_hash_pairs }
hash_to_movie_mapping = { k: v for v, k in movie_hash_pairs }

In [16]:
# All distinct movie hashes rated by the anonymized user mapped to the target.
target_email_hash = email_to_hash_mapping[TARGET_EMAIL]
rows_of_target = local_db['email_hash'] == target_email_hash
movies_hashes_rated_by_the_target = local_db.loc[rows_of_target, 'movie_hash'].unique()

# Translate the hashes to cleartext titles and sort them.
movies_rated_by_the_target = [
    hash_to_movie_mapping[movie_hash]
    for movie_hash in movies_hashes_rated_by_the_target
]
movies_rated_by_the_target.sort()

check_and_save_solution(movies_rated_by_the_target,'real_data/user-3.csv', 'real_data/solution-3.csv')

Movies rated by the target: 12 Angry Men, A Clockwork Orange, A Hard Day's Night, A Man Escaped, A Separation, A Streetcar Named Desire, Amadeus, Amelie, American Beauty, Amores perros, Anatomy of a Murder, Before Sunrise, Being There, Bicycle Thieves, Blade Runner, Blue Velvet, Boyhood, Chungking Express, Cinema Paradiso, Citizen Kane, City Lights, City of God, Die Hard, Do the Right Thing, Double Indemnity, Fight Club, Finding Nemo, Frankenstein, Full Metal Jacket, Gone with the Wind, Goodfellas, Grave of the Fireflies, Hannah and Her Sisters, Harakiri, Harold and Maude, Heat, High Noon, Howl's Moving Castle, Ikiru, In the Mood for Love, Indiana Jones and the Last Crusade, La Dolce Vita, La Grande Illusion, La Haine, Laura, Le Samoura, Life of Brian, Los Olvidados, M, Memento, Metropolis, Modern Times, North by Northwest, Notorious, On the Waterfront, Persepolis, Persona, Princess Mononoke, Psycho, Rear Window, Rebecca, Repulsion, Reservoir Dogs, Rififi, Roman Holiday, Rope, Schindle