# Preparation

In [2]:
import pandas as pd
import numpy as np
import msgpack

In [3]:
# The cells below index the loaded records with bytes keys (b'key', b'film',
# ...) and call .decode() on the values.  msgpack >= 1.0 defaults to raw=False,
# which hands back str instead of bytes for raw values and would make those
# lookups fail, so the bytes representation is requested explicitly.
# NOTE(review): the `raw` option requires msgpack >= 0.5.2.
with open('reviewers.msgpack', 'rb') as reviewers_file:
    reviewers_data = msgpack.load(reviewers_file, raw=True)

with open('reviews.msgpack', 'rb') as reviews_file:
    reviews_data = msgpack.load(reviews_file, raw=True)

In [4]:
# Extra reviewer entry for 'swarmer', flagged as an individual rather than a
# publication so it passes the `is_publication` filter applied when the
# reviewer list is built.
swarmer_record = {
    b'is_publication': False,
    b'key': b'swarmer',
    b'name': b'Anton Barkovsky',
    b'publication_link': None,
    b'publication_title': None,
}
reviewers_data.append(swarmer_record)

# Scores for the 'swarmer' reviewer, keyed by film key.  Each entry is turned
# into a review record right after this literal: the key becomes the record's
# b'film' value and the number its b'score' value.
# NOTE(review): presumably on the same 0-100 scale as the loaded reviews --
# confirm against the source data.
my_reviews = {
    'blade-runner-2049': 100,
    'baby-driver': 85,
    'dunkirk': 80,
    'loveless-2017': 95,
    'kiss-kiss-bang-bang': 80,
    'zero-dark-thirty': 85,
    'sicario': 100,
    'rogue-one': 90,
    'the-prestige': 90,
    'the-martian': 90,
    'the-big-lebowski': 90,
    'gran-torino': 90,
    'citizenfour': 90,
    'snowden': 80,
    'arrival': 80,
    'mulholland-dr': 80,
    'the-danish-girl': 70,
    'the-theory-of-everything': 80,
    'the-big-short': 90,
    'edge-of-tomorrow': 80,
    'carol': 90,
    'drive': 85,
    'warcraft': 80,
    'a-clockwork-orange': 80,
    'the-hateful-eight': 80,
    'apocalypse-now': 90,
    'the-descendants': 80,
    'the-social-network': 85,
    'star-wars-episode-vii---the-force-awakens': 80,
    'the-best-offer': 70,
    'in-the-loop': 80,
    'fight-club': 80,
    'batman-begins': 80,
    'the-fault-in-our-stars': 80,
    'the-spectacular-now': 70,
    'children-of-men': 90,
    'ex-machina': 90,
    'the-kings-speech': 90,
    'the-imitation-game': 80,
    'what-we-do-in-the-shadows': 80,
    'up-in-the-air': 70,
    'argo': 90,
    'interstellar': 85,
    'guardians-of-the-galaxy': 70,
    'inglourious-basterds': 80,
    'the-avengers-2012': 70,
    'serenity': 80,
    '5050': 70,
    'hot-fuzz': 90,
    'her': 90,
    'moon': 90,
    'about-time': 80,
    'the-hurt-locker': 100,
    'silver-linings-playbook': 80,
    'the-hunger-games-catching-fire': 80,
    'american-hustle': 70,
    'the-wolf-of-wall-street': 80,
    'dr-strangelove-or-how-i-learned-to-stop-worrying-and-love-the-bomb': 100,
    'blade-runner': 85,
    'the-perks-of-being-a-wallflower': 80,
    'the-lives-of-others': 100,
    'its-a-wonderful-life': 90,
    'the-dark-knight': 90,
    'pulp-fiction': 90,
    'star-wars-episode-iv---a-new-hope': 80,
    'the-godfather': 90,
    'inception': 100,
    'forrest-gump': 90,
    'star-wars-episode-vi---return-of-the-jedi': 80,
    'the-lord-of-the-rings-the-fellowship-of-the-ring': 70,
    'pirates-of-the-caribbean-the-curse-of-the-black-pearl': 80,
    'the-matrix': 90,
    'star-wars-episode-v---the-empire-strikes-back': 80,
    'gladiator': 100,
    'the-godfather-part-ii': 90,
    'black-swan': 80,
    'the-lord-of-the-rings-the-return-of-the-king': 70,
    'eternal-sunshine-of-the-spotless-mind': 80,
    'the-good-the-bad-and-the-ugly-re-release': 90,
    'the-lord-of-the-rings-the-two-towers': 70,
    'amelie': 90,
}
# Append one review record per personal score, in the bytes-keyed form that
# the matrix-filling cell reads (b'reviewer', b'film', b'score').  The film key
# and the score are stored as UTF-8 bytes; the remaining fields are None.
reviews_data.extend(
    {
        b'date': None,
        b'film': film_key.encode('utf-8'),
        b'movie_link': None,
        b'movie_title': None,
        b'pub_title': None,
        b'review_link': None,
        b'reviewer': b'swarmer',
        b'score': str(film_score).encode('utf-8'),
    }
    for film_key, film_score in my_reviews.items()
)

In [5]:
# Keys of all non-publication reviewers, de-duplicated and sorted so that each
# reviewer gets a stable position.
reviewers = sorted({
    entry[b'key'].decode('utf-8')
    for entry in reviewers_data
    if not entry[b'is_publication']
})
# Reverse lookup: reviewer key -> position in `reviewers` (used as the
# reviewer's column in the score matrix).
reviewers_index = dict(zip(reviewers, range(len(reviewers))))

In [6]:
# Position of the 'swarmer' reviewer in `reviewers`; raises KeyError if that
# key is missing from the index.
swarmer_index = reviewers_index['swarmer']

In [7]:
# Every distinct film key mentioned by any review, sorted so that each film
# gets a stable position.
films = sorted({entry[b'film'].decode('utf-8') for entry in reviews_data})
# Reverse lookup: film key -> position in `films` (used as the film's row in
# the score matrix).
films_index = dict(zip(films, range(len(films))))

In [8]:
# Score table with one row per film and one column per reviewer.  Every cell
# starts as NaN, which stands for "no score recorded".
matrix = np.full((len(films), len(reviewers)), np.nan)

In [9]:
# Distinct values in the score matrix and how many times each occurs;
# axis=None means the array is flattened before counting.  The call goes
# through the `np` alias because that is the only name numpy is imported under.
vals, counts = np.unique(matrix, return_counts=True, axis=None)

In [10]:
# Copy every review score into its (film row, reviewer column) cell.
for review in reviews_data:
    reviewer_key = review[b'reviewer'].decode('utf-8')
    # Reviews whose reviewer has no column (i.e. is not in `reviewers_index`)
    # are skipped.
    if reviewer_key in reviewers_index:
        film_row = films_index[review[b'film'].decode('utf-8')]
        reviewer_col = reviewers_index[reviewer_key]
        # Scores arrive as bytes, so decode before converting to float.
        matrix[film_row, reviewer_col] = float(review[b'score'].decode('utf-8'))

In [11]:
# Wrap the score matrix in a DataFrame.  No labels are passed, so rows and
# columns keep the default integer positions (film row / reviewer column).
matrix_df = pd.DataFrame(matrix)

# Similar reviewers

In [78]:
# Pairwise correlation between reviewer columns (DataFrame.corr defaults to
# Pearson).  min_periods=10: a pair of reviewers with fewer than 10 films
# scored by both gets NaN instead of a correlation.
reviewer_correlation_matrix = matrix_df.corr(min_periods=10)

In [79]:
# Correlations of every reviewer with 'swarmer', reduced to the ten largest.
# The column's own entry (swarmer vs. swarmer) is not excluded.
swarmer_corrs = reviewer_correlation_matrix[swarmer_index]
top_reviewer_corrs = swarmer_corrs.nlargest(10)
# Swap the integer column positions for reviewer keys so the output is readable.
top_reviewer_corrs.index = top_reviewer_corrs.index.map(lambda col: reviewers[col])
top_reviewer_corrs

swarmer                 1.000000
rick-groen              0.636128
carrie-rickey           0.579271
calvin-wilson           0.575007
dan-jolin               0.555556
justin-chang            0.434875
elizabeth-weitzman      0.432855
marjorie-baumgarten     0.408314
kyle-smith              0.408282
screen-international    0.406197
Name: 784, dtype: float64

In [80]:
def common_films(rkey1, rkey2):
    """Print the films that both reviewers have scored, one line per film.

    Reads the notebook-level `reviewers_index`, `reviewers`, `films` and
    `matrix`.  Each printed line has the form
    ``<film>: <reviewer1>=<score1>, <reviewer2>=<score2>``.

    Args:
        rkey1: Key of the first reviewer (must be in `reviewers_index`).
        rkey2: Key of the second reviewer (must be in `reviewers_index`).

    Raises:
        KeyError: If either key is not present in `reviewers_index`.
    """
    rid1 = reviewers_index[rkey1]
    rid2 = reviewers_index[rkey2]

    scores1 = matrix[:, rid1]
    scores2 = matrix[:, rid2]
    # NaN marks a missing score, so a film counts as common only when neither
    # reviewer's entry is NaN.
    scored_by_both = ~(np.isnan(scores1) | np.isnan(scores2))
    for row, is_common in enumerate(scored_by_both):
        if is_common:
            print(f'{films[row]}: {reviewers[rid1]}={scores1[row]}, {reviewers[rid2]}={scores2[row]}')

In [82]:
# List the films scored by both 'swarmer' and 'carrie-rickey', with each
# reviewer's score.
common_films('swarmer', 'carrie-rickey')

batman-begins: swarmer=80.0, carrie-rickey=63.0
drive: swarmer=85.0, carrie-rickey=75.0
gran-torino: swarmer=90.0, carrie-rickey=75.0
in-the-loop: swarmer=80.0, carrie-rickey=75.0
pirates-of-the-caribbean-the-curse-of-the-black-pearl: swarmer=80.0, carrie-rickey=75.0
the-dark-knight: swarmer=90.0, carrie-rickey=75.0
the-kings-speech: swarmer=90.0, carrie-rickey=100.0
the-lives-of-others: swarmer=100.0, carrie-rickey=100.0
the-lord-of-the-rings-the-two-towers: swarmer=70.0, carrie-rickey=75.0
the-perks-of-being-a-wallflower: swarmer=80.0, carrie-rickey=75.0
the-social-network: swarmer=85.0, carrie-rickey=100.0


# Similar films

In [18]:
# Transpose so films become the columns: DataFrame.corr correlates columns,
# and this section compares films rather than reviewers.
matrix_df_t = matrix_df.transpose()

In [72]:
# Pairwise correlation between film columns (DataFrame.corr defaults to
# Pearson).  min_periods=20: a pair of films with fewer than 20 reviewers in
# common gets NaN instead of a correlation.
film_correlation_matrix = matrix_df_t.corr(min_periods=20)

In [73]:
# Flatten the film x film correlation matrix into a Series whose MultiIndex
# holds (row film position, column film position) pairs.  Selecting the Series
# by its own complete index would only rebuild the same Series, so no such
# step is performed.
stacked = film_correlation_matrix.stack()

In [74]:
# Keep only pairs whose first film position is smaller than the second.  This
# drops each film paired with itself and keeps a single copy of every
# symmetric (a, b) / (b, a) pair.
first_film_pos = stacked.index.get_level_values(0)
second_film_pos = stacked.index.get_level_values(1)
stacked = stacked[first_film_pos < second_film_pos]

In [75]:
# Twenty highest film-to-film correlations, ignoring pairs whose correlation
# is exactly 1.0.
top_film_corrs = stacked[stacked != 1.0].nlargest(20)

In [76]:
# Replace each (position, position) label with the matching pair of film keys
# so the result can be read directly.
top_film_corrs.index = top_film_corrs.index.map(
    lambda pair: tuple(films[pos] for pos in pair)
)
top_film_corrs

gran-torino                                     mystic-river                                      0.884318
nymphomaniac-volume-i                           nymphomaniac-volume-two                           0.871668
master-and-commander-the-far-side-of-the-world  vera-drake                                        0.845615
the-bourne-legacy                               this-means-war                                    0.830082
4-months-3-weeks-and-2-days                     no-country-for-old-men                            0.826628
good-night-and-good-luck                        master-and-commander-the-far-side-of-the-world    0.826379
haywire                                         killing-them-softly                               0.818191
blue-jasmine                                    spring-breakers                                   0.809869
iron-man                                        kill-bill-vol-2                                   0.804650
blood-work                           

# Dissimilar films

In [77]:
# Twenty lowest (most negative) film-to-film correlations, drawn from the same
# pool as the top list: pairs with a correlation of exactly 1.0 are excluded.
not_perfectly_correlated = stacked != 1.0
bottom_film_corrs = stacked[not_perfectly_correlated].nsmallest(20)
# Replace each (position, position) label with the matching pair of film keys.
bottom_film_corrs.index = bottom_film_corrs.index.map(
    lambda pair: tuple(films[pos] for pos in pair)
)
bottom_film_corrs

lincoln                                unbroken                                             -0.762448
gravity                                the-counselor                                        -0.762395
one-day-2011                           the-master                                           -0.728714
jane-eyre                              the-avengers-2012                                    -0.720123
brave                                  the-adjustment-bureau                                -0.695962
blue-jasmine                           john-carter-of-mars                                  -0.670114
hacksaw-ridge                          silence                                              -0.663293
atonement                              infamous                                             -0.663236
eternal-sunshine-of-the-spotless-mind  indiana-jones-and-the-kingdom-of-the-crystal-skull   -0.642116
melancholia                            the-conspirator                            