# Preparation

In [2]:
import pandas as pd
import numpy as np
import msgpack

In [107]:
# Load the reviewer and review records from their msgpack dumps.
# raw=True keeps msgpack strings as `bytes`: the cells below index the records
# with bytes keys (b'key', b'film', b'score', ...) and call .decode('utf-8') on
# the values. msgpack >= 1.0 defaults to raw=False, which would hand back `str`
# keys and make those lookups fail.
with open('reviewers.msgpack', 'rb') as reviewers_file:
    reviewers_data = msgpack.load(reviewers_file, raw=True)

with open('reviews.msgpack', 'rb') as reviews_file:
    reviews_data = msgpack.load(reviews_file, raw=True)

In [108]:
# Add the notebook author as one more individual (non-publication) reviewer record.
swarmer_profile = {
    b'is_publication': False,
    b'key': b'swarmer',
    b'name': b'Anton Barkovsky',
    b'publication_link': None,
    b'publication_title': None,
}
reviewers_data.append(swarmer_profile)

# The author's own ratings: film key -> integer score.
# Each entry is turned into a review record for reviewer 'swarmer' right below.
# NOTE(review): keys are presumably the same slugs the loaded reviews use for
# b'film', so they line up with existing films — verify against the data.
my_reviews = {
    'blade-runner-2049': 100,
    'baby-driver': 85,
    'dunkirk': 80,
    'loveless-2017': 95,
    'kiss-kiss-bang-bang': 80,
    'zero-dark-thirty': 85,
    'sicario': 100,
    'rogue-one': 90,
    'the-prestige': 90,
    'the-martian': 90,
    'the-big-lebowski': 90,
    'gran-torino': 90,
    'citizenfour': 90,
    'snowden': 80,
    'arrival': 80,
    'mulholland-dr': 80,
    'the-danish-girl': 70,
    'the-theory-of-everything': 80,
    'the-big-short': 90,
    'edge-of-tomorrow': 80,
    'carol': 90,
    'drive': 85,
    'warcraft': 80,
    'a-clockwork-orange': 80,
    'the-hateful-eight': 80,
    'apocalypse-now': 90,
    'the-descendants': 80,
    'the-social-network': 85,
    'star-wars-episode-vii---the-force-awakens': 80,
    'the-best-offer': 70,
    'in-the-loop': 80,
    'fight-club': 80,
    'batman-begins': 80,
    'the-fault-in-our-stars': 80,
    'the-spectacular-now': 70,
    'children-of-men': 90,
    'ex-machina': 90,
    'the-kings-speech': 90,
    'the-imitation-game': 80,
    'what-we-do-in-the-shadows': 80,
    'up-in-the-air': 70,
    'argo': 90,
    'interstellar': 85,
    'guardians-of-the-galaxy': 70,
    'inglourious-basterds': 80,
    'the-avengers-2012': 70,
    'serenity': 80,
    '5050': 70,
    'hot-fuzz': 90,
    'her': 90,
    'moon': 90,
    'about-time': 80,
    'the-hurt-locker': 100,
    'silver-linings-playbook': 80,
    'the-hunger-games-catching-fire': 80,
    'american-hustle': 70,
    'the-wolf-of-wall-street': 80,
    'dr-strangelove-or-how-i-learned-to-stop-worrying-and-love-the-bomb': 100,
    'blade-runner': 85,
    'the-perks-of-being-a-wallflower': 80,
    'the-lives-of-others': 100,
    'its-a-wonderful-life': 90,
    'the-dark-knight': 90,
    'pulp-fiction': 90,
    'star-wars-episode-iv---a-new-hope': 80,
    'the-godfather': 90,
    'inception': 100,
    'forrest-gump': 90,
    'star-wars-episode-vi---return-of-the-jedi': 80,
    'the-lord-of-the-rings-the-fellowship-of-the-ring': 70,
    'pirates-of-the-caribbean-the-curse-of-the-black-pearl': 80,
    'the-matrix': 90,
    'star-wars-episode-v---the-empire-strikes-back': 80,
    'gladiator': 100,
    'the-godfather-part-ii': 90,
    'black-swan': 80,
    'the-lord-of-the-rings-the-return-of-the-king': 70,
    'eternal-sunshine-of-the-spotless-mind': 80,
    'the-good-the-bad-and-the-ugly-re-release': 90,
    'the-lord-of-the-rings-the-two-towers': 70,
    'amelie': 90,
}
# Turn each personal rating into a bytes-keyed review record for reviewer
# 'swarmer' (film key and score UTF-8 encoded, all other fields None) and add it
# to reviews_data.
for film_key, film_score in my_reviews.items():
    reviews_data.append({
        b'date': None,
        b'film': film_key.encode('utf-8'),
        b'movie_link': None,
        b'movie_title': None,
        b'pub_title': None,
        b'review_link': None,
        b'reviewer': b'swarmer',
        b'score': str(film_score).encode('utf-8'),
    })

In [113]:
# Sorted, de-duplicated keys of all individual reviewers (publication entries are skipped).
reviewers = sorted({
    entry[b'key'].decode('utf-8')
    for entry in reviewers_data
    if not entry[b'is_publication']
})
# Reviewer key -> position in `reviewers`.
reviewers_index = dict(zip(reviewers, range(len(reviewers))))

In [114]:
# Position of the author's own reviewer key ('swarmer') in `reviewers`.
swarmer_index = reviewers_index['swarmer']

In [115]:
# Sorted, de-duplicated film keys taken from every review record.
films = sorted({entry[b'film'].decode('utf-8') for entry in reviews_data})
# Film key -> position in `films`.
films_index = dict(zip(films, range(len(films))))

In [116]:
# Score matrix with one row per film and one column per reviewer,
# pre-filled with NaN so that cells without a review stay marked as missing.
matrix = np.full((len(films), len(reviewers)), np.nan)

In [117]:
# Count how often each distinct value occurs in the score matrix.
# Fixed: numpy is imported as `np` above; the bare name `numpy` is never bound,
# so the original call raised NameError.
# NOTE(review): this cell sits before the loop that fills `matrix`, so on a
# top-to-bottom run it only sees the NaN placeholder values — presumably it was
# meant to run after the fill; confirm and move if so.
vals, counts = np.unique(matrix, return_counts=True, axis=None)
#sorted(zip(vals, counts), reverse=True)[:40]

In [118]:
# Write each review's score into its (film row, reviewer column) cell.
# Reviews whose reviewer key is not in reviewers_index are skipped.
for entry in reviews_data:
    author = entry[b'reviewer'].decode('utf-8')
    col = reviewers_index.get(author)
    if col is None:
        continue

    row = films_index[entry[b'film'].decode('utf-8')]
    matrix[row, col] = float(entry[b'score'].decode('utf-8'))

In [119]:
# Wrap the score matrix in a DataFrame (rows = films, columns = reviewers, default integer labels).
matrix_df = pd.DataFrame(matrix)

# Similar reviewers

In [145]:
# Pairwise correlation between reviewer columns; min_periods=5 requires at least
# 5 films rated by both reviewers for a pair to get a value.
reviewer_correlation_matrix = matrix_df.corr(min_periods=5)

In [146]:
# The 10 highest correlations with the author's column; the author's own column
# is part of the result (it correlates with itself at 1.0).
reviewer_correlation_matrix[swarmer_index].nlargest(10)

784    1.000000
577    0.863433
381    0.836798
183    0.828850
469    0.738549
234    0.672206
795    0.661896
576    0.646781
363    0.641873
679    0.636128
Name: 784, dtype: float64

In [138]:
def common_films(rid1, rid2):
    """Print both reviewers' scores for every film that each of them has rated.

    rid1 and rid2 are column indices into the module-level ``matrix`` and
    positions in ``reviewers``. One line per shared film is printed, in row
    order, using the film key from ``films``. Nothing is returned.
    """
    # True for the rows where neither reviewer's score is NaN.
    rated_by_both = ~(np.isnan(matrix[:, rid1]) | np.isnan(matrix[:, rid2]))
    for row, shared in enumerate(rated_by_both):
        if shared:
            score1 = matrix[row, rid1]
            score2 = matrix[row, rid2]
            print(f'{films[row]}: {reviewers[rid1]}={score1}, {reviewers[rid2]}={score2}')

In [149]:
# Reviewer keys behind the three column indices ranked right after the author's
# own in the correlation output above.
reviewers[577], reviewers[381], reviewers[183]

('mike-dangelo', 'john-bleasdale', 'dave-calhoun')

In [155]:
# Compare the author's scores with mike-clark's on the films both have rated.
# The columns are resolved by reviewer key instead of the hard-coded positions
# (784, 576), which silently point at other reviewers as soon as the reviewer
# list changes.
common_films(swarmer_index, reviewers_index['mike-clark'])

batman-begins: swarmer=80.0, mike-clark=63.0
forrest-gump: swarmer=90.0, mike-clark=88.0
kiss-kiss-bang-bang: swarmer=80.0, mike-clark=63.0
pulp-fiction: swarmer=90.0, mike-clark=100.0
the-matrix: swarmer=90.0, mike-clark=63.0


# Similar films

In [156]:
# Transpose so that films become the columns, since DataFrame.corr correlates columns pairwise.
matrix_df_t = matrix_df.transpose()

In [236]:
# Pairwise correlation between film columns; min_periods=10 requires at least
# 10 reviewers in common for a pair to get a value.
film_correlation_matrix = matrix_df_t.corr(min_periods=10)

In [173]:
# Reshape the film correlation matrix into a Series with a two-level (film, film) index.
stacked = film_correlation_matrix.stack()

In [182]:
# 20 largest film-to-film correlations, ignoring values of exactly 1.0 (this
# removes each film's correlation with itself). Every pair shows up twice, as
# (a, b) and (b, a).
stacked[stacked != 1.0].nlargest(20)

4680   7697     0.884318
7697   4680     0.884318
7994   7995     0.871668
7995   7994     0.871668
7083   13930    0.845615
13930  7083     0.845615
10950  13299    0.830082
13299  10950    0.830082
146    7875     0.826628
7875   146      0.826628
4639   7083     0.826379
7083   4639     0.826379
4947   6154     0.818191
6154   4947     0.818191
1808   10103    0.809869
10103  1808     0.809869
5729   6125     0.804650
6125   5729     0.804650
1786   13234    0.803747
13234  1786     0.803747
dtype: float64

In [184]:
# Film keys for one of the highly correlated index pairs listed in the output above.
films[6125], films[5729]

('kill-bill-vol-2', 'iron-man')

In [185]:
# 20 smallest (most negative) film-to-film correlations; every pair shows up
# twice, as (a, b) and (b, a).
stacked[stacked != 1.0].nsmallest(20)

6581   13726   -0.762448
13726  6581    -0.762448
4697   11147   -0.762395
11147  4697    -0.762395
8123   12173   -0.728714
12173  8123    -0.728714
5837   10795   -0.720123
10795  5837    -0.720123
1950   10696   -0.695962
10696  1950    -0.695962
1808   5937    -0.670114
5937   1808    -0.670114
4789   9735    -0.663293
9735   4789    -0.663293
1159   5625    -0.663236
5625   1159    -0.663236
3727   5615    -0.642116
5615   3727    -0.642116
7186   11135   -0.639671
11135  7186    -0.639671
dtype: float64

In [188]:
# Film keys for one of the negatively correlated index pairs listed in the output above.
films[8123], films[12173]

('one-day-2011', 'the-master')

In [192]:
# Row positions of the films the author rated, in my_reviews order.
my_film_ids = [films_index[slug] for slug in my_reviews.keys()]
# Keep only those film rows, then flip so the films become columns for DataFrame.corr.
my_matrix_df_t = matrix_df.iloc[my_film_ids].T

In [218]:
# Film-to-film correlations restricted to the films the author rated
# (min_periods=10: at least 10 reviewers in common per pair).
my_film_correlation_matrix = my_matrix_df_t.corr(min_periods=10)
# NOTE: this rebinds `stacked`, replacing the Series built earlier from the
# all-films correlation matrix.
stacked = my_film_correlation_matrix.stack()

In [228]:
# 20 most negative correlations among the author's rated films (values of exactly 1.0 are ignored).
stacked[stacked != 1.0].nsmallest(20)

2519   3727    -0.887611
3727   2519    -0.887611
5213   1715    -0.778313
1715   5213    -0.778313
5583   7442    -0.701804
7442   5583    -0.701804
11239  5213    -0.646077
5213   11239   -0.646077
3498   8539    -0.614680
8539   3498    -0.614680
11681  3799    -0.606280
3799   11681   -0.606280
       5633    -0.596785
5633   3799    -0.596785
11189  1715    -0.570400
1715   11189   -0.570400
6192   3727    -0.563624
3727   6192    -0.563624
12908  14248   -0.554930
14248  12908   -0.554930
dtype: float64

In [234]:
# Film keys for two of the indices that appear in the output above.
films[5583], films[12908]

('in-the-loop', 'the-theory-of-everything')

In [237]:
# Highest correlations with film 5583 ('in-the-loop' per the lookup above) across
# all films, after dropping pairs without a value; nlargest() is called with its
# default count, and the film's own 1.0 entry is included.
film_correlation_matrix[5583].dropna().nlargest()

5583    1.000000
7134    0.857111
8310    0.856141
7875    0.851287
2438    0.837438
Name: 5583, dtype: float64

In [238]:
# Key of the film ranked right after film 5583 itself in the output above.
films[7134]

'me-and-orson-welles'