In [15]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import MDS, Isomap
from sklearn.decomposition import PCA
import pickle

In [16]:
df = pd.read_csv("data/IMDb title_principals.csv")

In [17]:
len(df)

835513

In [18]:
import os

def run_or_load_pickle(pickle_path: str):
    def decorator(fn):
        def wrapped_fn():
            if os.path.exists(pickle_path):
                with open(pickle_path, "rb") as fd:
                    return pickle.load(fd)
            else:
                res = fn()
                with open(pickle_path, "wb") as fd:
                    pickle.dump(res, fd)
                return res
        return wrapped_fn
    return decorator

In [19]:
film_to_id = {film: index for index, film in enumerate(set(df["imdb_title_id"]))}
person_to_id = {person: index for index, person in enumerate(set(df["imdb_name_id"]))}
print(f"Film-People correspondance matrix is: {len(film_to_id)} x {len(person_to_id)}")

Film-People correspondance matrix is: 85848 x 297706


In [21]:
rows = []
cols = []
data = []

for _, row in df.iterrows():
    title_id = film_to_id[row["imdb_title_id"]]
    person_id = person_to_id[row["imdb_name_id"]]
    rows.append(title_id)
    cols.append(person_id)
    data.append(1)

In [22]:
cc_matrix = csr_matrix((data, (rows, cols)), shape=(len(film_to_id), len(person_to_id)))
del rows
del cols
del data

In [23]:
@run_or_load_pickle("svd_matrix.pkl")
def construct_svd():
    svd_n_components = 300
    return TruncatedSVD(svd_n_components).fit_transform(cc_matrix)

svd_reduced_matrix = construct_svd()

In [24]:
svd_reduced_matrix.shape

(85848, 300)

In [25]:
# PCA
@run_or_load_pickle("pca_matrix_100.pkl")
def construct_pca_from_svd():
    pca = PCA(n_components=100)
    return pca.fit_transform(svd_reduced_matrix)

pca_reduced_matrix = construct_pca_from_svd()
id_to_film = {v: k for k, v in film_to_id.items()}

with open("pca_vectors", "w") as fd:
    for i, vector in enumerate(pca_reduced_matrix):
        fd.write(f"{id_to_film[i]}, {list(vector)}\n")

In [26]:
len(id_to_film)

85848

In [27]:
# MDS
@run_or_load_pickle("mds_matrix.pkl")
def construct_mds_matrix_from_svd():
    mds = MDS(n_components=50)
    return mds.fit_transform(svd_reduced_matrix)

mds_vectors = construct_mds_matrix_from_svd()

MemoryError: Unable to allocate 54.9 GiB for an array with shape (85848, 85848) and data type float64

In [28]:
def read_vectors(file: str):
    result = {}

    with open(file, "r") as fd:
        for line in fd:
            line = line.strip()
            film_id = line[:9]
            vector = np.array(eval(line[11:]))
            result[film_id] = vector
    return result

In [29]:
pca_vectors = read_vectors("pca_vectors")

In [30]:
ft_vectors = read_vectors("fasttext-vecs")

In [31]:
len(pca_vectors)

85831

In [32]:
len(ft_vectors)

83726

In [34]:
min_pca_dist = 100
max_pca_dist = -1

random_key = "tt0000009"
dist_sum = 0

for k, v in pca_vectors.items():
    if k == random_key:
        continue
    dist = np.linalg.norm(5 * pca_vectors[random_key] - 5 * v)
    dist_sum += dist
    
    if dist > max_pca_dist:
        max_pca_dist = dist
    elif dist < min_pca_dist:
        min_pca_dist = dist

(min_pca_dist, max_pca_dist, dist_sum / len(pca_vectors.keys()))

(0.19900813544405369, 14.385001867252393, 1.3503293371954248)

In [35]:
np.linalg.norm(ft_vectors["tt0000009"] - ft_vectors["tt0006688"])

2.665364904948882

min, max, mean  
pca_vectors - (0.21711308040945065, 2.899750199701518, 0.44720423651610886)  
fasttext_vectors - (1.3741962733866349, 10.816168600184021, 2.694827573044851)

In [36]:
common_keys = set(ft_vectors.keys()) & set(pca_vectors.keys())

In [40]:
concated_vectors = {k: np.concatenate((ft_vectors[k], pca_vectors[k] * 7)) for k in common_keys}
print(list(concated_vectors.values())[0])

with open("concated_vectors", "w") as fd:
    for i, vector in enumerate(pca_reduced_matrix):
        fd.write(f"{id_to_film[i]}, {list(vector)}\n")

[ 2.92618720e-01  1.98684250e-01 -3.11834370e-01 -3.03028020e-01
  3.35028800e-01  2.63366640e-01 -1.54859570e-01  2.31655080e-01
  1.77757040e-01 -2.40785030e-01  7.17355400e-02 -3.27741130e-02
 -1.83749880e-01 -1.50687350e-01 -1.05583206e-01  7.09494050e-02
 -2.14024860e-02 -3.27468520e-03 -5.88776700e-03  9.45766700e-02
 -2.82795100e-01 -4.72205300e-03 -1.92129920e-01 -2.16942270e-01
 -4.41602800e-01 -2.00733960e-01 -6.92403900e-02  2.03183380e-01
  1.22237390e-02  2.07359570e-01 -2.00125950e-01 -4.52278620e-05
  2.59832920e-01 -1.00132440e-01  4.09158860e-01  1.29064170e-01
  1.84345890e-01 -3.31474650e-02  2.14008780e-01  1.98372480e-01
 -8.18588400e-02 -1.32584570e-01  1.25983950e-01  2.24660260e-01
 -9.02927740e-02 -2.19351320e-01  1.46017490e-01 -1.16165526e-01
  6.04518840e-02 -1.49927530e-01  2.84777020e-01  4.70613400e-01
 -3.35367140e-01  4.56147270e-02  1.60021100e-01 -1.12727920e-01
 -6.04105670e-02 -6.62006500e-02  1.36781450e-01  4.38758050e-01
 -1.52980720e-01 -1.08504

In [247]:
cc_film_ids = []
cc_film_vecs = []

for k, v in concated_vectors.items():
    cc_film_ids.append(k)
    cc_film_vecs.append(v)

cc_film_vecs = np.array(cc_film_vecs)

In [248]:
name_to_id = {}
id_to_name = {}
id_to_ind = {film_id: i for i, film_id in enumerate(cc_film_ids)}

names_df = pd.read_csv("data/IMDb movies.csv")

for k, row in names_df.iterrows():
    id_to_name[row["imdb_title_id"]] = row["title"]
    name_to_id[row["title"]] = row["imdb_title_id"]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [249]:
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors=10)
neighbors.fit(cc_film_vecs)

NearestNeighbors(n_neighbors=10)

In [250]:
len(cc_film_vecs)

83717

In [251]:
dists, indexes = neighbors.kneighbors([cc_film_vecs[id_to_ind["tt0322259"]]])
indexes

array([[41189, 71455, 77521, 50525, 48889, 42029, 17223, 24807,  3689,
        34138]], dtype=int64)

In [252]:
film_ids[10]

'tt0000574'

In [253]:
for index in indexes[0]:
    print(id_to_name[cc_film_ids[index]], cc_film_ids[index])

2 Fast 2 Furious tt0322259
L'ultimo appello tt0115862
Soldato sotto la pioggia tt0057517
Chang jiang tu tt5470448
Chocolate City: Vegas tt5485482
Settimo velo tt0038924
Un sudista del Nord tt0040825
Dropa tt1822395
Just Pals tt0011358
Homeless tt3140044


Interstellar
Beacon77
Gatti rossi in un labirinto di vetro
Khadak
A Perfect Man
Heavy Times
Pass Thru
South from Granada
Il silenzio
Atto d'amore  


