In [1]:
## Imports
import pandas as pd
import numpy as np
import networkx as nx
import pickle

In [2]:
## Read data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [13]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
## Genre similarity function
def genre_simi(a, b):
        overlap = 0
        for gens in a:
            if gens in b:
                overlap += 1
        all_included = len(np.unique(np.array(a + b)))
        sim = overlap / all_included

        return sim

In [16]:
## Movie relation graph
mov_graph = nx.Graph(name='Movie Similarity Graph')
for i in range(len(movies)):
    gen = movies.iloc[i, 2]
    gen_type = gen.split('|')
    for j in range(i + 1, len(movies)):
        genn = movies.iloc[j, 2]
        genn_type = genn.split('|')
        sim = genre_simi(gen_type, genn_type)
        if sim != 0.0:
            mov_graph.add_edge(movies.iloc[i, 0], movies.iloc[j, 0], genre_similarity=sim)

print(mov_graph)

Graph named 'Movie Similarity Graph' with 9742 nodes and 21194805 edges


In [28]:
## Save the graph to local
pickle.dump(mov_graph, open('Movie_Similarity_Graph.pickle', 'wb'))

In [4]:
movi_graph = pickle.load(open('Movie_Similarity_Graph.pickle', 'rb'))

In [None]:
## Weighted PageRank
pr = nx.pagerank(movi_graph, alpha=0.85, personalization=person)

In [11]:
## Similarity calculation function
def eud(base, x, y):
    x_r = base.loc[x].values
    y_r = base.loc[y].values
    eudist = 1 / (1 + np.linalg.norm(x_r, y_r))
    return eudist

In [12]:
## NaN detection function
def de_nan(base, r):
    ori = base.loc[r].index.to_list()
    cle = base.loc[r].dropna().index.to_list()
    for dex in cle:
        ori.remove(dex)
    return ori

In [13]:
## Reconstruction function
def recons(new_data, template):
    temp = template
    for u in range(len(new_data)):
        for i in range(len(new_data[u][0])):
            temp.at[u+1, new_data[u][0][i]] = new_data[u][1][i]
    return temp

In [18]:
## Construct the rating map
user = ratings['userId'].unique()
r_map = pd.DataFrame(columns=movies['movieId'].to_list(), index=user)
for i in range(len(ratings)):
    ind = ratings['userId'][i]
    col = ratings['movieId'][i]
    r_map.at[ind, col] = ratings['rating'][i]
r_map = r_map.fillna(0)

r_map.head().iloc[:, :15]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
## Construct similarity graph
W = pd.DataFrame(columns=user, index=user)
for u in user:
    for v in user:
        if u == v:
            W.at[u, v] = 0
        else:
            W.at[u, v] = round(eud(r_map, u, v), 3)
W.head().iloc[:, :10]
W.to_csv('user_similarity_matrix', sep=',')

KeyboardInterrupt: 

In [11]:
## Weight normalization
nor_W = W
for id, rows in nor_W.iterrows():
    summ = sum(rows)
    for z in range(len(rows)):
        ori = rows.at[z+1]
        rows.at[z+1] = ori/summ
nor_W.head().iloc[:, :10]
nor_W.to_csv('user_similarity_matrix_normalized', sep=',')

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,0.0,0.003265,0.00086,0.000647,0.001664,0.00082,0.00097,0.001664,0.003944,0.000891
2,0.001239,0.0,0.002992,0.000664,0.002992,0.000925,0.000595,0.000996,0.002992,0.000503
3,0.000364,0.003343,0.0,0.000608,0.000608,0.000558,0.003343,0.000608,0.003343,0.003343
4,0.00077,0.002084,0.001708,0.0,0.001286,0.000995,0.001417,0.001474,0.003126,0.001389
5,0.00099,0.004691,0.000854,0.000643,0.0,0.000502,0.001079,0.000657,0.004691,0.000933


In [None]:
## Nearest-neighbor collaborative filtering
new_rmatrix = []
for u in range(len(r_map)):
    row = r_map.loc[u+1].dropna()
    simi = W.loc[u+1].sort_values(ascending=False).index.to_list()
    avg_x = sum(row) / len(row)
    unk = de_nan(r_map, u+1)
    new_row = []  # The estimated rated row

    for va in unk:
        nei = 0
        nbs = []
        nbs_sum = 0

        while len(nbs) < 20 and nei < len(simi):
            if np.isnan(r_map.loc[simi[nei], va]):
                pass
            else:
                nbs.append((simi[nei], r_map.loc[simi[nei], va]))
                nbs_sum += r_map.loc[simi[nei], va]
            nei += 1
        temp = 0

        for b in range(len(nbs)):
            cur_avg = sum(r_map.loc[nbs[b][0]].dropna()) / len(r_map.loc[nbs[b][0]].dropna())
            temp += (nbs[b][1] - cur_avg) * W.loc[u+1, nbs[b][0]] / nbs_sum
        new_row.append(avg_x + temp)
    new_rmatrix.append((unk, new_row))
    #print('\r', u, end='', flush=True)

new_map = recons(new_rmatrix, r_map)
new_map.to_csv('new_map', sep=',')
print(new_map.head().iloc[:, :10])