In [1]:
## Imports
import pandas as pd
import numpy as np
import networkx as nx
import pickle

In [2]:
## Read data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
## Construct the rating map
user = ratings['userId'].unique()
r_map_nan = pd.DataFrame(columns=movies['movieId'].to_list(), index=user)
for i in range(len(ratings)):
    ind = ratings['userId'][i]
    col = ratings['movieId'][i]
    r_map_nan.at[ind, col] = ratings['rating'][i]

r_map = r_map_nan.fillna(0)
r_map.head().iloc[:, :15]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test = r_map.iloc[:, :1000]
test_nan = r_map_nan.iloc[:, :1000]

In [20]:
## Similarity calculation function
def eud(base, x, y):
    x_r = base.loc[x].values
    y_r = base.loc[y].values
    eudist = 1 / (1 + np.linalg.norm(x_r - y_r))
    return eudist


## Construct similarity graph
def user_sim(r_matrix):
    W = pd.DataFrame(columns=user, index=user)
    for u in user:
        for v in user:
            if u == v:
                W.at[u, v] = 0
            else:
                W.at[u, v] = round(eud(r_matrix, u, v), 3)
    W.to_csv('user_similarity_matrix', sep=',')
    return W

user_sim(test)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,0,0.023,0.023,0.02,0.022,0.014,0.024,0.022,0.023,0.023,...,0.023,0.019,0.014,0.019,0.024,0.016,0.022,0.022,0.022,0.02
2,0.023,0,0.111,0.025,0.038,0.016,0.042,0.038,0.068,0.091,...,0.05,0.024,0.014,0.027,0.04,0.016,0.029,0.022,0.047,0.021
3,0.023,0.111,0,0.025,0.038,0.015,0.041,0.037,0.066,0.085,...,0.047,0.024,0.014,0.027,0.041,0.016,0.029,0.022,0.045,0.021
4,0.02,0.025,0.025,0,0.023,0.014,0.024,0.022,0.024,0.025,...,0.025,0.019,0.015,0.02,0.023,0.016,0.022,0.019,0.023,0.019
5,0.022,0.038,0.038,0.023,0,0.016,0.032,0.036,0.034,0.038,...,0.033,0.026,0.014,0.026,0.034,0.016,0.027,0.022,0.035,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.016,0.016,0.016,0.016,0.016,0.013,0.017,0.016,0.016,0.016,...,0.016,0.015,0.015,0.015,0.016,0,0.016,0.016,0.016,0.016
607,0.022,0.029,0.029,0.022,0.027,0.015,0.029,0.027,0.027,0.028,...,0.027,0.022,0.014,0.023,0.027,0.016,0,0.023,0.028,0.022
608,0.022,0.022,0.022,0.019,0.022,0.016,0.023,0.024,0.022,0.022,...,0.022,0.021,0.014,0.021,0.023,0.016,0.023,0,0.023,0.022
609,0.022,0.047,0.045,0.023,0.035,0.016,0.035,0.039,0.04,0.045,...,0.036,0.025,0.014,0.027,0.035,0.016,0.028,0.023,0,0.021


In [9]:
## Weight normalization
def norm_W(w):
    for id in range(len(w)):
        summ = sum(w.iloc[id, :])
        for z in range(len(w)):
            ori = w.iloc[id, :].iat[z]
            w.iloc[id, :].iat[z] = ori/summ
    w.to_csv('user_similarity_matrix_normalized', sep=',')
    return w

W = pd.read_csv('user_similarity_matrix', sep=',', index_col=0)
norm_w = norm_W(W)
norm_w.iloc[:, :10]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,0.000000,0.001711,0.001711,0.001488,0.001637,0.001042,0.001786,0.001637,0.001711,0.001711
2,0.000718,0.000000,0.003463,0.000780,0.001186,0.000499,0.001310,0.001186,0.002122,0.002839
3,0.000761,0.003674,0.000000,0.000828,0.001258,0.000497,0.001357,0.001225,0.002185,0.002814
4,0.001426,0.001783,0.001783,0.000000,0.001640,0.000998,0.001711,0.001569,0.001711,0.001783
5,0.001141,0.001970,0.001970,0.001193,0.000000,0.000830,0.001659,0.001867,0.001763,0.001970
...,...,...,...,...,...,...,...,...,...,...
606,0.001634,0.001634,0.001634,0.001634,0.001634,0.001327,0.001736,0.001634,0.001634,0.001634
607,0.001364,0.001798,0.001798,0.001364,0.001674,0.000930,0.001798,0.001674,0.001674,0.001736
608,0.001627,0.001627,0.001627,0.001405,0.001627,0.001183,0.001701,0.001775,0.001627,0.001627
609,0.001015,0.002169,0.002077,0.001061,0.001615,0.000738,0.001615,0.001800,0.001846,0.002077


In [6]:
## NaN detection function
def de_nan(base, rx, row):
    ori = base.iloc[rx].index.to_list()
    cle = row.index.to_list()
    for dex in cle:
        ori.remove(dex)
    return ori


## Reconstruction function
def recons(new_data, template):
    temp = template
    for u in range(len(new_data)):
        for i in range(len(new_data[u][0])):
            temp.at[u+1, new_data[u][0][i]] = new_data[u][1][i]
    return temp


## Nearest-neighbor collaborative filtering
def NNCF(rates, user_sim_m):
    new_map = rates  # Copy the unfinished rating map
    new_rates = []  # New rating collection

    for k in range(len(rates)):
        row = rates.iloc[k].dropna()
        simi = user_sim_m.iloc[k].sort_values(ascending=False).index.to_list()  # Sorted similar users
        if len(row) == 0:
            new_map.iloc[k, :] = 2.8 * np.ones(len(rates.iloc[k]))
            new_rates.append(rates.iloc[k].index.tolist())
        else:
            avg_x = sum(row) / len(row)
            unk = de_nan(rates, k, row)  # Unrated movies
            new_rates.append(unk)

            for va in unk:
                temp = 0
                count = 0
                nei = 0
                weight_sum = 0

                while count < 10 and nei < len(simi):
                    sco = rates.loc[int(simi[nei]), va]
                    if np.isnan(sco):
                        pass
                    else:
                        count += 1
                        weight_sum += user_sim_m.loc[k+1, simi[nei]]
                        neighbor = rates.loc[int(simi[nei])].dropna()
                        cur_avg = sum(neighbor) / len(neighbor)
                        temp += (sco - cur_avg) * user_sim_m.loc[k+1, simi[nei]]

                    nei += 1

                if weight_sum == 0:
                    point = round(avg_x, 1)
                else:
                    temp = temp / weight_sum
                    point = round(avg_x + temp, 1)

                new_map.loc[k+1, va] = point
            print('\r', k + 1, end='', flush=True)

    new_map.to_csv('new_map', sep=',')
    return new_map, new_rates


W = pd.read_csv('user_similarity_matrix', sep=',', index_col=0)
matrix, inds = NNCF(test_nan, W)
matrix.iloc[:, :15]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_map.loc[k+1, va] = point


 162

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_map.iloc[k, :] = 2.8 * np.ones(len(rates.iloc[k]))


 610

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,4.1,4.0,3.3,3.8,4.0,3.8,3.6,3.7,4.4,4.4,3.5,4.0,4.7,4.0
2,3.9,3.3,3.0,2.4,3.2,4.0,2.6,2.7,2.8,3.4,3.2,2.6,3.1,3.7,3.1
3,1.6,0.9,0.7,0.1,0.9,1.6,0.2,0.4,0.5,1.0,0.8,0.3,0.8,1.4,0.8
4,3.9,3.5,2.8,2.7,3.2,4.2,2.8,3.0,3.1,3.6,3.4,2.8,3.4,4.1,3.3
5,4.0,3.9,3.2,2.6,3.3,4.1,2.9,2.9,2.9,3.1,3.7,2.7,3.4,4.0,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,3.8,3.5,3.1,3.7,4.2,2.5,3.3,3.3,3.8,2.5,3.3,3.7,4.2,3.5
607,4.0,3.8,3.5,3.1,3.8,4.4,3.1,3.3,3.4,3.8,3.0,3.3,3.7,4.2,3.6
608,2.5,2.0,2.0,1.9,2.6,3.3,1.9,2.3,2.2,4.0,2.9,2.2,2.6,3.1,2.5
609,3.0,3.3,3.0,2.5,3.2,3.7,2.6,2.8,2.8,4.0,3.1,2.7,3.1,3.6,3.0


In [10]:
## Genre similarity function
def genre_simi(a, b):
        overlap = 0
        for gens in a:
            if gens in b:
                overlap += 1
        all_included = len(np.unique(np.array(a + b)))
        sim = overlap / all_included

        return sim

In [11]:
## Personalized PageRank
def Personalized_PageRank(ori_table, target_ind):
    rec = []  # Recommendation order
    for us in range(len(target_ind)):
        orig_sco = []  # Original Pr for Page Rank
        sco_dict = {}
        relation = nx.Graph()  # Temperory movie similarity graph

        for t in range(len(target_ind[us])):
            mov_id = int(target_ind[us][t])

            orig_sco.append(ori_table.loc[us+1, str(mov_id)])

            gen = movies.loc[mov_id, 'genres']
            gen_type = gen.split('|')
            for f in range(t + 1, len(target_ind[us])):
                genn = movies.loc[int(target_ind[us][f]), 'genres']
                genn_type = genn.split('|')
                sim = genre_simi(gen_type, genn_type)
                if sim != 0.0:
                    relation.add_edge(movies.loc[mov_id, 'movieId'], 
                                    movies.loc[int(target_ind[us][f]), 'movieId'], 
                                    weight=sim
                                    )
                    
        orig_sco = np.array(orig_sco)
        orig_sco = orig_sco / orig_sco.sum()
        for o in range(len(orig_sco)):
            sco_dict[int(target_ind[us][o])] = orig_sco[o]

        r = nx.pagerank(relation, alpha=0.85, personalization=sco_dict)  # Personalized PageRank
        sorted_r = sorted(r.items(), key=lambda y: y[1], reverse=True)
        rec.append(dict(sorted_r))

    return rec


## PageRank data preprocessing
filled_map = pd.read_csv('new_map', sep=',', index_col=0)

rec_rst = Personalized_PageRank(filled_map, inds)

In [12]:
rec_table = pd.DataFrame(index=user, columns=[top for top in range(1, 101)])
for i in range(len(user)):
    rec_table.iloc[i] = list(rec_rst[i].keys())[:100]

rec_table  # The top 100 recommended movies' id for Users

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
1,218,994,106,324,82,568,973,1300,583,984,...,944,1093,1132,1231,448,679,835,1051,1103,1207
2,1292,106,218,324,568,1300,82,345,583,984,...,321,944,1104,1193,1225,178,476,14,341,385
3,496,495,1236,148,1292,106,324,99,568,583,...,1189,617,639,765,45,319,195,359,926,1051
4,324,1292,218,568,82,583,984,1300,96,562,...,326,448,175,184,209,307,337,354,524,685
5,1292,82,106,324,568,583,1300,973,984,994,...,1051,1104,1193,1225,62,211,299,326,341,448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,106,568,96,218,322,430,562,272,635,72,...,43,219,254,510,636,116,162,581,213,501
607,1292,106,324,568,583,1300,82,96,218,345,...,959,1104,14,341,926,1051,1132,1193,1225,1246
608,99,77,108,363,602,246,1189,581,162,116,...,1051,1132,14,175,209,211,299,326,1225,1246
609,1292,106,324,568,1300,82,96,218,345,583,...,1104,1207,1225,1280,1050,211,326,341,926,1051


In [13]:
rec_table.to_csv('top-100 recommended', sep=',')

In [14]:
def list2txt(li):
    with open(r'pagerank_result.txt', 'w') as fp:
        for item in li:
            # write each item on a new line
            fp.write("%s\n" % str(item))
        print('Done')

list2txt(rec_rst)

Done
