In [3]:
import json
import numpy as np
import pandas as pd
import pickle
import copy

In [4]:
rdf = pd.read_csv('./ratings.dat', sep='::', names=["user_id", "item_id", "rating", "timestamp"])
rdf.drop(columns=['timestamp'], inplace=True)
# rdf = rdf.rename(columns={"userId": "user_id", "movieId": "item_id"})
rdf.head()

Unnamed: 0,user_id,item_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [8]:
item_df = pd.read_csv('./movies.dat', sep='::', names=['item_id', 'title', 'genres'],encoding="latin-1", engine='python')
# item_df = item_df.rename(columns={"movieId": "item_id"})
item_df.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
item_genre_dict = dict()
for i in range(len(item_df)):
    genre_str = item_df.at[i, 'genres']
    genre_list = genre_str.split('|')
    item_genre_dict[item_df.at[i, 'item_id']] = genre_list

In [10]:
item_set = set(rdf['item_id'].unique())
user_set = set(rdf['user_id'].unique())
print('item num = ' + str(len(item_set)))
print('user num = ' + str(len(user_set)))

item num = 3706
user num = 6040


In [11]:
# count the number for each genre and sort
import operator
genre_count = dict()
for l in item_genre_dict:
    for g in item_genre_dict[l]:
        if not g in genre_count:
            genre_count[g] = 1
        else:
            genre_count[g] += 1

genre_count_sorted = sorted(genre_count.items(), key=operator.itemgetter(1), reverse=True)
genre_count_sorted

[('Drama', 1603),
 ('Comedy', 1200),
 ('Action', 503),
 ('Thriller', 492),
 ('Romance', 471),
 ('Horror', 343),
 ('Adventure', 283),
 ('Sci-Fi', 276),
 ("Children's", 251),
 ('Crime', 211),
 ('War', 143),
 ('Documentary', 127),
 ('Musical', 114),
 ('Mystery', 106),
 ('Animation', 105),
 ('Fantasy', 68),
 ('Western', 68),
 ('Film-Noir', 44)]

In [12]:
key_genre = ['Comedy', 'Thriller', 'Sci-Fi', 'Horror', 'Romance', 'Action', 'Crime', 'Adventure', "Children's"]

# get the key_genre->item_list dict
key_genre_item = dict()
for k in key_genre:
    key_genre_item[k] = list()
for item in item_genre_dict:
    for g in item_genre_dict[item]:
        if g in key_genre:
            key_genre_item[g].append(item)

In [13]:
# collect all the items with key genres
key_item_set = set()
for genre in key_genre_item:
    key_item_set |= set(key_genre_item[genre])

nonkey_item_set = item_set - key_item_set

In [14]:
# remove the non-key genre items in rdf
remove_list = []
for item in nonkey_item_set:
    remove_list += rdf.index[rdf['item_id'] == item].values.tolist()   

In [15]:
rdf.drop(remove_list, inplace=True)

In [16]:
rdf.reset_index(drop=True, inplace=True)
rating_df = copy.copy(rdf)

In [17]:
rdf = copy.copy(rating_df)

In [18]:
# iteratively remove items and users with less than 2 reviews
rdf.reset_index(drop=True, inplace=True)

rdf['user_freq'] = rdf.groupby('user_id')['user_id'].transform('count')
rdf.drop(rdf.index[rdf['user_freq'] <= 4], inplace=True)
rdf.reset_index(drop=True, inplace=True)
rdf['item_freq'] = rdf.groupby('item_id')['item_id'].transform('count')
rdf.drop(rdf.index[rdf['item_freq'] <= 4], inplace=True)
rdf.reset_index(drop=True, inplace=True)
rdf['user_freq'] = rdf.groupby('user_id')['user_id'].transform('count')
rdf.reset_index(drop=True, inplace=True)
rdf['user_id'].value_counts()

4169    1626
1680    1511
1941    1398
4277    1298
2063    1176
        ... 
5174      10
2488      10
4651       9
1534       8
2584       5
Name: user_id, Length: 6039, dtype: int64

In [19]:
item_list = rdf['item_id'].unique()
user_list = rdf['user_id'].unique()
print('item num = ' + str(len(item_list)))
print('user num = ' + str(len(user_list)))

item num = 2526
user num = 6039


In [20]:
# get the user and item str id->int id dict
i = 0
user_id_dict = dict()
for u in user_list:
    if not u in user_id_dict:
        user_id_dict[u] = i
        i += 1
j = 0
item_id_dict = dict()
for i in item_list:
    if not i in item_id_dict:
        item_id_dict[i] = j
        j += 1

In [21]:
print('sparsity: ' + str(len(rdf) * 1.0 / (len(user_list) * len(item_list))))

sparsity: 0.05517606132846972


In [22]:
# get the df of train, vali, and test set
rdf.reset_index(inplace=True, drop=True)
train_df = rdf.copy()
vali_df = rdf.copy()
test_df = rdf.copy()

train_ratio = 0.6
vali_ratio = 0.2
test_ratio = 0.2
num_all = len(rdf)
vali_idx = []
test_idx = []

test_vali_idx = []
i = 0
num_user = len(user_list)
for u in user_list:
    u_idx = train_df.index[train_df['user_id'] == u]
    idx_len = len(u_idx)
    test_len = int(idx_len * (test_ratio + vali_ratio))
    if test_len == 0:
        test_len = 1
    tmp = np.random.choice(u_idx, size=test_len, replace=False)
    test_vali_idx += tmp.tolist()
    i += 1
    if i % 5000 == 0:
        print(str(i) + '/' + str(num_user))

# tmp = (np.random.choice(range(num_all), size=(test_len+vali_len), replace=False)).tolist()
test_len = int(len(test_vali_idx) * test_ratio / (test_ratio + vali_ratio))
vali_len = int(len(test_vali_idx) - test_len)
test_idx = (np.random.choice(test_vali_idx, size=test_len, replace=False)).tolist()
vali_idx = (np.random.choice(test_vali_idx, size=vali_len, replace=False)).tolist()

test_set = set(test_idx)
vali_set = set(vali_idx)
train_set = set(range(num_all)) - test_set - vali_set
train_idx = list(train_set)
train_df.drop((test_idx + vali_idx), axis=0, inplace=True)
test_df.drop((train_idx + vali_idx), axis=0, inplace=True)
vali_df.drop((train_idx + test_idx), axis=0, inplace=True)

5000/6039


In [23]:
rdf.drop(columns=['rating'], inplace=True)
train_df.drop(columns=['rating'], inplace=True)
test_df.drop(columns=['rating'], inplace=True)
vali_df.drop(columns=['rating'], inplace=True)

In [24]:
# get the matrix of train, vali and test set

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
vali_df.reset_index(drop=True, inplace=True)
rdf.reset_index(drop=True, inplace=True)
train = np.zeros((len(user_list), len(item_list)))
test = np.zeros((len(user_list), len(item_list)))
vali = np.zeros((len(user_list), len(item_list)))
for r in range(len(train_df)):
    train[user_id_dict[train_df.at[r, 'user_id']], item_id_dict[train_df.at[r, 'item_id']]] = 1.0
for r in range(len(test_df)):
    test[user_id_dict[test_df.at[r, 'user_id']], item_id_dict[test_df.at[r, 'item_id']]] = 1.0
for r in range(len(vali_df)):
    vali[user_id_dict[vali_df.at[r, 'user_id']], item_id_dict[vali_df.at[r, 'item_id']]] = 1.0

In [25]:
# get the user int id-> str id list, and the same for item 
item_list = item_id_dict.keys()
item_idd_list = list()
for i in range(len(item_list)):
    item_idd_list.append('')
for item in item_id_dict:
    item_idd_list[item_id_dict[item]] = item

user_list = user_id_dict.keys()
user_idd_list = list()
for i in range(len(user_list)):
    user_idd_list.append('')
for user in user_id_dict:
    user_idd_list[user_id_dict[user]] = user
    
# get the item int id->genres list
item_idd_genre_list = list()
for i in range(len(item_idd_list)):
    item_idd_genre_list.append(item_genre_dict[item_idd_list[i]])

In [26]:
train_df.drop('user_freq', axis=1, inplace=True)
train_df.drop('item_freq', axis=1, inplace=True)
vali_df.drop('user_freq', axis=1, inplace=True)
vali_df.drop('item_freq', axis=1, inplace=True)
test_df.drop('user_freq', axis=1, inplace=True)
test_df.drop('item_freq', axis=1, inplace=True)
rdf.drop('user_freq', axis=1, inplace=True)
rdf.drop('item_freq', axis=1, inplace=True)

In [27]:
# get df for rdf, train, vali, test with int id for user and item
import copy
rating_df = copy.copy(rdf)
for i in range(len(rdf)):
    rating_df.at[i, 'user_id'] = user_id_dict[rating_df.at[i, 'user_id']]
    rating_df.at[i, 'item_id'] = item_id_dict[rating_df.at[i, 'item_id']]

training_df = copy.copy(train_df)
for i in range(len(training_df)):
    training_df.at[i, 'user_id'] = user_id_dict[training_df.at[i, 'user_id']]
    training_df.at[i, 'item_id'] = item_id_dict[training_df.at[i, 'item_id']]

valiing_df = copy.copy(vali_df)
for i in range(len(valiing_df)):
    valiing_df.at[i, 'user_id'] = user_id_dict[valiing_df.at[i, 'user_id']]
    valiing_df.at[i, 'item_id'] = item_id_dict[valiing_df.at[i, 'item_id']]

testing_df = copy.copy(test_df)
for i in range(len(testing_df)):
    testing_df.at[i, 'user_id'] = user_id_dict[testing_df.at[i, 'user_id']]
    testing_df.at[i, 'item_id'] = item_id_dict[testing_df.at[i, 'item_id']]

In [28]:
# generate the rating list for each key genre, get the genre->ratings dict
rdf.reset_index(drop=True, inplace=True)
key_genre_rating = dict()
for k in key_genre:
    key_genre_rating[k] = 0.0
for r in range(len(rdf)):
    item = rdf.at[r, 'item_id']
    gl = item_genre_dict[item]
    for k in key_genre:
        if k in gl:
            key_genre_rating[k] += 1.0

# get the item int id->genres list
genre_item_vector = dict()
for k in key_genre:
    genre_item_vector[k] = np.zeros((1, len(item_list)))
for i in range(len(item_idd_genre_list)):
    genre_list = item_idd_genre_list[i]
    for g in genre_list:
        if g in key_genre:
            genre_item_vector[g][0, i] = 1.0

In [85]:
with open("item_genre_dict.pkl", "wb") as f:
    pickle.dump(item_genre_dict, f, pickle.HIGHEST_PROTOCOL)
with open("genre_item_vector.pkl", "wb") as f:
    pickle.dump(genre_item_vector, f, pickle.HIGHEST_PROTOCOL)
with open("key_genre.pkl", "wb") as f:
    pickle.dump(key_genre, f, pickle.HIGHEST_PROTOCOL)
with open("user_id_dict.pkl", "wb") as f:
    pickle.dump(user_id_dict, f, pickle.HIGHEST_PROTOCOL)
with open("item_id_dict.pkl", "wb") as f:
    pickle.dump(item_id_dict, f, pickle.HIGHEST_PROTOCOL)
# with open("rdf.pkl", "wb") as f:
#     pickle.dump(rdf, f, pickle.HIGHEST_PROTOCOL)
with open("rating_df.pkl", "wb") as f:
    pickle.dump(rating_df, f, pickle.HIGHEST_PROTOCOL)
with open("training_df.pkl", "wb") as f:
    pickle.dump(training_df, f, pickle.HIGHEST_PROTOCOL)
with open("valiing_df.pkl", "wb") as f:
    pickle.dump(valiing_df, f, pickle.HIGHEST_PROTOCOL)
with open("testing_df.pkl", "wb") as f:
    pickle.dump(testing_df, f, pickle.HIGHEST_PROTOCOL)
with open("item_idd_genre_list.pkl", "wb") as f:
    pickle.dump(item_idd_genre_list, f, pickle.HIGHEST_PROTOCOL)
with open("item_idd_list.pkl", "wb") as f:
    pickle.dump(item_idd_list, f, pickle.HIGHEST_PROTOCOL)
with open("user_idd_list.pkl", "wb") as f:
    pickle.dump(user_idd_list, f, pickle.HIGHEST_PROTOCOL)
with open("key_genre_rating.pkl", "wb") as f:
    pickle.dump(key_genre_rating, f, pickle.HIGHEST_PROTOCOL)
    
with open("train.mat", "wb") as f:
    np.save(f, train)
with open("test.mat", "wb") as f:
    np.save(f, test)
with open("vali.mat", "wb") as f:
    np.save(f, vali)

In [30]:
# count the number for each genre and sort
import pickle
from operator import itemgetter
# item_list = pickle.load(open('./rdf.pkl'))['item_id'].unique()
# item_genre_dict = pickle.load(open('./item_genre_dict.pkl'))
# key_genre = pickle.load(open('./key_genre.pkl'))

genre_count = dict()
for i in item_list:
    gl = item_genre_dict[i]
    for g in gl:
        if g in key_genre:
            if not g in genre_count:
                genre_count[g] = 1
            else:
                genre_count[g] += 1

# with open("genre_count.pkl", "wb") as f:
#     pickle.dump(genre_count, f, pickle.HIGHEST_PROTOCOL)
                
genre_count_sorted = sorted(genre_count.items(), key=itemgetter(1), reverse=True)
genre_count_sorted

[('Comedy', 1090),
 ('Action', 480),
 ('Thriller', 467),
 ('Romance', 441),
 ('Horror', 323),
 ('Adventure', 274),
 ('Sci-Fi', 270),
 ("Children's", 248),
 ('Crime', 193)]

In [33]:
item_idd_genre_list = np.array(item_idd_genre_list)


array([list(['Animation', "Children's", 'Musical']),
       list(['Musical', 'Romance']),
       list(['Animation', "Children's", 'Comedy']), ...,
       list(['Romance', 'War']), list(['Adventure']),
       list(['Action', 'Drama', 'Thriller'])], dtype=object)

In [48]:
train.shape[0]

6039

In [53]:
key_genre

['Comedy',
 'Thriller',
 'Sci-Fi',
 'Horror',
 'Romance',
 'Action',
 'Crime',
 'Adventure',
 "Children's"]

In [55]:
import numpy as np
import pickle
import copy as copy

# item_idd_genre_list = np.array(item_idd_genre_list)


mask = 1.0 * (train > 0) #train = user x item matrix == interaction matrix. This mask is not doing anything?
user_genre_count = list()
for u in range(train.shape[0]):
    temp_genre_count = copy.copy(genre_count)
    mask_u = mask[u, :]
    gll = item_idd_genre_list[mask_u == 1.0] #for each user, take out all the movie that they have interaction before
    for gl in gll: #for each movie in the list of interacted movies
        for g in gl: #for each genres of movie
            if g in key_genre:
                temp_genre_count[g] -= 1
    user_genre_count.append(temp_genre_count)
# with open("user_genre_count.pkl", "wb") as f:
#     pickle.dump(user_genre_count, f, pickle.HIGHEST_PROTOCOL)

In [54]:
genre_count

{"Children's": 248,
 'Romance': 441,
 'Comedy': 1090,
 'Action': 480,
 'Adventure': 274,
 'Sci-Fi': 270,
 'Thriller': 467,
 'Crime': 193,
 'Horror': 323}

In [56]:
user_genre_count

[{"Children's": 234,
  'Romance': 436,
  'Comedy': 1079,
  'Action': 478,
  'Adventure': 270,
  'Sci-Fi': 268,
  'Thriller': 464,
  'Crime': 192,
  'Horror': 323},
 {"Children's": 248,
  'Romance': 426,
  'Comedy': 1074,
  'Action': 442,
  'Adventure': 262,
  'Sci-Fi': 261,
  'Thriller': 444,
  'Crime': 186,
  'Horror': 322},
 {"Children's": 245,
  'Romance': 436,
  'Comedy': 1072,
  'Action': 465,
  'Adventure': 258,
  'Sci-Fi': 266,
  'Thriller': 462,
  'Crime': 193,
  'Horror': 322},
 {"Children's": 247,
  'Romance': 440,
  'Comedy': 1090,
  'Action': 465,
  'Adventure': 270,
  'Sci-Fi': 263,
  'Thriller': 464,
  'Crime': 192,
  'Horror': 321},
 {"Children's": 244,
  'Romance': 423,
  'Comedy': 1055,
  'Action': 456,
  'Adventure': 267,
  'Sci-Fi': 259,
  'Thriller': 437,
  'Crime': 179,
  'Horror': 318},
 {"Children's": 234,
  'Romance': 419,
  'Comedy': 1067,
  'Action': 474,
  'Adventure': 269,
  'Sci-Fi': 269,
  'Thriller': 464,
  'Crime': 192,
  'Horror': 323},
 {"Children's": 

In [89]:
genre_avg_like = dict()
for k in key_genre:
    genre_avg_like[k] = key_genre_rating[k] * 1.0 / genre_count[k]

In [90]:
genre_avg_like_sorted = sorted(genre_avg_like.items(), key=itemgetter(1), reverse=True)
genre_avg_like_sorted

[('Sci-Fi', 582.5407407407407),
 ('Action', 536.3041666666667),
 ('Adventure', 488.8284671532847),
 ('Crime', 412.07253886010363),
 ('Thriller', 406.1134903640257),
 ('Romance', 334.42857142857144),
 ('Comedy', 326.9908256880734),
 ("Children's", 291.06451612903226),
 ('Horror', 236.37151702786377)]

In [1]:
import pandas as pd

obj = pd.read_pickle(r'training_df.pkl')


In [2]:
obj.head()

Unnamed: 0,user_id,item_id
0,0,0
1,0,2
2,1,4
3,1,5
4,1,6


In [6]:
dict1 = {"Nepal": "Kathmandu", "Italy": "Rome", "England": "London"}

In [8]:
tmp = []
for key, value in sorted(dict1.iteritems(), key = lambda x: x[0]):
    tmp.append(value)
rstd = np.std(tmp) / (np.mean(tmp) + 1e-10)

AttributeError: 'dict' object has no attribute 'iteritems'

In [10]:
def relative_std(dictionary):
    tmp = []
    for key, value in sorted(dictionary.iteritems(), key = lambda x: x[0]):
        tmp.append(value)
    rstd = np.std(tmp) / (np.mean(tmp) + 1e-10)
    return rstd
