In [126]:
import pandas as pd
import numpy as np
import logging
from functools import lru_cache
from pathlib import Path

logging.getLogger().setLevel(logging.INFO)

def read_dat_files():
    logging.info("reading")
    R = pd.read_csv("ratings.csv",names=["userID","movieID","rating","timestamp"],skiprows=1)
    I = pd.read_csv("movies.csv",names=["movieID","title","genre"],skiprows=1)
    U = pd.read_csv("tags.csv",skiprows=1)
    logging.info("Done reading")
    logging.info("scaling")
    R["rating"] = np.array(R["rating"],dtype=float) - 3.0
    logging.info("Done scaling")
    return R, I, U 

R, I, U = read_dat_files()

INFO:root:reading
INFO:root:Done reading
INFO:root:scaling
INFO:root:Done scaling


In [247]:
def create_movie_per_genre()->pd.DataFrame:
    file = Path("./movies_genres.csv")
    if file.is_file():
        logging.debug("create_movie_per_genre: file does exist")
        return pd.read_csv(file);
    logging.debug("create_movie_per_genre: file does not exist")
    R_I = pd.merge(R,I,how="inner",on="movieID")
    retval = pd.DataFrame()
    for i, row in R_I.iterrows():
        logging.debug(i/len(R_I))
        genres = row.genre.split("|")
        for genre in genres:
            row["genre"] = genre
            retval = pd.concat([retval,pd.DataFrame(row).T], ignore_index=True)
    retval.to_csv(file)
    logging.debug(retval)
    return retval
    
data = create_movie_per_genre()

In [128]:
logging.info("Popularity of genres: ")
test = {}
for i, d in data.groupby("genre"):
    test[i] = len(set(d.movieID))
logging.info({k: v for k, v in sorted(test.items(), key=lambda item: item[1])})

INFO:root:Popularity of genres: 
INFO:root:{'(no genres listed)': 17, 'Film-Noir': 121, 'IMAX': 153, 'Western': 168, 'War': 366, 'Musical': 394, 'Animation': 447, 'Documentary': 487, 'Mystery': 537, 'Children': 582, 'Fantasy': 653, 'Sci-Fi': 791, 'Horror': 872, 'Crime': 1092, 'Adventure': 1116, 'Romance': 1541, 'Action': 1543, 'Thriller': 1717, 'Comedy': 3307, 'Drama': 4328}


In [129]:
logging.info("avarage, min and max genres of movies: ")
test = []
for i, d in data.groupby("movieID"):
    test.append(len(set(d.genre)))
    if (len(set(d.genre)) ==10):
        logging.debug(d)
logging.info("max: %s, min: %s, mean: %s", np.max(test),np.min(test),np.mean(test))

INFO:root:avarage, min and max genres of movies: 
INFO:root:max: 10, min: 1, mean: 2.2316346790205164


In [130]:
logging.info(data[(data.userID == 289) & (data.movieID == 1125)])

INFO:root:       Unnamed: 0  userID  movieID  rating   timestamp  \
48404       48404     289     1125     0.0  1328935470   
48405       48405     289     1125     0.0  1328935470   

                                        title   genre  
48404  Return of the Pink Panther, The (1975)  Comedy  
48405  Return of the Pink Panther, The (1975)   Crime  


In [131]:
def create_user_profile(userID,data):
    data = data[data.userID==userID]
    return data.groupby("genre").rating.sum()

In [132]:
logging.debug(create_user_profile(1,data))

In [133]:
logging.info(" genres: %s",set(data.genre))

INFO:root: genres: {'Romance', 'Fantasy', 'Thriller', 'Musical', 'Crime', 'War', 'Western', 'Film-Noir', 'IMAX', 'Action', 'Documentary', 'Drama', 'Mystery', 'Children', 'Adventure', 'Animation', '(no genres listed)', 'Comedy', 'Sci-Fi', 'Horror'}


In [134]:
def get_possible_genres():
    return set(data[data.genre != "(no genres listed)"].genre)

In [45]:
logging.info("how many is gonna deleted: %s", data[data.genre=='(no genres listed)'])

INFO:root:how many is gonna deleted:         Unnamed: 0  userID  movieID  rating   timestamp  \
231254      231254      56   128620     2.0  1467003913   
231296      231296      56   160590     2.0  1467095789   
238764      238764      73   141866     1.0  1469772876   
250916      250916     200   136592    -1.5  1438020227   
256197      256197     287   117192     2.0  1473445036   
256198      256198     402   117192     1.5  1462945915   
257396      257396     299    83829     1.5  1344180332   
258261      258261     324   149532     0.0  1451519751   
258997      258997     371   122888     2.0  1473624419   
261375      261375     457   126106     0.5  1471409573   
263825      263825     547   134025     0.0  1432654721   
263854      263854     547   151307     1.5  1472400501   
264157      264157     572   132952     1.0  1436466718   
265130      265130     624   129250    -2.5  1447868930   
265202      265202     624   143410    -1.0  1474224802   
265307      265307 

In [None]:
possible_genre = ['Romance',
 'Fantasy',
 'Thriller',
 'Musical',
 'Crime',
 'War',
 'Western',
 'Film-Noir',
 'IMAX',
 'Action',
 'Documentary',
 'Drama',
 'Mystery',
 'Children',
 'Adventure',
 'Animation',
 'Comedy',
 'Sci-Fi',
 'Horror']

In [167]:
def dot(movieId,userID):
    u = create_user_profile(userID,data)
    genres = list(set(data[data.movieID==movieId].genre))
    logging.debug(genres)
    for genre in possible_genre:
        if genre not in genres:
            u[genre] = 0
    return u.sum()
logging.debug(dot(5679,526))

In [83]:
def user526():
    R_I2 = pd.merge(R,I,how="inner",on="movieID")
    del R_I2["timestamp"]
    
    R_I2 = R_I2[R_I2.userID==526]
    del R_I2["userID"]
    return R_I2
logging.info(user526().to_latex(index=False))

INFO:root:\begin{tabular}{rrll}
\toprule
 movieID &  rating &                                              title &                                             genre \\
\midrule
    7153 &     0.5 & Lord of the Rings: The Return of the King, The ... &                    Action|Adventure|Drama|Fantasy \\
    4995 &     2.0 &                           Beautiful Mind, A (2001) &                                     Drama|Romance \\
    5679 &     1.0 &                                   Ring, The (2002) &                           Horror|Mystery|Thriller \\
    6373 &     1.0 &                              Bruce Almighty (2003) &                      Comedy|Drama|Fantasy|Romance \\
    8644 &     0.5 &                                    I, Robot (2004) &                  Action|Adventure|Sci-Fi|Thriller \\
    8368 &     1.0 &    Harry Potter and the Prisoner of Azkaban (2004) &                            Adventure|Fantasy|IMAX \\
       1 &     1.0 &                                   Toy St

In [62]:
user526_ratings = (user526().rating)
logging.info("user 526 scaled ratings: mean: %s, median: %s",np.mean(user526_ratings), np.median(user526_ratings))
logging.info("user 526 positive: %s",len(user526_ratings[user526_ratings>0]))
logging.info("user 526 negative: %s",len(user526_ratings[user526_ratings<0]))
logging.info("user 526 neutral: %s",len(user526_ratings[user526_ratings==0]))
user526_ratings = (user526().rating+3)
logging.info("user 526 scaled ratings: mean: %s, median: %s",np.mean(user526_ratings), np.median(user526_ratings))


INFO:root:user 526 scaled ratings: mean: 1.1111111111111112, median: 1.0
INFO:root:user 526 positive: 26
INFO:root:user 526 negative: 0
INFO:root:user 526 neutral: 1
INFO:root:user 526 scaled ratings: mean: 4.111111111111111, median: 4.0


In [67]:
logging.info(create_user_profile(526,data).to_latex())

INFO:root:\begin{tabular}{lr}
\toprule
{} &  rating \\
genre     &         \\
\midrule
Action    &     7.0 \\
Adventure &     9.5 \\
Animation &     6.5 \\
Children  &     6.5 \\
Comedy    &     5.0 \\
Crime     &     3.0 \\
Drama     &    14.5 \\
Fantasy   &     5.0 \\
Horror    &     3.5 \\
IMAX      &     4.0 \\
Mystery   &     6.0 \\
Romance   &     5.0 \\
Sci-Fi    &    13.0 \\
Thriller  &    14.5 \\
\bottomrule
\end{tabular}



In [168]:
five_rec = {}
for i in set(data.movieID):
    five_rec[i] = dot(i,526)
five_rec = {k: v for k, v in sorted(five_rec.items(), key=lambda item: item[1])}

In [169]:
for i in sorted(five_rec.items(), key=lambda item: -item[1])[:5]:
    logging.info("movieID: %s, recomdation score: %s",i[0],i[1])

INFO:root:movieID: 5018, recomdation score: 70.5
INFO:root:movieID: 6902, recomdation score: 67.5
INFO:root:movieID: 26701, recomdation score: 64.5
INFO:root:movieID: 43932, recomdation score: 63.5
INFO:root:movieID: 81132, recomdation score: 63.0


In [191]:
R_I = pd.merge(R,I,how="inner",on="movieID")
df = R_I[(R_I.movieID==5018)|(R_I.movieID==6902)|(R_I.movieID==26701)|(R_I.movieID==43932)|(R_I.movieID==81132)]
del df["userID"]
del df["timestamp"]
del df["rating"]
logging.info(df.to_latex(index=False))

INFO:root:\begin{tabular}{rll}
\toprule
 movieID &                                              title &                                              genre \\
\midrule
    6902 &                               Interstate 60 (2002) & Adventure|Comedy|Drama|Fantasy|Mystery|Sci-Fi|T... \\
    6902 &                               Interstate 60 (2002) & Adventure|Comedy|Drama|Fantasy|Mystery|Sci-Fi|T... \\
    6902 &                               Interstate 60 (2002) & Adventure|Comedy|Drama|Fantasy|Mystery|Sci-Fi|T... \\
   26701 & Patlabor: The Movie (Kidô keisatsu patorebâ: Th... & Action|Animation|Crime|Drama|Film-Noir|Mystery|... \\
   81132 &                                      Rubber (2010) & Action|Adventure|Comedy|Crime|Drama|Film-Noir|H... \\
   43932 &                                       Pulse (2006) & Action|Drama|Fantasy|Horror|Mystery|Sci-Fi|Thri... \\
    5018 &                                    Motorama (1991) & Adventure|Comedy|Crime|Drama|Fantasy|Mystery|Sc... \\
\bottom

In [248]:
def dot_part2(movieId,userID):
    data = create_movie_per_genre()
    user = data[data.userID==userID]
    test = pd.DataFrame()
    for i,d in user.iterrows():
        len_genres = len(list(set(user[user.movieID==d.movieID].genre)))
        d.rating = (d.rating ) /np.sqrt(len_genres)
        test = test.append(d)
    u = test.groupby("genre").rating.sum()
    genres = list(set(data[data.movieID==movieId].genre))
    logging.debug(u)
    for genre in possible_genre:
        if genre not in genres:
            u[genre] = 0
        if genre in u:
            u[genre] *= 1/np.sqrt(len(genres))
    return u.sum()
logging.debug(dot_part2(52328,526))

In [244]:
five_rec = {}
for i in set(data.movieID):
    five_rec[i] = dot_part2(i,526)
five_rec = {k: v for k, v in sorted(five_rec.items(), key=lambda item: -item[1])}

INFO:root:create_movie_per_genre: file does exist
INFO:root:genre
Action       3.219605
Adventure    4.620996
Animation    3.043645
Children     3.043645
Comedy       2.360353
Crime        1.272392
Drama        8.378165
Fantasy      2.386936
Horror       1.943376
IMAX         2.369528
Mystery      3.056849
Romance      2.938587
Sci-Fi       6.916827
Thriller     8.664438
Name: rating, dtype: float64
INFO:root:create_movie_per_genre: file does exist
INFO:root:genre
Action       3.219605
Adventure    4.620996
Animation    3.043645
Children     3.043645
Comedy       2.360353
Crime        1.272392
Drama        8.378165
Fantasy      2.386936
Horror       1.943376
IMAX         2.369528
Mystery      3.056849
Romance      2.938587
Sci-Fi       6.916827
Thriller     8.664438
Name: rating, dtype: float64
INFO:root:create_movie_per_genre: file does exist
INFO:root:genre
Action       3.219605
Adventure    4.620996
Animation    3.043645
Children     3.043645
Comedy       2.360353
Crime        1.272

In [249]:
for i in sorted(five_rec.items(), key=lambda item: -item[1])[:5]:
    logging.info("movieID: %s, recomdation score: %s",i[0],i[1])

INFO:root:movieID: 52328, recomdation score: 14.29021272462629
INFO:root:movieID: 8361, recomdation score: 14.221406070274387
INFO:root:movieID: 48774, recomdation score: 14.221406070274387
INFO:root:movieID: 58025, recomdation score: 14.221406070274387
INFO:root:movieID: 91500, recomdation score: 14.221406070274387


In [250]:

df = I[(I.movieID==8361)|(I.movieID==48774)|(I.movieID==52328)|(I.movieID==58025)|(I.movieID==91500)]

logging.info(df.to_latex(index=False))

INFO:root:\begin{tabular}{rll}
\toprule
 movieID &                          title &                                  genre \\
\midrule
    8361 & Day After Tomorrow, The (2004) & Action|Adventure|Drama|Sci-Fi|Thriller \\
   48774 &         Children of Men (2006) & Action|Adventure|Drama|Sci-Fi|Thriller \\
   52328 &                Sunshine (2007) &        Adventure|Drama|Sci-Fi|Thriller \\
   58025 &                  Jumper (2008) & Action|Adventure|Drama|Sci-Fi|Thriller \\
   91500 &        The Hunger Games (2012) & Action|Adventure|Drama|Sci-Fi|Thriller \\
\bottomrule
\end{tabular}



In [195]:
def user14():
    R_I2 = pd.merge(R,I,how="inner",on="movieID")
    del R_I2["timestamp"]
    R_I2 = R_I2[R_I2.userID==14]
    del R_I2["userID"]
    return R_I2
logging.info(user14().to_latex(index=False))

INFO:root:\begin{tabular}{rrll}
\toprule
 movieID &  rating &                                              title &                                       genre \\
\midrule
    1721 &     0.0 &                                     Titanic (1997) &                               Drama|Romance \\
    2716 &     0.0 &         Ghostbusters (a.k.a. Ghost Busters) (1984) &                        Action|Comedy|Sci-Fi \\
     594 &    -2.0 &             Snow White and the Seven Dwarfs (1937) &    Animation|Children|Drama|Fantasy|Musical \\
    1196 &     1.0 & Star Wars: Episode V - The Empire Strikes Back ... &                     Action|Adventure|Sci-Fi \\
    2628 &     0.0 &   Star Wars: Episode I - The Phantom Menace (1999) &                     Action|Adventure|Sci-Fi \\
    2683 &    -1.0 &       Austin Powers: The Spy Who Shagged Me (1999) &                     Action|Adventure|Comedy \\
    2355 &    -1.0 &                               Bug's Life, A (1998) &         Adventure|Animation|C

In [197]:
user14_ratings = (user14().rating)
logging.info("user 14 scaled ratings: mean: %s, median: %s",np.mean(user14_ratings), np.median(user14_ratings))
logging.info("user 14 positive: %s",len(user14_ratings[user14_ratings>0]))
logging.info("user 14 negative: %s",len(user14_ratings[user14_ratings<0]))
logging.info("user 14 neutral: %s",len(user14_ratings[user14_ratings==0]))
user14_ratings = (user14().rating+3)
logging.info("user 14 ratings: mean: %s, median: %s",np.mean(user14_ratings), np.median(user14_ratings))

INFO:root:user 14 scaled ratings: mean: -0.05, median: 0.0
INFO:root:user 14 positive: 5
INFO:root:user 14 negative: 6
INFO:root:user 14 neutral: 9
INFO:root:user 14 ratings: mean: 2.95, median: 3.0


In [251]:
five_rec_14 = {}
for i in set(data.movieID):
    five_rec_14[i] = dot_part2(i,14)
five_rec_14 = {k: v for k, v in sorted(five_rec_14.items(), key=lambda item: -item[1])}

In [252]:
for i in sorted(five_rec_14.items(), key=lambda item: -item[1])[:5]:
    logging.info("movieID: %s, recomdation score: %s",i[0],i[1])

INFO:root:movieID: 2311, recomdation score: 1.7320508075688774
INFO:root:movieID: 2526, recomdation score: 1.7320508075688774
INFO:root:movieID: 2661, recomdation score: 1.7320508075688774
INFO:root:movieID: 3354, recomdation score: 1.7320508075688774
INFO:root:movieID: 3780, recomdation score: 1.7320508075688774


In [253]:

df = I[(I.movieID==2311)|(I.movieID==2526)|(I.movieID==2661)|(I.movieID==3354)|(I.movieID==3780)]

logging.info(df.to_latex(index=False))

INFO:root:\begin{tabular}{rll}
\toprule
 movieID &                                 title &  genre \\
\midrule
    2311 & 2010: The Year We Make Contact (1984) & Sci-Fi \\
    2526 &                         Meteor (1979) & Sci-Fi \\
    2661 &       It Came from Outer Space (1953) & Sci-Fi \\
    3354 &                Mission to Mars (2000) & Sci-Fi \\
    3780 &                 Rocketship X-M (1950) & Sci-Fi \\
\bottomrule
\end{tabular}



In [263]:
def calculate_IDF():
    logging.debug("IDF")
    idf = pd.DataFrame()
    for i, d in data.groupby("genre"):
        idf = idf.append(pd.DataFrame({"idf":[1/len(set(d.movieID))],"genre":[i]}))
    return idf
idf_dict = calculate_IDF()
logging.debug(idf_dict.to_latex(index=False))

In [268]:
def calculate_IDF_dict():
    logging.debug("IDF")
    idf = {}
    for i, d in data.groupby("genre"):
        idf[i] = 1/len(set(d.movieID))
    return idf
idf_dict = calculate_IDF_dict()

In [269]:
idf_dict

{'(no genres listed)': 0.058823529411764705,
 'Action': 0.0006480881399870382,
 'Adventure': 0.0008960573476702509,
 'Animation': 0.0022371364653243847,
 'Children': 0.001718213058419244,
 'Comedy': 0.0003023888720895071,
 'Crime': 0.0009157509157509158,
 'Documentary': 0.002053388090349076,
 'Drama': 0.0002310536044362292,
 'Fantasy': 0.0015313935681470138,
 'Film-Noir': 0.008264462809917356,
 'Horror': 0.0011467889908256881,
 'IMAX': 0.006535947712418301,
 'Musical': 0.0025380710659898475,
 'Mystery': 0.00186219739292365,
 'Romance': 0.0006489292667099286,
 'Sci-Fi': 0.0012642225031605564,
 'Thriller': 0.0005824111822947001,
 'War': 0.00273224043715847,
 'Western': 0.005952380952380952}

In [271]:
def dot_part3(movieId,userID):
    data = create_movie_per_genre()
    user = data[data.userID==userID]
    test = pd.DataFrame()
    for i,d in user.iterrows():
        len_genres = len(list(set(user[user.movieID==d.movieID].genre)))
        d.rating = (d.rating ) /np.sqrt(len_genres)
        test = test.append(d)
    u = test.groupby("genre").rating.sum()
    genres = list(set(data[data.movieID==movieId].genre))
    logging.debug(u)
    for genre in possible_genre:
        if genre not in genres:
            u[genre] = 0
        if genre in u:
            u[genre] *= idf_dict[genre]*1/np.sqrt(len(genres))
    return u.sum()
logging.info(dot_part3(5882,526))

INFO:root:0.01807229480044221
