# Data Process Trial

In [88]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np
import pandas as pd
import time


In [145]:
# Locations of the ratings and movies CSV files used by the rest of the notebook.
ratingsPath = './ml-latest-small/ratings.csv'
moviesPath = './ml-latest-small/movies.csv'
# Check that the ratings file is present (the result is not stored).
os.path.exists(ratingsPath)
# Each ratings line is "user,item,rating,timestamp", comma-separated; skip_lines=1 skips the header row.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# Reuse ratingsPath instead of repeating the literal so the path is defined in one place.
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)

True

In [126]:
def timing_func(func):
    """Decorator that prints how long a single call to ``func`` takes.

    The wrapped function is invoked exactly once per call. Its elapsed
    wall-clock time is printed as ``--- <seconds> seconds ---`` and its
    return value is handed back to the caller unchanged.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that accepts the same positional and keyword arguments
        as ``func``.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        t1 = time.time()
        # Call once and keep the result: calling func a second time for the
        # return value would double the work and repeat any side effects.
        result = func(*args, **kwargs)
        t2 = time.time()
        print(f"--- {str((t2 - t1))} seconds ---")
        return result
    return wrapper

# get user ratings
* `getUserRatings()` (plain `csv` reader) is faster than the pandas version, `getUserRatings_pd()`.


Approximate timings: 0.02 s (`csv`) vs 0.07 s (pandas).

In [128]:
@timing_func
def getUserRatings(user):
    """Collect the (movieID, rating) pairs for one user from the ratings CSV.

    The header line is skipped and rows are read in file order. Once at
    least one row for ``user`` has been collected, the scan stops at the
    first row that belongs to a different user, so only the first
    contiguous run of that user's rows is returned.

    Args:
        user: User ID compared against the integer in the first column.

    Returns:
        List of ``(movieID, rating)`` tuples with ``movieID`` as int and
        ``rating`` as float; empty if the user never appears.
    """
    collected = []
    with open(ratingsPath, newline='') as ratings_file:
        rows = csv.reader(ratings_file)
        next(rows)  # skip the header line
        for record in rows:
            if int(record[0]) == user:
                collected.append((int(record[1]), float(record[2])))
            elif collected:
                # The user's rows have ended; stop reading.
                break
    return collected
@timing_func
def getUserRatings_pd(user):
    """pandas counterpart of getUserRatings: one user's movie IDs and ratings.

    Reads the whole ratings CSV into a DataFrame and keeps the rows whose
    ``userId`` column equals ``user``.

    Args:
        user: User ID to select.

    Returns:
        List of ``[movieId, rating]`` two-element lists, with ``movieId`` as
        a Python int and ``rating`` as a Python float.
    """
    ratings = pd.read_csv(ratingsPath)
    user_rows = ratings.loc[ratings['userId'] == user, ['movieId', 'rating']]
    # astype expects one dtype or a column -> dtype mapping; a tuple such as
    # (int, float) is not a per-column cast, so convert each column explicitly.
    movie_ids = user_rows['movieId'].astype(int).tolist()
    scores = user_rows['rating'].astype(float).tolist()
    return [[movie_id, score] for movie_id, score in zip(movie_ids, scores)]


# Both functions are wrapped by @timing_func, which prints the elapsed time,
# so no manual timer is needed around these calls.
# Ratings for user 85 via the csv-module implementation.
a = getUserRatings(85) 

# Same user via the pandas implementation, for comparison.
b = getUserRatings_pd(85)

--- 0.014616966247558594 seconds ---
--- 0.05928325653076172 seconds ---


# get PopularityRanks
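* Claimed to be faster than using pandas — but the run recorded below shows about 0.15 s for `getPopularityRanks()` versus about 0.10 s for `getPopularityRanks_pd()`, so re-measure before relying on this.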

In [140]:
@timing_func
def getPopularityRanks():
    """Rank movies by the number of rating rows each has in the ratings CSV.

    Returns:
        defaultdict(int) mapping movieID -> rank, where rank 1 is the movie
        with the most rating rows. Movies with equal counts keep the order
        in which they first appear in the file, because the sort is stable.
    """
    counts = defaultdict(int)
    with open(ratingsPath, newline='') as ratings_file:
        rows = csv.reader(ratings_file)
        next(rows)  # skip the header line
        for record in rows:
            counts[int(record[1])] += 1

    by_count_desc = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    ranks = defaultdict(int)
    for position, (movie_id, _) in enumerate(by_count_desc, start=1):
        ranks[movie_id] = position
    return ranks

# Run the csv-based ranking; @timing_func prints the elapsed time.
getPopularityRanks()

@timing_func
def getPopularityRanks_pd():
    """pandas take on movie popularity: number of ratings per movie.

    Unlike getPopularityRanks, the values returned here are rating counts,
    not rank positions.

    Returns:
        dict mapping movieId -> count of non-null ``rating`` values, with
        entries inserted in descending order of that count.
    """
    ratings = pd.read_csv(ratingsPath)
    per_movie = ratings.groupby('movieId').count()
    most_rated_first = per_movie.sort_values('rating', ascending=False)
    return most_rated_first['rating'].to_dict()

# Run the pandas variant for comparison; note that it returns rating counts per movie.
getPopularityRanks_pd()

--- 0.15262794494628906 seconds ---


defaultdict(int,
            {356: 1,
             296: 2,
             318: 3,
             593: 4,
             260: 5,
             480: 6,
             2571: 7,
             1: 8,
             527: 9,
             589: 10,
             1196: 11,
             110: 12,
             1270: 13,
             608: 14,
             2858: 15,
             1198: 16,
             780: 17,
             1210: 18,
             588: 19,
             457: 20,
             590: 21,
             2959: 22,
             47: 23,
             50: 24,
             150: 25,
             364: 26,
             858: 27,
             4993: 28,
             380: 29,
             592: 30,
             32: 31,
             2762: 32,
             2028: 33,
             1580: 34,
             5952: 35,
             377: 36,
             595: 37,
             7153: 38,
             344: 39,
             4306: 40,
             648: 41,
             1265: 42,
             1721: 43,
             1197: 44,
            

--- 0.09530496597290039 seconds ---


{356: 341,
 296: 324,
 318: 311,
 593: 304,
 260: 291,
 480: 274,
 2571: 259,
 1: 247,
 527: 244,
 589: 237,
 1196: 234,
 110: 228,
 1270: 226,
 608: 224,
 2858: 220,
 1198: 220,
 780: 218,
 1210: 217,
 588: 215,
 457: 213,
 590: 202,
 2959: 202,
 47: 201,
 50: 201,
 364: 200,
 858: 200,
 150: 200,
 4993: 200,
 380: 198,
 592: 196,
 32: 196,
 2762: 193,
 2028: 191,
 1580: 190,
 5952: 188,
 377: 180,
 595: 176,
 7153: 176,
 344: 175,
 4306: 174,
 648: 168,
 1265: 165,
 1721: 164,
 1197: 163,
 3578: 161,
 1097: 160,
 231: 158,
 1240: 158,
 1704: 157,
 367: 157,
 500: 153,
 1036: 151,
 736: 150,
 1073: 148,
 34: 148,
 1291: 147,
 2716: 147,
 597: 147,
 541: 146,
 1136: 145,
 316: 145,
 1193: 144,
 165: 142,
 6539: 141,
 2628: 138,
 1682: 137,
 733: 135,
 1221: 135,
 5349: 134,
 2918: 133,
 1089: 132,
 293: 132,
 4226: 132,
 1213: 131,
 4963: 130,
 4886: 130,
 586: 129,
 153: 129,
 1214: 127,
 3793: 127,
 587: 126,
 2997: 126,
 8961: 126,
 539: 125,
 1200: 125,
 1617: 125,
 253: 125,
 3114

In [138]:
# Load the ratings CSV into a DataFrame.
ratings = pd.read_csv(ratingsPath)
# Preview the first rows.
ratings.head()
# Count non-null values per movieId, sort by the 'rating' count (largest first),
# and convert that column to a {movieId: count} dict.
ratings.groupby('movieId').count().sort_values('rating', ascending = False)['rating'].to_dict()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


{356: 341,
 296: 324,
 318: 311,
 593: 304,
 260: 291,
 480: 274,
 2571: 259,
 1: 247,
 527: 244,
 589: 237,
 1196: 234,
 110: 228,
 1270: 226,
 608: 224,
 2858: 220,
 1198: 220,
 780: 218,
 1210: 217,
 588: 215,
 457: 213,
 590: 202,
 2959: 202,
 47: 201,
 50: 201,
 364: 200,
 858: 200,
 150: 200,
 4993: 200,
 380: 198,
 592: 196,
 32: 196,
 2762: 193,
 2028: 191,
 1580: 190,
 5952: 188,
 377: 180,
 595: 176,
 7153: 176,
 344: 175,
 4306: 174,
 648: 168,
 1265: 165,
 1721: 164,
 1197: 163,
 3578: 161,
 1097: 160,
 231: 158,
 1240: 158,
 1704: 157,
 367: 157,
 500: 153,
 1036: 151,
 736: 150,
 1073: 148,
 34: 148,
 1291: 147,
 2716: 147,
 597: 147,
 541: 146,
 1136: 145,
 316: 145,
 1193: 144,
 165: 142,
 6539: 141,
 2628: 138,
 1682: 137,
 733: 135,
 1221: 135,
 5349: 134,
 2918: 133,
 1089: 132,
 293: 132,
 4226: 132,
 1213: 131,
 4963: 130,
 4886: 130,
 586: 129,
 153: 129,
 1214: 127,
 3793: 127,
 587: 126,
 2997: 126,
 8961: 126,
 539: 125,
 1200: 125,
 1617: 125,
 253: 125,
 3114

# get Movie Name - ID match

In [7]:
# Two-way lookup tables between movie IDs and movie names; both start empty
# and are filled by the cell that reads the movies CSV.
movieID_to_name = dict()
name_to_movieID = dict()

In [9]:
# Fill both lookup tables from the movies CSV: column 0 is the movie ID,
# column 1 is the movie name.
# Open the file through the shared moviesPath constant (as the genre and year
# cells do) rather than a bare 'movies.csv', which only resolves when a copy
# of the file sits in the current working directory.
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader) # skip header line
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        # If two movies share a name, the later row wins in this direction.
        name_to_movieID[movieName] = movieID

['movieId', 'title', 'genres']

# get Genres

In [142]:
@timing_func
def getGenres():
    """Build a 0/1 genre vector for every movie in the movies CSV.

    Genre names (the '|'-separated third column) are numbered 0, 1, 2, ...
    in order of first appearance. Each movie is then mapped to a list whose
    length is the number of distinct genres seen, holding 1 at the index of
    each of its genres and 0 elsewhere.

    Returns:
        defaultdict(list) mapping movieID (int) -> list of 0/1 ints.
    """
    genre_index = {}
    vectors = defaultdict(list)
    with open(moviesPath, newline='', encoding='ISO-8859-1') as movie_file:
        rows = csv.reader(movie_file)
        next(rows)  # skip the header line
        for record in rows:
            movie_id = int(record[0])
            # setdefault hands out the next free index the first time a
            # genre name is seen and returns the existing index afterwards.
            vectors[movie_id] = [
                genre_index.setdefault(name, len(genre_index))
                for name in record[2].split('|')
            ]

    # Second pass: turn each list of genre indices into a fixed-width 0/1 vector.
    width = len(genre_index)
    for movie_id, genre_ids in vectors.items():
        present = set(genre_ids)
        vectors[movie_id] = [1 if slot in present else 0 for slot in range(width)]

    return vectors
# Build the genre vectors; @timing_func prints the elapsed time.
getGenres()


--- 0.038099050521850586 seconds ---


defaultdict(list,
            {1: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             2: [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             3: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             4: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             5: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             6: [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             7: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             8: [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             9: [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             10: [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             11: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             12: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             13: [1, 1, 1, 0, 0, 0, 0, 0, 0

In [41]:
# Integer-encode each movie's genres: genres[movieID] becomes the list of
# genre IDs for that movie, with IDs handed out in order of first appearance.
# Start from fresh state (same initial values as getGenres uses) so the cell
# runs on a clean kernel and gives the same result when re-run.
genres = defaultdict(list)
genreIDs = {}
maxGenreID = 0
# Read through the shared moviesPath constant instead of a bare 'movies.csv'.
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip header line
    for row in movieReader:
        movieID = int(row[0])
        genreList = row[2].split('|')
        genreIDList = []
        for genre in genreList:
            if genre in genreIDs:
                genreID = genreIDs[genre]
            else:
                # First time this genre is seen: give it the next free ID.
                genreID = maxGenreID
                maxGenreID += 1
                genreIDs[genre] = genreID
            genreIDList.append(genreID)
        genres[movieID] = genreIDList
# genreIDs

['movieId', 'title', 'genres']

In [43]:
# Convert each movie's list of genre IDs into a 0/1 list of length maxGenreID,
# so that genres[movie] can be treated as a vector.
# NOTE(review): this overwrites genres[movieID] in place, so running the cell a
# second time would read the 0/1 entries as genre IDs; rebuild genres before
# re-running it.
for (movieID, genreIDlist) in genres.items():
    bitfield = [0] * maxGenreID
    for genreID in genreIDlist:
        bitfield[genreID] = 1  # mark this genre as present
    genres[movieID] = bitfield
    


# get Years

In [146]:
# movieID -> year parsed from the title; defaultdict(int) yields 0 for IDs with no stored year.
years = defaultdict(int)
# Optional "(dddd)" group followed by optional whitespace at the end of the string.
# group(1) is the four digits, or None when no parenthesised year ends the title.
p = re.compile(r"(?:\((\d{4})\))?\s*$")

In [149]:
# Read the movies CSV and record, for each movie, the four-digit year found in
# parentheses at the end of its title.
with open(moviesPath, newline = '', encoding = 'ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip header line
    for row in movieReader:
        movieID = int(row[0])
        title = row[1]
        # Every part of the pattern is optional, so it can match the empty
        # string at the end of the title and search() never returns None.
        m = p.search(title)
        year = m.group(1)
        # group(1) is None when the title has no trailing "(dddd)"; such movies get no entry.
        if year : 
            years[movieID] = int(year)
# Show the resulting movieID -> year mapping.
years

['movieId', 'title', 'genres']

defaultdict(int,
            {1: 1995,
             2: 1995,
             3: 1995,
             4: 1995,
             5: 1995,
             6: 1995,
             7: 1995,
             8: 1995,
             9: 1995,
             10: 1995,
             11: 1995,
             12: 1995,
             13: 1995,
             14: 1995,
             15: 1995,
             16: 1995,
             17: 1995,
             18: 1995,
             19: 1995,
             20: 1995,
             21: 1995,
             22: 1995,
             23: 1995,
             24: 1995,
             25: 1995,
             26: 1995,
             27: 1995,
             28: 1995,
             29: 1995,
             30: 1995,
             31: 1995,
             32: 1995,
             34: 1995,
             35: 1995,
             36: 1995,
             37: 1995,
             38: 1995,
             39: 1995,
             40: 1995,
             41: 1995,
             42: 1995,
             43: 1995,
             44: 1995,
   