In [1]:
import re, sys
import math, random
import numpy as np
import operator

from pprint import pprint
from collections import namedtuple

In [2]:
def add_ratings(db, chunks, num):
    '''
    Record one parsed rating line in db.

    chunks[0] is the outer key of db, num is the inner key, and chunks[2]
    converted to int is the stored value. The inner dict is created the
    first time an outer key is seen.
    '''
    per_key = db.setdefault(chunks[0], {})
    per_key[num] = int(chunks[2])

def read_files(db, num):
    '''
    Load every rating line of the file "movies/<num>" into db.

    Inputs: db, the dict updated in place through add_ratings; num, the file
            name inside the movies/ directory, also used as the inner key
            under which each rating is stored.
    Each line is split on commas and handed to add_ratings. Blank lines are
    skipped.
    Raises OSError if the file cannot be opened.
    '''
    movie_file = "movies/" + num
    # The context manager closes the handle even if a malformed line raises.
    with open(movie_file, "r") as fo:
        for line in fo:
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            chunks = line.split(",")
            # Only the last field carries the line terminator.
            chunks[-1] = chunks[-1].strip()
            add_ratings(db, chunks, num)

#### END----- functions to read movie files and create db ----- ####

In [45]:
def score(movies, supp=12):
    '''
    Inputs: movies, an iterable of records exposing .weight, .p, .aux and
            .rating (for example MovieInfo tuples), one per auxiliary movie;
            supp, the divisor applied to every term (defaults to 12).
    Returns the sum, over the records whose rating is not None, of
            weight * (1 - |aux - rating| / p) / supp.
    Records whose rating is None contribute nothing to the score.
    '''
    movie_score = 0

    for movie in movies:
        r = movie.rating
        if r is None:
            continue

        gap = abs(movie.aux - r)
        if movie.p == 0:
            # A zero spread would divide by zero; treat an exact match as
            # full similarity and anything else as none.
            T = 1.0 if gap == 0 else 0.0
        else:
            T = 1 - (gap / movie.p)

        movie_score = movie_score + movie.weight * T / supp

    return movie_score



def compute_weights(db):
    '''
    Input: database mapping each user to a {movie: rating} dict
    Returns a dict mapping each movie to 1 / log10(n), where n is the number
    of users in db that have an entry for that movie

    A movie with exactly one entry raises ZeroDivisionError, because
    log10(1) is 0.
    '''
    # How many users have an entry for each movie.
    entry_counts = {}
    for rated in db.values():
        for movie_id in rated:
            entry_counts[movie_id] = entry_counts.get(movie_id, 0) + 1

    return {
        movie_id: 1 / math.log10(count)
        for movie_id, count in entry_counts.items()
    }
        

def compute_p(db, aux):
    '''
    Inputs: database mapping each user to a {movie: rating} dict, and aux,
            a {movie: rating} dict of auxiliary ratings
    Returns a dict mapping each movie found in db to max - min over its
    ratings, where the movie's auxiliary rating (if aux has one) is counted
    among them. Movies that appear only in aux are not included.
    '''
    # Distinct rating values observed for each movie.
    seen = {}
    for rated in db.values():
        for movie_id, value in rated.items():
            seen.setdefault(movie_id, set()).add(value)

    spreads = {}
    for movie_id, values in seen.items():
        extra = aux.get(movie_id, None)
        if extra is not None:
            values.add(extra)
        spreads[movie_id] = max(values) - min(values)

    return spreads
        

In [46]:
# Ratings database, filled in place by read_files below. Later cells iterate
# it as db.items() -> (user, {movie id: rating}).
db = {}
# File names inside the movies/ directory; read_files also uses each name as
# the movie id key.
files = ["03124", "06315", "07242", "16944", "17113",
        "10935", "11977", "03276", "14199", "08191",
        "06004", "01292", "15267", "03768", "02137"]

for file in files:
    read_files(db, file)

# Record passed to score(): one per auxiliary movie and user.
MovieInfo = namedtuple("MovieInfo", 'id, aux, rating, weight, p')

# Auxiliary ratings keyed by movie id; the keys are a subset of `files`.
auxes = { '14199': 4.5, '17113': 4.2, '06315': 4.0, '01292': 3.3,
        '11977': 4.2, '15267': 4.2, '08191': 3.8, '16944': 4.2,
        '07242': 3.9, '06004': 3.9, '03768': 3.5, '03124': 3.5}

In [54]:
# Question a: weight of every movie in the database
weights = compute_weights(db)
print("Question 1a, Weights")
print("Movie ID, weight")
for movie_id in weights:
    print(f"{movie_id}, {weights[movie_id]}")

# Per-movie rating spread, used as `p` when building the MovieInfo records
ps = compute_p(db, auxes)


Question 1a, Weights
Movie ID, weight
03124, 0.27103391082217876
14199, 0.2711777534611824
06315, 0.2710861246194568
07242, 0.2712302590154106
17113, 0.2712697082686535
10935, 0.27122368998682783
11977, 0.2712894554827877
03768, 0.29119798994551205
02137, 0.29113029735230445
06004, 0.29118443894696805
08191, 0.2712960412389864
15267, 0.2914293175340726
03276, 0.2713553886055061
16944, 0.2711188119190368
01292, 0.29129302224118986
weights={'03124': 0.27103391082217876, '14199': 0.2711777534611824, '06315': 0.2710861246194568, '07242': 0.2712302590154106, '17113': 0.2712697082686535, '10935': 0.27122368998682783, '11977': 0.2712894554827877, '03768': 0.29119798994551205, '02137': 0.29113029735230445, '06004': 0.29118443894696805, '08191': 0.2712960412389864, '15267': 0.2914293175340726, '03276': 0.2713553886055061, '16944': 0.2711188119190368, '01292': 0.29129302224118986}


In [48]:
# Question b: similarity score of every user in db against the auxiliary ratings
similarity_scores = []
for user, movies in db.items():
    # One record per auxiliary movie. `rating` is None when this user has no
    # entry for the movie, which score() skips. The loop variable is named
    # movie_id so the builtin id() is not rebound at notebook scope.
    movies_to_score = [
        MovieInfo(
            id=movie_id,
            aux=aux,
            weight=weights.get(movie_id),
            rating=movies.get(movie_id),
            p=ps.get(movie_id),
        )
        for movie_id, aux in auxes.items()
    ]
    similarity_scores.append((user, score(movies_to_score)))
    
    

In [49]:
# Rank (user, score) pairs from the highest score to the lowest.
similarity_scores = sorted(similarity_scores, key=operator.itemgetter(1), reverse=True)
print("\nQuestion 1b, top 5 similartity scores to aux")
print("User ID, score")
for user_id, user_score in similarity_scores[:5]:
    print(f"{user_id}, {user_score}")


Question 1b, top 5 similartity scores to aux
User ID, score
1664010, 0.26100839011711874
2118461, 0.2369836223572406
716173, 0.15853488799409274
2238060, 0.15536814320167633
16272, 0.14997099609163322


In [50]:
print("\nQuestion 1c, Ratings of user with highest similarity score")
print("Movie ID, Top User Rating, AUX")
# similarity_scores must already be sorted best-first, so entry 0 is the
# top-scoring user. Look that user's ratings up once, not once per movie.
top_user = similarity_scores[0][0]
top_user_ratings = db.get(top_user)
for movie, aux in auxes.items():
    print(f"{movie}, {top_user_ratings.get(movie)}, {aux}")


Question 1c, Ratings of user with highest similarity score
Movie ID, Top User Rating, AUX
14199, 4, 4.5
17113, 4, 4.2
06315, 4, 4.0
01292, 3, 3.3
11977, 4, 4.2
15267, 4, 4.2
08191, 4, 3.8
16944, 4, 4.2
07242, 4, 3.9
06004, 4, 3.9
03768, 4, 3.5
03124, 4, 3.5


In [51]:
# Question 1d
def calculate_M(weights, auxes, supp=12):
    '''
    Inputs: weights, a {movie id: weight} dict; auxes, a dict whose keys are
            the movie ids to include; supp, the divisor applied to every
            weight (defaults to 12).
    Returns the sum of weights[movie_id] / supp over the keys of auxes.
    Raises KeyError if a key of auxes is missing from weights.
    '''
    M = 0
    # Only the keys of auxes matter here; its values are not used.
    for movie_id in auxes:
        M = M + weights[movie_id] / supp
    return M

M = calculate_M(weights, auxes)

# Gap between the best and second-best similarity scores; similarity_scores
# must already be sorted best-first and hold at least two entries.
score_gap = similarity_scores[0][1] - similarity_scores[1][1]

print("\nQuestion 1d")
# The same report is printed once per threshold factor y.
for label, y in (("a.", .1), ("b.", .05)):
    print(label)
    print(f"if y is {y}, M is {M}, the difference between highest and second "
          f"highest similarity score is {score_gap}. "
          f"y*M is {y*M}. {score_gap} > {y*M} is"
          f" {score_gap > y*M}")


Question 1d
a.
if y is 0.1, M is 0.2778839027912863, the difference between highest and second highest similarity score is 0.024024767759878135. y*M is 0.02778839027912863. 0.024024767759878135 > 0.02778839027912863 is False
b.
if y is 0.05, M is 0.2778839027912863, the difference between highest and second highest similarity score is 0.024024767759878135. y*M is 0.013894195139564315. 0.024024767759878135 > 0.013894195139564315 is True
