In [2]:
library(igraph)

In [3]:
# Print the details of the running R build (platform, version, release date).
version

               _                           
platform       x86_64-apple-darwin15.6.0   
arch           x86_64                      
os             darwin15.6.0                
system         x86_64, darwin15.6.0        
status                                     
major          3                           
minor          4.4                         
year           2018                        
month          03                          
day            15                          
svn rev        74408                       
language       R                           
version.string R version 3.4.4 (2018-03-15)
nickname       Someone to Lean On          

In [4]:
load("Q12data.RData")
# Objects used below, as described by the notebook author
# (presumably all restored by the load() call above; verify with ls()):
# g                <- the movie graph built in part 2
# communities      <- the community-detection result from part 2
# actorId2pagerank <- PageRank per actor; IDs are 0-based, so actor 0 is actorId2pagerank[1]
# movieId2actorIds <- actor list per movie; IDs are 0-based, so movie 0 is movieId2actorIds[[1]]

In [5]:
# The three movies whose ratings are to be predicted: display titles and the
# matching movie IDs, kept in the same order so position i pairs them up.
movie_name_targets <- c(
    "Batman v Superman: Dawn of Justice (2016)",
    "Mission: Impossible - Rogue Nation (2015)",
    "Minions (2015)"
)
movie_id_targets <- c(10321, 39182, 78995)

### Q13
actor weight: the average of the top 5 ratings of the actor's movies (all of them when the actor has fewer than 5 rated movies; 0 when the actor has none)

features: top 5 actor weights for each movie

model: average the features

In [None]:
# Build per-actor rating lists and actor weights from every rated movie in g.
#
# actor_ratings[[a + 1]] holds the ratings of the rated movies whose actor list
# contains actor `a` (actor IDs are 0-based, R lists are 1-based), in vertex
# order; slots of actors that never appear in a rated movie stay NULL.
# actor_weight[a + 1] is the mean of that actor's (up to) five highest
# ratings, or 0 when the actor has no rated movie.
# NOTE(review): the weights are built from the same ratings that are used as
# prediction targets further down, so in-sample errors are presumably
# optimistic -- confirm whether that is intended.

# Extract the vertex attributes once instead of once per vertex.
vertex_ratings <- V(g)$rating
rated <- which(!is.na(vertex_ratings))
rated_movie_ids <- as.numeric(V(g)$name[rated])

# Flatten to one (actor slot, rating) pair per appearance of an actor in a
# rated movie; movie IDs are 0-based, hence the + 1.
casts <- movieId2actorIds[rated_movie_ids + 1]
cast_actor_slots <- unlist(casts) + 1
cast_ratings <- rep(vertex_ratings[rated], lengths(casts))

# Group the ratings by actor slot. split() orders its groups by the sorted
# unique slot values, which is exactly sort(unique(...)).
n_slots <- if (length(cast_actor_slots) > 0) max(cast_actor_slots) else 0
actor_ratings <- vector("list", n_slots)
slots_present <- sort(unique(cast_actor_slots))
actor_ratings[slots_present] <- unname(split(cast_ratings, cast_actor_slots))

actor_weight <- vapply(
    actor_ratings,
    function(ratings) {
        if (is.null(ratings)) {
            return(0)
        }
        # head() takes all ratings when the actor has fewer than five.
        mean(head(sort(ratings, decreasing = TRUE), 5))
    },
    numeric(1)
)

In [27]:
# Predict a movie's rating as the mean of its top actor weights.
#
# Args:
#   movieId: 0-based movie ID; the cast is read from the global
#            movieId2actorIds[[movieId + 1]].
#
# Returns:
#   The mean of the (up to) five largest weights of the cast, looked up in the
#   global actor_weight (indexed by actor ID + 1). Cast members without an
#   entry in actor_weight are ignored. NaN when no cast member has a weight,
#   including when the movie has no cast at all.
predict_rating <- function(movieId) {
    actors <- unlist(movieId2actorIds[[movieId + 1]])
    # One vectorised lookup; sort() drops the NA produced by actors that fall
    # outside actor_weight.
    weights <- sort(actor_weight[actors + 1], decreasing = TRUE)
    mean(head(weights, 5))
}

In [28]:
# Collect, for every vertex of g that has a rating, the known rating (truth)
# and the averaged-actor-weights prediction from predict_rating().
# Vertex names are the movie IDs stored as text, hence the as.numeric().
vertex_ratings <- V(g)$rating          # extracted once, not once per vertex
has_rating <- !is.na(vertex_ratings)

truth <- vertex_ratings[has_rating]
prediction <- vapply(
    as.numeric(V(g)$name[has_rating]),
    predict_rating,
    numeric(1)
)

In [29]:
# Root-mean-square error of the averaged-weights predictions.
cat(
    "RMSE:",
    sqrt(mean((prediction - truth)^2))
)

RMSE: 2.10236

In [30]:
# For each target movie, print its title, the rating stored on its vertex in g
# (NA when none is stored) and the averaged-actor-weights prediction.
vertex_names <- V(g)$name      # loop-invariant: extract once
vertex_ratings <- V(g)$rating
for (i in seq_along(movie_id_targets)) {
    movie_id <- movie_id_targets[i]
    cat(movie_name_targets[i], "\n")
    # Vertex names are text; == coerces the numeric ID for the comparison.
    node_id <- which(vertex_names == movie_id)
    rating_truth <- vertex_ratings[node_id]
    cat("Ground truth rating:", rating_truth, "\n")
    cat("Predicted rating:", predict_rating(movie_id), "\n", "\n")
}

Batman v Superman: Dawn of Justice (2016) 
Ground truth rating: NA 
Predicted rating: 8.511 
 
Mission: Impossible - Rogue Nation (2015) 
Ground truth rating: NA 
Predicted rating: 8.26 
 
Minions (2015) 
Ground truth rating: NA 
Predicted rating: 9.212 
 


model: linear regression

In [34]:
# Feature vector of a movie: its five largest actor weights.
#
# Args:
#   movieId: 0-based movie ID; the cast is read from the global
#            movieId2actorIds[[movieId + 1]].
#
# Returns:
#   A numeric vector of length 5 with the cast's largest weights (from the
#   global actor_weight, indexed by actor ID + 1) in decreasing order. It is
#   padded with NA when fewer than five cast members have a weight, so the
#   length is 5 even for a movie without any cast.
get_features <- function(movieId) {
    actors <- unlist(movieId2actorIds[[movieId + 1]])
    # One vectorised lookup; sort() drops the NA produced by actors that fall
    # outside actor_weight, and [1:5] pads the tail with NA.
    sort(actor_weight[actors + 1], decreasing = TRUE)[1:5]
}

# Assemble the regression table (one row per rated movie: its rating and its
# five top actor weights) and fit a linear model of rating on the weights.
# The table is built in one step; appending a row per movie copies the whole
# data frame on every iteration.
vertex_ratings <- V(g)$rating          # extracted once, not once per vertex
rated <- which(!is.na(vertex_ratings))
rated_movie_ids <- as.numeric(V(g)$name[rated])

# vapply() returns one column per movie; transpose to one row per movie.
feature_matrix <- t(vapply(rated_movie_ids, get_features, numeric(5)))

df <- data.frame(vertex_ratings[rated], feature_matrix)
colnames(df) <- c("rating","weight1","weight2","weight3","weight4","weight5")

# Rows with NA features (movies with fewer than five weighted actors) are
# left out of the fit under R's default na.action.
mod <- lm(rating ~ ., data = df)
summary(mod)


Call:
lm(formula = rating ~ ., data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.1976 -0.7230  0.1180  0.8453  4.4090 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.493274   0.055214  27.045  < 2e-16 ***
weight1     -0.049843   0.013176  -3.783 0.000155 ***
weight2      0.003408   0.022975   0.148 0.882072    
weight3      0.154383   0.027220   5.672 1.42e-08 ***
weight4      0.286966   0.025775  11.134  < 2e-16 ***
weight5      0.208258   0.014958  13.922  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.187 on 133313 degrees of freedom
Multiple R-squared:  0.0927,	Adjusted R-squared:  0.09266 
F-statistic:  2724 on 5 and 133313 DF,  p-value: < 2.2e-16


In [35]:
# In-sample error of the fitted linear model: residual sum of squares, its
# mean over the residuals, and the square root of that mean.
RSS <- drop(crossprod(mod$residuals))
MSE <- RSS / length(mod$residuals)
RMSE <- sqrt(MSE)
cat("RMSE:", RMSE)

RMSE: 1.186899

In [36]:
# For each target movie, print its title, the rating stored on its vertex in g
# (NA when none is stored) and the rating predicted by the linear model `mod`.
feature_names <- c("weight1","weight2","weight3","weight4","weight5")
vertex_names <- V(g)$name      # loop-invariant: extract once
vertex_ratings <- V(g)$rating
for (i in seq_along(movie_id_targets)) {
    movie_id <- movie_id_targets[i]
    cat(movie_name_targets[i], "\n")

    # Vertex names are text; == coerces the numeric ID for the comparison.
    node_id <- which(vertex_names == movie_id)
    rating_truth <- vertex_ratings[node_id]
    cat("Ground truth rating:", rating_truth, "\n")

    # One-row data frame whose columns carry the predictor names used in the fit.
    features <- as.data.frame(t(get_features(movie_id)))
    colnames(features) <- feature_names
    rating_prediction <- predict(mod, features)
    cat("Predicted rating:", rating_prediction, "\n", "\n")
}

Batman v Superman: Dawn of Justice (2016) 
Ground truth rating: NA 
Predicted rating: 6.544315 
 
Mission: Impossible - Rogue Nation (2015) 
Ground truth rating: NA 
Predicted rating: 6.458406 
 
Minions (2015) 
Ground truth rating: NA 
Predicted rating: 6.917641 
 
