# Recommenders / Collaborative filtering
*Kája Trachtová, Michaela Kecskésová, Martin Špilar, Dagmar Al Tukmachi*

+ The goal of this assignment was to train a recommender system.
+ The input data is a table with `N` rows and `M` columns, where `N` is the number of people rating tweets and `M` is the number of tweets in the dataset.
+ The score scale goes from 1 (worst) to 7 (best); `NA` marks missing values (the person did not rate the tweet).
+ The output should be a table of the same size in which the `NA`s are replaced by estimated scores.

## Load libraries

In [2]:
library(keras)
library(tidyverse)
library(glue)
library(data.table)

## Read and transform input data

In [3]:
# Load the wide ratings table. The first column has no header in the CSV, so
# read_csv() fills its name in as `X1` (see the message printed below).
table <- read_csv("/kaggle/input/tweet-ratings/fake_v1_100x252.csv")

“Missing column names filled in: 'X1' [1]”

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  .default = col_double(),
  X1 = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.




+ transform the table from wide to long format so that both users and tweets are separate columns

In [4]:
# Convert to a data.table by reference; `patterns()` below is a data.table
# helper for selecting measure columns by regular expression.
setDT(table)

# Reshape wide -> long: one row per (user, tweet) pair. Column 1 (the user
# label) is the identifier; every column whose name starts with "tweet" is
# stacked into `rating`, and the originating column name is kept in `tweet`.
# Argument names are written out in full rather than relying on partial
# matching of `id` / `measure`.
table_long <- melt(
  table,
  id.vars = 1,
  measure.vars = patterns("^tweet"),
  variable.name = "tweet",
  value.name = "rating"
)
head(table_long)

X1,tweet,rating
<chr>,<fct>,<dbl>
user1,tweet1,4.0
user2,tweet1,
user3,tweet1,
user4,tweet1,4.0
user5,tweet1,
user6,tweet1,


+ recode strings with users and tweets to integers

In [5]:
# Encode user and tweet labels as zero-based numeric ids (factor codes minus
# one). The codes follow the factor level order, so user ids follow the sorted
# labels rather than the numeric suffix (user2 becomes 12 in the output below).
table_long <- table_long %>%
  mutate(
    user_id = as.numeric(factor(X1)) - 1,
    tweet_id = as.numeric(factor(tweet)) - 1
  )

head(table_long)

X1,tweet,rating,user_id,tweet_id
<chr>,<fct>,<dbl>,<dbl>,<dbl>
user1,tweet1,4.0,0,0
user2,tweet1,,12,0
user3,tweet1,,23,0
user4,tweet1,4.0,34,0
user5,tweet1,,45,0
user6,tweet1,,56,0


+ remove NA values

In [7]:
# Keep only the observed ratings: every row containing an NA is dropped
table_final <- table_long %>% na.omit()
head(table_final)

X1,tweet,rating,user_id,tweet_id
<chr>,<fct>,<dbl>,<dbl>,<dbl>
user1,tweet1,4,0,0
user4,tweet1,4,34,0
user18,tweet1,3,10,0
user22,tweet1,4,15,0
user30,tweet1,5,24,0
user43,tweet1,5,38,0


In [8]:
# Sizes of the id vocabularies, used as `input_dim` of the embedding layers.
# The ids were assigned on `table_long` (before NA removal), so they are counted
# there: a user or tweet without any rating still occupies an id, and the
# embedding tables must be large enough to hold the largest id.
n_tweets <- n_distinct(table_long$tweet_id)
n_users <- n_distinct(table_long$user_id)

# The summary sentence describes the observed ratings only. The named arguments
# give glue() temporary values for the placeholders, so the reported counts are
# still taken from `table_final`.
glue(
  "This dataset includes {nrow(table_final)} ratings by {n_users} users on {n_tweets} unique tweets",
  n_users = n_distinct(table_final$X1),
  n_tweets = n_distinct(table_final$tweet)
)

## Basic model

In [9]:
# Seed R's random number generator.
# NOTE(review): this presumably does not seed TensorFlow itself - confirm if
# reproducible training is required.
set.seed(123)

# Features: two-column matrix of zero-based user and tweet ids
x_train <- as.matrix(select(table_final, user_id, tweet_id))
# Target: the observed rating of each (user, tweet) pair
y_train <- table_final[["rating"]]

head(x_train)
head(y_train)

user_id,tweet_id
0,0
34,0
10,0
15,0
24,0
38,0


In [10]:
# Length of the vector learned for every user and every tweet
embedding_dim <- 32

# Each sample supplies a single user id and a single tweet id
input_users <- layer_input(shape = 1, name = "users")
input_tweets <- layer_input(shape = 1, name = "tweets")

# Lookup tables mapping an id to its embedding vector
user_embeddings <- layer_embedding(
  input_users,
  input_dim = n_users,
  output_dim = embedding_dim,
  name = "user_embeddings"
)

tweet_embeddings <- layer_embedding(
  input_tweets,
  input_dim = n_tweets,
  output_dim = embedding_dim,
  name = "tweet_embeddings"
)

In [11]:
# Interaction term: dot product of the user and tweet embeddings, taken along
# the embedding axis
dot <- layer_dot(
  inputs = list(user_embeddings, tweet_embeddings),
  axes = 2,
  name = "dot_product"
)

# A single dense unit with ReLU activation maps the dot product to the rating
pred <- layer_dense(
  dot,
  units = 1,
  activation = "relu",
  name = "rating_prediction"
)

In [12]:
# define model inputs/outputs
model <- keras_model(
  inputs = list(input_users, input_tweets),
  outputs = pred
)

# Train on mean squared error and additionally track mean absolute error.
# The argument is spelled `metrics` in full; the previous `metric` only worked
# through partial argument matching.
model %>% compile(
  optimizer = "rmsprop",
  loss = "mse",
  metrics = "mae"
)

# inspect model
summary(model)

Model: "functional_1"
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
users (InputLayer)        [(None, 1)]       0                                   
________________________________________________________________________________
tweets (InputLayer)       [(None, 1)]       0                                   
________________________________________________________________________________
user_embeddings (Embeddin (None, 1, 32)     3200     users[0][0]                
________________________________________________________________________________
tweet_embeddings (Embeddi (None, 1, 32)     8064     tweets[0][0]               
________________________________________________________________________________
dot_product (Dot)         (None, 1, 1)      0        user_embeddings[0][0]      
                                                     tweet_embeddings[0][0]     
______

In [13]:
# Fit the basic model on the observed ratings. The user ids and tweet ids are
# passed as two separate one-column matrices, 20% of the data is held out for
# validation, and early stopping uses a patience of 2 epochs.
history <- fit(
  model,
  x = list(
    x_train[, "user_id", drop = FALSE],
    x_train[, "tweet_id", drop = FALSE]
  ),
  y = y_train,
  batch_size = 16,
  epochs = 30,
  validation_split = 0.2,
  callbacks = list(callback_early_stopping(patience = 2))
)

In [14]:
# Epoch with the lowest validation loss. which.min() always returns a single
# index (the first minimum); `which(x == min(x))` returns every tied epoch,
# which would make glue() print one sentence per tie.
best_epoch <- which.min(history$metrics$val_loss)
loss <- round(history$metrics$val_loss[best_epoch], 3)
mae <- round(history$metrics$val_mae[best_epoch], 3)

glue("The best epoch had a loss of {loss} and mean absolute error of {mae}")

## Model with bias

In [15]:
# input layers
input_users <- layer_input(shape = 1, name = "users")
input_tweets <- layer_input(shape = 1, name = "tweets")

# embedding vectors for users and tweets (same layout as in the basic model)
user_embeddings <- input_users %>%
  layer_embedding(
    input_dim = n_users,
    output_dim = embedding_dim,
    name = "user_embeddings"
  )

tweet_embeddings <- input_tweets %>%
  layer_embedding(
    input_dim = n_tweets,
    output_dim = embedding_dim,
    name = "tweet_embeddings"
  )

# one learned scalar offset per user
user_bias <- input_users %>%
  layer_embedding(
    input_dim = n_users,
    output_dim = 1,
    name = "user_bias"
  )

# One learned scalar offset per tweet. It has to be looked up with the tweet
# id: feeding `input_users` here indexed the tweet-bias table by user id, so
# the layer acted as a second user bias.
tweet_bias <- input_tweets %>%
  layer_embedding(
    input_dim = n_tweets,
    output_dim = 1,
    name = "tweet_bias"
  )



In [16]:
# Interaction term: dot product of the user and tweet embeddings
dot <- layer_dot(
  inputs = list(user_embeddings, tweet_embeddings),
  axes = 2,
  name = "dot_product"
)

# Element-wise sum of the interaction term and the two bias terms
dot_bias <- layer_add(
  list(dot, user_bias, tweet_bias),
  name = "add_bias"
)

# A single dense unit with ReLU activation produces the rating
pred <- layer_dense(
  dot_bias,
  units = 1,
  activation = "relu",
  name = "rating_prediction"
)

In [17]:
# define model inputs/outputs
model_bias <- keras_model(
  inputs = list(input_users, input_tweets),
  outputs = pred
)

# Train on mean squared error and additionally track mean absolute error.
# The argument is spelled `metrics` in full; the previous `metric` only worked
# through partial argument matching.
model_bias %>% compile(
  optimizer = "rmsprop",
  loss = "mse",
  metrics = "mae"
)

# inspect model
summary(model_bias)

Model: "functional_3"
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
users (InputLayer)        [(None, 1)]       0                                   
________________________________________________________________________________
tweets (InputLayer)       [(None, 1)]       0                                   
________________________________________________________________________________
user_embeddings (Embeddin (None, 1, 32)     3200     users[0][0]                
________________________________________________________________________________
tweet_embeddings (Embeddi (None, 1, 32)     8064     tweets[0][0]               
________________________________________________________________________________
dot_product (Dot)         (None, 1, 1)      0        user_embeddings[0][0]      
                                                     tweet_embeddings[0][0]     
______

In [19]:
# Fit the bias model with the same data and settings as the basic model:
# separate one-column id matrices as inputs, 20% validation split and early
# stopping with a patience of 2 epochs.
history_bias <- fit(
  model_bias,
  x = list(
    x_train[, "user_id", drop = FALSE],
    x_train[, "tweet_id", drop = FALSE]
  ),
  y = y_train,
  batch_size = 16,
  epochs = 30,
  validation_split = 0.2,
  callbacks = list(callback_early_stopping(patience = 2))
)

In [20]:
# Epoch with the lowest validation loss of the bias model. which.min() always
# returns a single index (the first minimum); `which(x == min(x))` returns
# every tied epoch, which would make glue() print one sentence per tie.
best_epoch <- which.min(history_bias$metrics$val_loss)
loss <- round(history_bias$metrics$val_loss[best_epoch], 3)
mae <- round(history_bias$metrics$val_mae[best_epoch], 3)

glue("The best epoch had a loss of {loss} and mean absolute error of {mae}")

## Extract predicted user ratings

+ create function that will, for given user, extract rating predictions for un-rated tweets

In [21]:
# Predict ratings for every tweet that a given user has not rated yet.
#
# Arguments:
#   user_id      zero-based user id, as stored in the `user_id` column
#   table_full   long table of all user/tweet pairs (e.g. `table_long`); the
#                columns `tweet_id` and `tweet` are read from it
#   table_final  long table of observed ratings only (e.g. `table_final`); the
#                columns `user_id` and `tweet_id` are read from it
#   model        model to use for predictions; it is called with a list of two
#                one-column matrices (user ids, tweet ids)
#
# Returns a tibble with the columns `user_id`, `tweet_id`, `predictions` and
# `tweet`, sorted by decreasing predicted rating. When the user has already
# rated every tweet, a zero-row tibble with the same columns is returned.
predict_rating <- function(user_id, table_full, table_final, model) {
  # Inside filter() a bare `user_id` refers to the column, so the argument is
  # copied under a different name before it is compared against that column.
  new_user_id <- user_id

  # get tweets rated by our user
  tweets_rated <- table_final %>%
    filter(user_id == new_user_id) %>%
    pull(tweet_id)

  # get all available tweets
  all_tweets <- table_full %>%
    distinct(tweet_id) %>%
    pull(tweet_id)

  # identify tweets not rated by our user
  tweets_not_rated <- setdiff(all_tweets, tweets_rated)

  # lookup table tweet_id -> tweet label for the candidate tweets
  tweet_options <- table_full %>%
    filter(tweet_id %in% tweets_not_rated) %>%
    distinct(tweet_id, tweet)

  # Nothing to predict: return an empty result of the usual layout instead of
  # calling the model on an empty input.
  if (length(tweets_not_rated) == 0) {
    empty_result <- tibble(
      user_id = numeric(0),
      tweet_id = numeric(0),
      predictions = numeric(0)
    ) %>%
      left_join(tweet_options, by = "tweet_id")
    return(empty_result)
  }

  # one (user, tweet) row per tweet to score
  customer_options <- expand.grid(
    user_id = new_user_id,
    tweet_id = tweets_not_rated
  ) %>%
    as.matrix()

  inputs <- list(
    customer_options[, "user_id", drop = FALSE],
    customer_options[, "tweet_id", drop = FALSE]
  )

  pred <- model %>% predict(inputs)

  # attach predictions and tweet labels, best predicted tweets first
  customer_options %>%
    as_tibble() %>%
    mutate(predictions = as.vector(pred)) %>%
    left_join(tweet_options, by = "tweet_id") %>%
    arrange(desc(predictions))
}

+ iterate through all users and obtain table with rating predictions

In [23]:
# get all users
all_users <- unique(table_long$user_id)

# Get the rating predictions of every user as a list of tibbles. The ids are
# consecutive zero-based factor codes, so iterating over them in sorted order
# puts the result for id k at list position k + 1, without growing the list
# element by element inside a loop.
# NOTE(review): predictions are taken from the basic `model`, not from
# `model_bias` - confirm that this is intended.
datalist <- lapply(sort(all_users), function(uid) {
  user_predictions <- predict_rating(uid, table_long, table_final, model)
  # attach the original user label that belongs to this id
  user_predictions$user <- unique(table_long$X1[table_long$user_id == uid])
  user_predictions
})

In [24]:
# Stack the per-user prediction tables into one table and rename its columns
# (by position) to the names used in `table_final`
prediction_data <- setNames(
  do.call(rbind, datalist),
  c("user_id", "tweet_id", "rating", "tweet", "X1")
)

In [25]:
# Compare the layout of the predictions with that of the observed ratings
head(prediction_data)
head(table_final)

user_id,tweet_id,rating,tweet,X1
<dbl>,<dbl>,<dbl>,<fct>,<chr>
0,71,5.596436,tweet72,user1
0,70,5.497181,tweet71,user1
0,158,5.277966,tweet159,user1
0,51,5.172799,tweet52,user1
0,165,5.117799,tweet166,user1
0,116,5.05956,tweet117,user1


X1,tweet,rating,user_id,tweet_id
<chr>,<fct>,<dbl>,<dbl>,<dbl>
user1,tweet1,4,0,0
user4,tweet1,4,34,0
user18,tweet1,3,10,0
user22,tweet1,4,15,0
user30,tweet1,5,24,0
user43,tweet1,5,38,0


+ merge original table (NAs removed) with predictions

In [28]:
# Stack predicted and observed ratings into one long data frame. Both inputs
# are converted to plain data frames first.
full_table <- rbind(as.data.frame(prediction_data), as.data.frame(table_final))

In [29]:
# remove unnecessary columns: the numeric ids are dropped, the `X1` and `tweet`
# labels stay
full_table <- select(full_table, -user_id, -tweet_id)

+ finally, transform table with all ratings to wide format
+ to make it comparable with the original table, rename and reorder columns

In [31]:
# Long -> wide: one row per user (`X1`), one `rating.<tweet>` column per tweet
full_table <- reshape(
  full_table,
  idvar = "X1",
  timevar = "tweet",
  direction = "wide"
)

# Strip the "rating." prefix that reshape() puts in front of each tweet name.
# The pattern is anchored and the dot escaped, so only that literal prefix is
# removed.
names(full_table) <- sub("^rating\\.", "", names(full_table))

# Put `X1` first, followed by the tweet columns in sorted order. Selecting the
# columns by name keeps every label attached to its own data; the previous
# positional renaming assumed that "X1" sorts after all "rating.*" names,
# which depends on the locale's collation order.
tweet_cols <- sort(setdiff(names(full_table), "X1"))
full_table <- full_table[, c("X1", tweet_cols)]

In [32]:
# Show the top-left corner of the completed ratings table
full_table[1:11,1:11]

Unnamed: 0_level_0,X1,tweet1,tweet10,tweet100,tweet101,tweet102,tweet103,tweet104,tweet105,tweet106,tweet107
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,user1,4.0,5.037394,4.089828,4.329518,4.368783,4.073334,5.0,4.0,4.936341,4.0
203,user10,3.48809,4.613604,4.0,4.0,3.951655,3.839702,4.478663,4.0,4.631041,4.497468
405,user100,4.085387,4.458745,4.169681,3.758139,4.670711,4.0,4.247487,4.0,3.792134,4.415625
607,user11,4.203773,3.0,4.079377,5.0,5.0,3.822865,3.658921,3.84127,3.53,3.384495
809,user12,3.759059,4.685096,4.094869,3.988541,5.0,4.0,4.670983,4.024569,3.837465,4.838521
1011,user13,4.839155,3.878043,4.248652,3.44969,4.691702,3.908241,4.0,4.045667,3.269952,4.253299
1213,user14,4.077003,4.258509,3.918088,3.804352,4.0,3.861645,4.0,4.03478,4.093985,3.849784
1415,user15,4.184552,4.004817,3.952168,3.731057,3.922584,3.682425,3.822258,3.883625,4.0,3.500751
1617,user16,3.893927,4.398456,4.111246,3.939642,4.249205,3.927216,4.0,4.032085,4.541065,3.240675
1819,user17,3.794752,5.0,3.946935,4.0,4.008647,3.852845,4.204991,5.0,4.229907,3.647207


In [225]:
# Show the top-left corner of the original wide table for comparison
table[1:21,1:11]

X1,tweet1,tweet2,tweet3,tweet4,tweet5,tweet6,tweet7,tweet8,tweet9,tweet10
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
user1,4.0,,,,,,,,,
user2,,,,5.0,,5.0,,,,
user3,,4.0,,,,,4.0,,,
user4,4.0,,,,,4.0,,4.0,,4.0
user5,,,5.0,,,,,,,
user6,,,,,,,,4.0,,
user7,,,,,,,,,,
user8,,,,,4.0,,4.0,,,
user9,,,5.0,,,,,,,
user10,,,,4.0,,4.0,,,,
