# User-User Filtering

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir Recommendations
!mount --bind /content/drive/My\ Drive/Recommendations /content/Recommendations/
%cd /content/Recommendations

/content/Recommendations


In [None]:
import sys
import json
import pandas as pd
import numpy as np
from statistics import mean
from time import sleep
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

##Generate a set of User-User Weights
* Weights are calculated simply using a Pearson Correlation.
* 138K users gives a total of over 10^10 User-User combinations (too many)
* The reference user group will be narrowed down to a subset of users, roughly 10%
* Users in this subset should all come from a top percentage of most frequent users (percentage to be determined experimentally)
### Possibilities for Improvement
* Weight the user-user combinations based on number of common films as well as taste similarity


In [None]:
train_df, test_df, users = preprocess_data('rating.csv')

Reading CSV file
Getting most frequent users


In [None]:
subset_dim = 0.01

def get_most_frequent_movies(df, subset_size):

    ordered = df.groupby(['movieId']).agg(['count']).sort_values(('userId', 'count'), ascending=False)[('userId', 'count')]
    ordered_ids = ordered.index.values.tolist()
    return ordered_ids[:round(subset_size*len(ordered_ids))]  

def get_most_frequent_users(df, subset_size):
    
    ordered = df.groupby(['userId']).agg(['count']).sort_values(('movieId', 'count'), ascending=False)[('movieId', 'count')]
    ordered_ids = ordered.index.values.tolist()
    return ordered_ids[:round(subset_size*len(ordered_ids))]

def get_subset_data(df, users, movies):

    df = df.loc[df['userId'].isin(users)]
    df = df.loc[df['movieId'].isin(movies)]
    return df

def train_test_split(df, test_frac=0.2):
    
    test_df = df.sample(frac=test_frac).sort_values(['userId', 'movieId'])
    train_df = df[~df.isin(test_df)].dropna()
    return train_df, test_df  

def preprocess_data(rating_csv, subset_dim=0.01):

    print("Reading CSV file")
    df = pd.read_csv(rating_csv)

    print("Getting most frequent users")
    movies = get_most_frequent_movies(df, subset_dim)
    users = get_most_frequent_users(df, subset_dim)

    df = get_subset_data(df, users, movies)
    train_df, test_df = train_test_split(df)

    return train_df, test_df, users

def calculate_user_user_weights(rating_csv):
    
    train_df, test_df, users = preprocess_data(rating_csv)

    min_common_movies = 5

    print("Initializing empty weights matrix\n")
    weights = np.empty((len(users), len(users)))

    print(f"Building weights for {len(users)} users")

    for idx, i in enumerate(users):
        user_df = train_df.loc[train_df['userId'] == i]
        for jdx, j in enumerate(users):
            if i == j: continue
            ref_user_df = train_df.loc[train_df['userId'] == j]
            merged = user_df.merge(ref_user_df, how="inner", on="movieId")
            if merged["movieId"].size >= min_common_movies:
                r, p = pearsonr(merged["rating_x"].values, merged["rating_y"].values)
                weights[idx - 1, jdx - 1] = r
                if jdx % 10 == 0:
                    print(f"\rWeight {idx} {jdx}: {r}", sep=' ', end='', flush=True)
    
    return weights, test_df, users

def calculate_expected_ratings(df, test_df, user_list):

    train_df = df[~df.isin(test_df)].dropna()
    means_dict = {}
    for user in user_list:
        user_df = train_df.loc[train_df["userId"]==user]
        avg = mean(user_df["rating"].values.tolist())
        means_dict[int(user)] = avg
    
    return means_dict

In [None]:
weights, test_df, users = calculate_user_user_weights("rating.csv")

Reading CSV file
Getting most frequent users
Initializing empty weights matrix

Building weights for 1385 users
Weight 1384 1380: 0.37594442242359954

In [None]:
df = pd.read_csv("rating.csv")
movies = get_most_frequent_movies(df, subset_dim)
users = get_most_frequent_users(df, subset_dim)
df = get_subset_data(df, users, movies)

means_dict = calculate_expected_ratings(df, test_df, users)

In [None]:
np.save("users.npy", np.array(users))
np.save("weights.npy", weights)
test_df.to_csv("test_df.csv")

with open('means.json', 'w') as f:
    json.dump(means_dict, f)

## Perform prediction on the test data

In [None]:
def generate_user_movie_bags(df):

    user_movie_bags = {}
    for user in df["userId"].unique():
        movies = df.loc[df["userId"]==user]["movieId"].values.astype(int).tolist()
        user_movie_bags[user] = set(movies)

    return user_movie_bags

def get_similar_users(user_idx, movie, weight_sort, users_list, user_movie_bags,
                      num_similar_users=25):

    similar_users = []
    idxs_row = weight_sort[user_idx].tolist()
    while len(similar_users) < num_similar_users and len(idxs_row) > 0:

        if abs(idxs_row[-1]) >= abs(idxs_row[0]):
            next_user = idxs_row.pop(-1)
        else:
            next_user = idxs_row.pop(0)

        if movie in user_movie_bags[users_list[next_user]]:
            similar_users.append(users_list[next_user])

    return similar_users

def predict(df, user, movie, weights, weight_sort, users_list, user_movie_bags, 
            means):
    
    user_idx = users_list.index(user)
    similar_users = get_similar_users(user_idx, movie, weight_sort, users_list,
                                      user_movie_bags)
    dev_sum = 0
    weight_sum = 0
    for ref_user in similar_users:
        ref_user_idx = users_list.index(ref_user)
        ref_mean = float(means[str(ref_user)])
        ref_rating = df.loc[(df["userId"]==ref_user) & (df["movieId"]==movie)]["rating"].values[0]
        dev_sum += weights[user_idx, ref_user_idx] * (ref_rating - ref_mean)
        weight_sum += abs(weights[user_idx, ref_user_idx])

    return means[str(user)] + (dev_sum / weight_sum)

def get_predictions(users_list, weights, test_df, rating_csv, means):

    print("Loading CSV")
    df = pd.read_csv(rating_csv)

    print("Getting dense subset")
    movies = get_most_frequent_movies(df, subset_dim)
    users = get_most_frequent_users(df, subset_dim)
    df = get_subset_data(df, users, movies)
    df = df[~df.isin(test_df)].dropna()

    print("Generating bags")
    user_movie_bags = generate_user_movie_bags(df)

    weight_sort = np.argsort(weights)

    print("Starting predictions")
    predicted = []
    num_rows = len(test_df["userId"].values.tolist())
    for i, row in test_df.iterrows():
        user = row["userId"]
        movie = row["movieId"]
        prediction = predict(df, user, movie, weights, weight_sort, users_list, 
                             user_movie_bags, means)
        predicted.append(prediction)
        if i % 10 == 0:
            print(f"\rCompleted {i + 1} of {num_rows} rows", sep=' ', end='', flush=True)

    return predicted

def accuracy(test_df, predicted):
    true = test_df["rating"].values.tolist()
    preds_clean = []
    true_clean = []
    for i in range(len(predicted)):
        if not np.isnan(predictions[i]):
            preds_clean.append(predicted[i])
            true_clean.append(true[i])
    return mean_squared_error(true_clean, preds_clean)

In [None]:
users_list = np.load("users.npy").tolist()
weights = np.load("weights.npy")
test_df = pd.read_csv("test_df.csv")
means = json.load(open('means.json'))

In [None]:
predictions = get_predictions(users_list, weights, test_df, "rating.csv", means)
acc = accuracy(test_df, predictions)
print(f"\nMSE Error: {acc}")

In [None]:
acc = accuracy(test_df, predictions)
print(f"\nMSE Error: {acc}")


MSE Error: 0.7019937140336509


In [None]:
test_df["rating"].isna().sum()

0

In [None]:
for p in predictions:
    print(p, np.isnan(p))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3.7101182254935288 False
3.8380859093905944 False
3.7057151501966774 False
3.718915115561652 False
3.7343395599658935 False
3.840823906231159 False
3.8923671943028735 False
3.611493903232603 False
3.7268573216263468 False
nan True
nan True
nan True
nan True
nan True
nan True
4.5386387314156815 False
nan True
4.1157598971204195 False
nan True
nan True
4.570801103285805 False
4.502650579757705 False
nan True
nan True
3.5472224667703856 False
3.7860093219709254 False
4.446459299502747 False
nan True
nan True
nan True
nan True
nan True
nan True
4.325340494484862 False
nan True
3.8211983983624402 False
nan True
nan True
nan True
3.629733877785622 False
nan True
nan True
4.061356927793989 False
nan True
nan True
3.9384196154538813 False
5.216620307633691 False
nan True
nan True
4.2696080034739 False
5.287380612737472 False
4.110137737695549 False
nan True
nan True
3.6811662242937415 False
nan True
nan True
2.0800703623333523 Fa

KeyboardInterrupt: ignored

# User-Item Alternating Least Squares

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir Recommendations
!mount --bind /content/drive/My\ Drive/Recommendations /content/Recommendations/
%cd /content/Recommendations

/content/Recommendations


In [None]:
import pandas as pd
import numpy as np

In [29]:
subset_dim = 0.001

def get_most_frequent_movies(df, subset_size):

    ordered = df.groupby(['movieId']).agg(['count']).sort_values(('userId', 'count'), ascending=False)[('userId', 'count')]
    ordered_ids = ordered.index.values.tolist()
    return ordered_ids[:round(subset_size*len(ordered_ids))]  

def get_most_frequent_users(df, subset_size):
    
    ordered = df.groupby(['userId']).agg(['count']).sort_values(('movieId', 'count'), ascending=False)[('movieId', 'count')]
    ordered_ids = ordered.index.values.tolist()
    return ordered_ids[:round(subset_size*len(ordered_ids))]

def get_subset_data(df, users, movies):

    df = df.loc[df['userId'].isin(users)]
    df = df.loc[df['movieId'].isin(movies)]
    return df

def train_test_split(df, test_frac=0.2):
    
    test_df = df.sample(frac=test_frac).sort_values(['userId', 'movieId'])
    train_df = df[~df.isin(test_df)].dropna()
    return train_df, test_df  

def preprocess_data(rating_csv, subset_dim=0.01):

    print("Reading CSV file")
    df = pd.read_csv(rating_csv)

    print("Getting most frequent users")
    movies = get_most_frequent_movies(df, subset_dim)
    users = get_most_frequent_users(df, subset_dim)

    df = get_subset_data(df, users, movies)
    train_df, test_df = train_test_split(df)

    return train_df, test_df, users, movies

In [30]:
train_df, test_df, users, movies = preprocess_data('rating.csv')

Reading CSV file
Getting most frequent users


In [31]:
def get_loss(w, u, b, c, mu, users, movies, df):

    loss = 0
    for user_idx in range(len(w)):
        user = users[user_idx]
        for movie_idx in range(len(u)):
            #if movie_idx == 0 and user_idx == 0:
            #    print("u0: ", u[movie_idx])
            movie = movies[movie_idx]
            ratings = df.loc[(df["userId"]==user) & (df["movieId"]==movie)]["rating"].values
            if len(ratings):
                rating = ratings[0]
                loss += (rating - np.dot(w[user_idx], u[movie_idx]) - b[user_idx] - c[movie_idx] - mu)**2
        #if user_idx == 0:
        #    print("w0: ", w[user_idx])
    return loss

def train(w, u, b, c, mu, df, users, movies, num_iters, embedding_dim, lambd=1e-6):

    for iter in range(num_iters):
        for idx, i in enumerate(users):

            # Sum over all movies watched by user i of uj times uj transverse 
            A = np.zeros((embedding_dim, embedding_dim))
            # Sum over all movies watched by user i of uj times the rating given by i to j
            d = np.zeros((embedding_dim, 1))
            
            # Create bag of all movies user i has seen
            df_i = df.loc[df["userId"]==i]
            movie_bag = set(df_i["movieId"].values.tolist())

            w_arr = np.expand_dims(w[idx], 1)
            b_sum = 0
            for jdx, j in enumerate(movies):
                if j in movie_bag:
                    u_arr = np.expand_dims(u[jdx], 1)
                    A += np.matmul(u_arr, np.transpose(u_arr))# + np.identity(embedding_dim)*lambd

                    rating = df_i.loc[df_i["movieId"]==j]["rating"].values[0]
                    d = d + (rating - b[idx] - c[jdx] - mu)*u_arr  

                    b_sum += (rating - np.matmul(np.transpose(w_arr), u_arr) - c[jdx] - mu)

            b[idx] = (1 / (len(movie_bag) + lambd))*b_sum 

            A = A + 0.00001*np.random.rand(*A.shape)
            d = d + 0.00001*np.random.rand(*d.shape)
            w[idx] = np.squeeze(np.linalg.solve(A, d))

        for jdx, j in enumerate(movies):

            # Sum over all movies watched by user i of uj times uj transverse 
            A = np.zeros((embedding_dim, embedding_dim))                
            # Sum over all movies watched by user i of uj times the rating given by i to j
            d = np.zeros((embedding_dim, 1))
            
            # Create bag of all movies user i has seen
            df_j = df.loc[df["movieId"]==i]
            user_bag = set(df_j["userId"].values.tolist())

            u_arr = np.expand_dims(u[jdx], 1)
            c_sum = 0
            for idx, i in enumerate(users):
                if i in user_bag:
                    w_arr = np.expand_dims(w[idx], 1)
                    A += np.matmul(w_arr, np.transpose(w_arr))# + np.identity(embedding_dim)*lambd

                    rating = df_j.loc[df_j["userId"]==i]["rating"].values[0]
                    d = d + (rating - b[idx] - c[jdx] - mu)*w_arr

                    c_sum += (rating - np.matmul(np.transpose(w_arr), u_arr) - b[idx] - mu)
            c[jdx] = (1 / (len(user_bag) + lambd))*c_sum 

            A = A + 0.00001*np.random.rand(*A.shape)
            d = d + 0.00001*np.random.rand(*d.shape)
            u[jdx] = np.squeeze(np.linalg.solve(A, d))

        loss = get_loss(w, u, b, c, mu, users, movies, df)
        print(f"Iteration {iter} loss: ", loss)

    return w, u


In [32]:
num_iters = 10
embedding_dim = 10
w = np.random.randn(len(users), embedding_dim)
u = np.random.randn(len(movies), embedding_dim)
b = np.random.randn(len(users))
c = np.random.randn(len(movies))

mu = np.mean(train_df["rating"].values)

w, u = train(w, u, b, c, mu, train_df, users, movies, num_iters, embedding_dim)

Iteration 0 loss:  10619855114.908821
Iteration 1 loss:  65241635.0334046
Iteration 2 loss:  86393746.84001334
Iteration 3 loss:  1740394.57247525
Iteration 4 loss:  40487503.33614634
Iteration 5 loss:  599580.8300651846
Iteration 6 loss:  5037339.808200673
Iteration 7 loss:  351946.5861163508
Iteration 8 loss:  7334506.554359726
Iteration 9 loss:  513490.98161674576
