# Import Statements

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read Files

In [2]:
base_path = 'Dataset/ml-100k/'

In [3]:
cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
train_1 = pd.read_csv(base_path+'u2.base', sep='\t', names=cols, encoding='latin-1')
test_1 = pd.read_csv(base_path+'u2.test', sep='\t', names=cols, encoding='latin-1')

# Train Matrix

In [4]:
train_matrix_1 = np.zeros((943,1682))

In [5]:
for i in range(80000):
    train_matrix_1[train_1.iloc[i,0]-1][train_1.iloc[i,1]-1] = train_1.iloc[i,2]

# Test Matrix

In [6]:
test_matrix_1 = np.zeros((943,1682))

In [7]:
for i in range(20000):
    test_matrix_1[test_1.iloc[i,0]-1][test_1.iloc[i,1]-1] = test_1.iloc[i,2]

In [8]:
train_matrix_1.shape

(943, 1682)

# Mean Rating by User

In [9]:
mean_rating_1 = np.average(train_matrix_1, axis=1, weights=(train_matrix_1 > 0))

In [10]:
mean_rating_1.shape

(943,)

# Mean Rating Of Item

In [11]:
mean_rating_item_1 = np.zeros((1682,))

In [12]:
for i in range(1682):
    summ = 0
    counter = 0
    for j in range(943):
        if train_matrix_1[j][i]>0:
            summ += train_matrix_1[j][i]
            counter += 1
    if counter>0:
        mean_rating_item_1[i] = summ/counter

# Variance Of Item

In [13]:
variance_rating_item_1 = np.zeros((1682,))

In [14]:
for i in range(1682):
    if mean_rating_item_1[i]==0:
        continue
    summ = 0
    counter = 0
    for j in range(943):
        if train_matrix_1[j][i]>0:
            summ += (train_matrix_1[j][i]-mean_rating_item_1[i])*(train_matrix_1[j][i]-mean_rating_item_1[i])
            counter += 1
    if counter-1>0:
        variance_rating_item_1[i] = summ/(counter-1)

# Number of co-rated items matrix

In [15]:
co_rated_matrix_1 = np.zeros((943,943))

In [16]:
for i in tqdm(range(943)):
    for j in range(i+1,943):
        for k in range(1682):
            if(train_matrix_1[i][k]>0 and train_matrix_1[j][k]>0):
                co_rated_matrix_1[i][j]+=1
                co_rated_matrix_1[j][i]+=1

100%|████████████████████████████████████████████████████████████████████████████████| 943/943 [06:53<00:00,  2.28it/s]


# User Cosine similarity Matrix

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
cosine_similarity_1 = cosine_similarity(train_matrix_1)

# Item Cosine similarity Matrix

In [19]:
cosine_similarity_item = cosine_similarity(train_matrix_1.T)

# User Based Prediction

## Fold 1 k = 10, 20, 30, 40, 50

In [20]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for a in tqdm(range(943)):
        # calculating top k neighbours of active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(a,a) is 1
        neighbours = np.argpartition(cosine_similarity_1[a], k)[k:]
        for j in range(1682):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==a:
                    continue
                if train_matrix_1[i][j]==0:
                    continue
                summation += cosine_similarity_1[a][i]*(train_matrix_1[i][j]-mean_rating_1[i])
                normalizing_factor += abs(cosine_similarity_1[a][i])
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_1[a] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 658.92it/s]


Coverage: 90.12 %
Mean Absolute Error Fold 1 k=10: 0.7642587660896583


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 566.92it/s]


Coverage: 95.09 %
Mean Absolute Error Fold 1 k=20: 0.7364601956041644


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 442.88it/s]


Coverage: 96.855 %
Mean Absolute Error Fold 1 k=30: 0.7187548397088431


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 423.54it/s]


Coverage: 97.75500000000001 %
Mean Absolute Error Fold 1 k=40: 0.7157690143726664


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 357.49it/s]

Coverage: 98.18 %
Mean Absolute Error Fold 1 k=50: 0.7118557751069464





## Fold 1 k = 10, 20, 30, 40, 50 Significance Weighting

In [21]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1_sw = 0
    total_absolute_error_1_sw = 0
    for a in tqdm(range(943)):
        # calculating top k neighbours of active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(a,a) is 1
        threshold = -1*k-1
        neighbours = np.argpartition(cosine_similarity_1[a], k)[k:]
        for j in range(1682):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==a:
                    continue
                if train_matrix_1[i][j]==0:
                    continue
                significance_weight = 0
                if co_rated_matrix_1[a][i]<threshold:
                    significance_weight = co_rated_matrix_1[a][i]/threshold
                else:
                    significance_weight = 1
                summation += cosine_similarity_1[a][i]*significance_weight*(train_matrix_1[i][j]-mean_rating_1[i])
                normalizing_factor += abs(cosine_similarity_1[a][i]*significance_weight)
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1_sw = round(mean_rating_1[a] + (summation/normalizing_factor))
            total_absolute_error_1_sw += abs(predicted_rating_a_j_1_sw - test_matrix_1[a][j])
            total_predictions_1_sw += 1
    coverage_1_sw = (total_predictions_1_sw/20000)*100
    mean_absolute_error_1_sw = total_absolute_error_1_sw/total_predictions_1_sw
    print('Coverage_sw: {} {}'.format(coverage_1_sw, '%'))
    print('Mean Absolute Error Fold 1 k=10 sw: {}'.format(mean_absolute_error_1_sw))

100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 610.35it/s]


Coverage_sw: 90.12 %
Mean Absolute Error Fold 1 k=10 sw: 0.7645361739902352


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 535.26it/s]


Coverage_sw: 95.09 %
Mean Absolute Error Fold 1 k=10 sw: 0.7379850667788411


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 408.76it/s]


Coverage_sw: 96.855 %
Mean Absolute Error Fold 1 k=10 sw: 0.7204067936606268


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 369.69it/s]


Coverage_sw: 97.75500000000001 %
Mean Absolute Error Fold 1 k=10 sw: 0.7160759040458289


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:03<00:00, 290.13it/s]

Coverage_sw: 98.18 %
Mean Absolute Error Fold 1 k=10 sw: 0.7137400692605419





# Item Based Prediction

## Fold 1 k = 10, 20, 30, 40, 50

In [22]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for j in tqdm(range(1682)):
        # calculating top k neighbours of item 'j'
        # active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(j,j) is 1
        neighbours = np.argpartition(cosine_similarity_item[j], k)[k:]
        for a in range(943):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==j:
                    continue
                if train_matrix_1[a][i]==0:
                    continue
                summation += cosine_similarity_item[j][i]*(train_matrix_1[a][i]-mean_rating_item_1[i])
                normalizing_factor += abs(cosine_similarity_item[j][i])
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_item_1[j] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 1275.42it/s]


Coverage: 90.75 %
Mean Absolute Error Fold 1 k=10: 0.7269972451790634


100%|████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 1059.86it/s]


Coverage: 95.59 %
Mean Absolute Error Fold 1 k=20: 0.6996547756041427


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 865.19it/s]


Coverage: 97.195 %
Mean Absolute Error Fold 1 k=30: 0.6875868100210916


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 787.74it/s]


Coverage: 98.155 %
Mean Absolute Error Fold 1 k=40: 0.6870256227395446


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 665.68it/s]

Coverage: 98.72 %
Mean Absolute Error Fold 1 k=50: 0.6865883306320908





## Fold 1 k = 10, 20, 30, 40, 50 Variance Weighting

In [23]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for j in tqdm(range(1682)):
        # calculating top k neighbours of item 'j'
        # active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(j,j) is 1
        neighbours = np.argpartition(cosine_similarity_item[j], k)[k:]
        for a in range(943):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==j:
                    continue
                if train_matrix_1[a][i]==0:
                    continue
                if(max(variance_rating_item_1[neighbours])==min(variance_rating_item_1[neighbours])):
                    variance_weight = variance_rating_item_1[j]-min(variance_rating_item_1[neighbours])
                else:
                    variance_weight = (variance_rating_item_1[j]-min(variance_rating_item_1[neighbours]))/(max(variance_rating_item_1[neighbours])-min(variance_rating_item_1[neighbours]))
                summation += cosine_similarity_item[j][i]*variance_weight*(train_matrix_1[a][i]-mean_rating_item_1[i])
                normalizing_factor += abs(cosine_similarity_item[j][i]*variance_weight)
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_item_1[j] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 737.72it/s]


Coverage: 84.25 %
Mean Absolute Error Fold 1 k=10: 0.7322848664688427


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:03<00:00, 425.31it/s]


Coverage: 92.43 %
Mean Absolute Error Fold 1 k=20: 0.7017743156983663


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:05<00:00, 281.46it/s]


Coverage: 95.06 %
Mean Absolute Error Fold 1 k=30: 0.6890911003576689


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:09<00:00, 178.38it/s]


Coverage: 96.585 %
Mean Absolute Error Fold 1 k=40: 0.6872702800641921


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:12<00:00, 139.57it/s]

Coverage: 97.38 %
Mean Absolute Error Fold 1 k=50: 0.686280550421031



