# Import Statements

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read Files

In [2]:
base_path = 'Dataset/ml-100k/'

In [3]:
cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
train_1 = pd.read_csv(base_path+'u3.base', sep='\t', names=cols, encoding='latin-1')
test_1 = pd.read_csv(base_path+'u3.test', sep='\t', names=cols, encoding='latin-1')

# Train Matrix

In [4]:
train_matrix_1 = np.zeros((943,1682))

In [5]:
for i in range(80000):
    train_matrix_1[train_1.iloc[i,0]-1][train_1.iloc[i,1]-1] = train_1.iloc[i,2]

# Test Matrix

In [6]:
test_matrix_1 = np.zeros((943,1682))

In [7]:
for i in range(20000):
    test_matrix_1[test_1.iloc[i,0]-1][test_1.iloc[i,1]-1] = test_1.iloc[i,2]

In [8]:
train_matrix_1.shape

(943, 1682)

# Mean Rating by User

In [9]:
mean_rating_1 = np.average(train_matrix_1, axis=1, weights=(train_matrix_1 > 0))

In [10]:
mean_rating_1.shape

(943,)

# Mean Rating Of Item

In [11]:
mean_rating_item_1 = np.zeros((1682,))

In [12]:
for i in range(1682):
    summ = 0
    counter = 0
    for j in range(943):
        if train_matrix_1[j][i]>0:
            summ += train_matrix_1[j][i]
            counter += 1
    if counter>0:
        mean_rating_item_1[i] = summ/counter

# Variance Of Item

In [13]:
variance_rating_item_1 = np.zeros((1682,))

In [14]:
for i in range(1682):
    if mean_rating_item_1[i]==0:
        continue
    summ = 0
    counter = 0
    for j in range(943):
        if train_matrix_1[j][i]>0:
            summ += (train_matrix_1[j][i]-mean_rating_item_1[i])*(train_matrix_1[j][i]-mean_rating_item_1[i])
            counter += 1
    if counter-1>0:
        variance_rating_item_1[i] = summ/(counter-1)

# Number of co-rated items matrix

In [15]:
co_rated_matrix_1 = np.zeros((943,943))

In [16]:
for i in tqdm(range(943)):
    for j in range(i+1,943):
        for k in range(1682):
            if(train_matrix_1[i][k]>0 and train_matrix_1[j][k]>0):
                co_rated_matrix_1[i][j]+=1
                co_rated_matrix_1[j][i]+=1

100%|████████████████████████████████████████████████████████████████████████████████| 943/943 [07:07<00:00,  2.21it/s]


# User Cosine similarity Matrix

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
cosine_similarity_1 = cosine_similarity(train_matrix_1)

# Item Cosine similarity Matrix

In [19]:
cosine_similarity_item = cosine_similarity(train_matrix_1.T)

# User Based Prediction

## Fold 1 k = 10, 20, 30, 40, 50

In [20]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for a in tqdm(range(943)):
        # calculating top k neighbours of active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(a,a) is 1
        neighbours = np.argpartition(cosine_similarity_1[a], k)[k:]
        for j in range(1682):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==a:
                    continue
                if train_matrix_1[i][j]==0:
                    continue
                summation += cosine_similarity_1[a][i]*(train_matrix_1[i][j]-mean_rating_1[i])
                normalizing_factor += abs(cosine_similarity_1[a][i])
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_1[a] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 711.66it/s]


Coverage: 90.19500000000001 %
Mean Absolute Error Fold 1 k=10: 0.7697211597095183


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 559.42it/s]


Coverage: 95.5 %
Mean Absolute Error Fold 1 k=20: 0.7384293193717277


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 522.59it/s]


Coverage: 97.06 %
Mean Absolute Error Fold 1 k=30: 0.7249124253039357


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 434.89it/s]


Coverage: 97.78999999999999 %
Mean Absolute Error Fold 1 k=40: 0.7166888229880356


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 363.76it/s]

Coverage: 98.22 %
Mean Absolute Error Fold 1 k=50: 0.7100895947872123





## Fold 1 k = 10, 20, 30, 40, 50 Significance Weighting

In [21]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1_sw = 0
    total_absolute_error_1_sw = 0
    for a in tqdm(range(943)):
        # calculating top k neighbours of active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(a,a) is 1
        threshold = -1*k-1
        neighbours = np.argpartition(cosine_similarity_1[a], k)[k:]
        for j in range(1682):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==a:
                    continue
                if train_matrix_1[i][j]==0:
                    continue
                significance_weight = 0
                if co_rated_matrix_1[a][i]<threshold:
                    significance_weight = co_rated_matrix_1[a][i]/threshold
                else:
                    significance_weight = 1
                summation += cosine_similarity_1[a][i]*significance_weight*(train_matrix_1[i][j]-mean_rating_1[i])
                normalizing_factor += abs(cosine_similarity_1[a][i]*significance_weight)
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1_sw = round(mean_rating_1[a] + (summation/normalizing_factor))
            total_absolute_error_1_sw += abs(predicted_rating_a_j_1_sw - test_matrix_1[a][j])
            total_predictions_1_sw += 1
    coverage_1_sw = (total_predictions_1_sw/20000)*100
    mean_absolute_error_1_sw = total_absolute_error_1_sw/total_predictions_1_sw
    print('Coverage_sw: {} {}'.format(coverage_1_sw, '%'))
    print('Mean Absolute Error Fold 1 k=10 sw: {}'.format(mean_absolute_error_1_sw))

100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:01<00:00, 555.79it/s]


Coverage_sw: 90.19500000000001 %
Mean Absolute Error Fold 1 k=10 sw: 0.7697211597095183


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 394.83it/s]


Coverage_sw: 95.5 %
Mean Absolute Error Fold 1 k=10 sw: 0.7398952879581152


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 385.40it/s]


Coverage_sw: 97.06 %
Mean Absolute Error Fold 1 k=10 sw: 0.726869977333608


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 347.94it/s]


Coverage_sw: 97.78999999999999 %
Mean Absolute Error Fold 1 k=10 sw: 0.7172512526843235


100%|███████████████████████████████████████████████████████████████████████████████| 943/943 [00:02<00:00, 325.95it/s]

Coverage_sw: 98.22 %
Mean Absolute Error Fold 1 k=10 sw: 0.7118713093056404





# Item Based Prediction

## Fold 1 k = 10, 20, 30, 40, 50

In [22]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for j in tqdm(range(1682)):
        # calculating top k neighbours of item 'j'
        # active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(j,j) is 1
        neighbours = np.argpartition(cosine_similarity_item[j], k)[k:]
        for a in range(943):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==j:
                    continue
                if train_matrix_1[a][i]==0:
                    continue
                summation += cosine_similarity_item[j][i]*(train_matrix_1[a][i]-mean_rating_item_1[i])
                normalizing_factor += abs(cosine_similarity_item[j][i])
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_item_1[j] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 1186.30it/s]


Coverage: 90.945 %
Mean Absolute Error Fold 1 k=10: 0.7219198416625433


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 970.07it/s]


Coverage: 95.815 %
Mean Absolute Error Fold 1 k=20: 0.701299379011637


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 832.29it/s]


Coverage: 97.455 %
Mean Absolute Error Fold 1 k=30: 0.6929352008619363


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 667.54it/s]


Coverage: 98.31 %
Mean Absolute Error Fold 1 k=40: 0.6888922795239548


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:02<00:00, 636.02it/s]

Coverage: 98.795 %
Mean Absolute Error Fold 1 k=50: 0.6877372336656713





## Fold 1 k = 10, 20, 30, 40, 50 Variance Weighting

In [32]:
for k in [-11,-21,-31,-41,-51]:
    total_predictions_1 = 0
    total_absolute_error_1 = 0
    for j in tqdm(range(1682)):
        # calculating top k neighbours of item 'j'
        # active user 'a'
        # k = actual k + 1 since argpartition will return 'a' itself as well as cosine-sim(j,j) is 1
        neighbours = np.argpartition(cosine_similarity_item[j], k)[k:]
        for a in range(943):
            if test_matrix_1[a][j]==0:
                continue
            summation=0
            normalizing_factor=0
            for i in neighbours:
                if i==j:
                    continue
                if train_matrix_1[a][i]==0:
                    continue
                if(max(variance_rating_item_1[neighbours])==min(variance_rating_item_1[neighbours])):
                    variance_weight = variance_rating_item_1[j]-min(variance_rating_item_1[neighbours])
                else:
                    variance_weight = (variance_rating_item_1[j]-min(variance_rating_item_1[neighbours]))/(max(variance_rating_item_1[neighbours])-min(variance_rating_item_1[neighbours]))
                summation += cosine_similarity_item[j][i]*variance_weight*(train_matrix_1[a][i]-mean_rating_item_1[i])
                normalizing_factor += abs(cosine_similarity_item[j][i]*variance_weight)
            if normalizing_factor==0:
                continue
            predicted_rating_a_j_1 = round(mean_rating_item_1[j] + (summation/normalizing_factor))
            total_absolute_error_1 += abs(predicted_rating_a_j_1 - test_matrix_1[a][j])
            total_predictions_1 += 1
    coverage_1 = (total_predictions_1/20000)*100
    mean_absolute_error_1 = total_absolute_error_1/total_predictions_1
    print('Coverage: {} {}'.format(coverage_1, '%'))
    print('Mean Absolute Error Fold 1 k={}: {}'.format(-1*k-1,mean_absolute_error_1))

100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:01<00:00, 846.95it/s]


Coverage: 86.0 %
Mean Absolute Error Fold 1 k=10: 0.724593023255814


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:03<00:00, 470.23it/s]


Coverage: 93.135 %
Mean Absolute Error Fold 1 k=20: 0.7030654426370323


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:05<00:00, 304.88it/s]


Coverage: 95.855 %
Mean Absolute Error Fold 1 k=30: 0.6930259245735747


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:07<00:00, 212.42it/s]


Coverage: 97.0 %
Mean Absolute Error Fold 1 k=40: 0.6887628865979382


100%|█████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:12<00:00, 130.72it/s]

Coverage: 97.91499999999999 %
Mean Absolute Error Fold 1 k=50: 0.6870244599908083



