In [13]:
import os
import sys

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *



import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [14]:
# Resolve every filesystem location this notebook needs, starting from the
# rec-sys source folder returned by the project helper.
rec_sys_dir = get_rec_sys_directory()

# Data folder that sits directly under the source folder.
DATA_DIR = os.path.join(rec_sys_dir, 'data')

# Input table that is loaded into a DataFrame further down.
data_path = os.path.join(DATA_DIR, 'movie-ml-latest-small/merged_data.csv')

# Destination CSV for the few-shot predictions.
CF_FEW_SHOT_1_OBS_SAVE_PATH = os.path.join(DATA_DIR, 'movie-ml-latest-small/CF_large_1_test_predictions_few_shot.csv')

# Echo the resolved locations, one "<label>: <path>" line each, so a wrong
# working directory is visible immediately.
print("\n".join(
    f"{label}: {location}"
    for label, location in (
        ("Rec-sys directory", rec_sys_dir),
        ("Data directory", DATA_DIR),
        ("Data path", data_path),
        ("Few shot save path", CF_FEW_SHOT_1_OBS_SAVE_PATH),
    )
))

Rec-sys directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys
Data directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data
Data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/movie-ml-latest-small/merged_data.csv
Few shot save path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/movie-ml-latest-small/CF_large_1_test_predictions_few_shot.csv


In [15]:
# Load the CSV at `data_path` into a DataFrame.
data = pd.read_csv(data_path)

# Print the column / dtype / non-null summary, then display the first
# NUM_EXAMPLES rows as the cell output.
# NOTE(review): NUM_EXAMPLES is not defined in this notebook; presumably it
# arrives through one of the star imports (e.g. `constants`) - confirm.
data.info()
data.head(NUM_EXAMPLES)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3405 non-null   int64  
 1   imdbId   3405 non-null   int64  
 2   tmdbId   3405 non-null   float64
 3   title    3405 non-null   object 
 4   genres   3405 non-null   object 
 5   userId   3405 non-null   int64  
 6   rating   3405 non-null   float64
 7   tag      3405 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 212.9+ KB


Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,tag
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game


In [16]:
# User x movie rating table: one row per userId, one column per movieId.
# pivot_table aggregates with the mean by default, so repeated
# (userId, movieId) rows collapse into a single cell; pairs with no rating
# are left as NaN.
ratings_matrix = data.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)
ratings_matrix

movieId,1,2,5,7,11,14,16,17,21,22,...,176371,176419,179401,180031,180985,183611,184471,187593,187595,193565
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
62,,4.0,,,,,,,,,...,,,3.5,,,4.0,3.5,4.0,4.0,
103,,,,,,,,,,,...,,,,,,,,,,
119,,,,,,,,,,,...,,,,,,,,,,
125,,,,,,,,,,,...,,,,,,,,,,
166,,,,,,,,,,,...,,,,,,,,,,
184,,,,,,,,,,,...,,,,,,,,,,3.5
193,,,,,,,,,,,...,,,,,,,,,,
305,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Mean-centre every user's ratings: subtract the row (user) mean from each of
# that user's ratings. NaN cells (unrated movies) stay NaN.
user_mean_ratings = ratings_matrix.mean(axis=1)
normalized_ratings_matrix = ratings_matrix.sub(user_mean_ratings, axis=0)
normalized_ratings_matrix

movieId,1,2,5,7,11,14,16,17,21,22,...,176371,176419,179401,180031,180985,183611,184471,187593,187595,193565
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
62,,0.130435,,,,,,,,,...,,,-0.369565,,,0.130435,-0.369565,0.130435,0.130435,
103,,,,,,,,,,,...,,,,,,,,,,
119,,,,,,,,,,,...,,,,,,,,,,
125,,,,,,,,,,,...,,,,,,,,,,
166,,,,,,,,,,,...,,,,,,,,,,
184,,,,,,,,,,,...,,,,,,,,,,-0.4375
193,,,,,,,,,,,...,,,,,,,,,,
305,,,,,,,,,,,...,,,,,,,,,,


In [18]:
# User-user similarity: Pearson correlation between the rating rows of every
# pair of users. DataFrame.corr works column-wise, hence the transpose so that
# users become columns. Pairs without enough co-rated movies come out as NaN.
similarity_matrix = ratings_matrix.transpose().corr(method='pearson')
similarity_matrix

userId,2,18,62,103,119,125,166,184,193,305,...,419,424,462,474,477,537,567,573,599,606
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
18,,1.0,,,,,,,,,...,,,,1.0,,,,,,
62,,,1.0,,,,,,,,...,,1.0,,0.187867,,,,,,
103,,,,1.0,,,,,,,...,,,,,,,,,,
119,,,,,1.0,,,,,,...,,,,,,,,,,
125,,,,,,1.0,,,,,...,,,,,,,,,,
166,,,,,,,1.0,,,,...,,,,,,,,,,
184,,,,,,,,1.0,,,...,,,,0.57735,,,,,,
193,,,,,,,,,1.0,,...,,,,-1.0,,,,,,
305,,,,,,,,,,1.0,...,,,,,,,,,,


In [None]:
def calculate_score(u, i, baseline=2.5):
    """Predict the rating user ``u`` would give item ``i`` (user-based CF).

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the mean-centred ratings that *other* users gave to the item::

        mean(u) + sum_v(sim(u, v) * centred(v, i)) / sum_v(|sim(u, v)|)

    where ``v`` ranges over the other users that rated ``i`` and have a
    defined (non-NaN) similarity to ``u``.

    The function reads three notebook-level DataFrames: ``ratings_matrix``
    (users x items), ``normalized_ratings_matrix`` (same shape, mean-centred
    per user) and ``similarity_matrix`` (users x users).

    Parameters
    ----------
    u : hashable
        User label, looked up in ``ratings_matrix.index``.
    i : hashable
        Item label, looked up in ``ratings_matrix.columns``.
    baseline : float, default 2.5
        Value returned when no neighbourhood-based prediction is possible.

    Returns
    -------
    float
        * ``baseline`` if the item or the user is unknown, or if no other
          user who rated the item has a defined similarity to ``u``.
        * the user's mean rating if all usable similarities are exactly zero
          (the weighted average would otherwise be 0/0 = NaN).
        * the similarity-weighted prediction otherwise.
    """
    # Unknown item or unknown user: nothing to base a prediction on.
    # (An unknown user previously raised KeyError; it now gets the same
    # baseline treatment as an unknown item.)
    if i not in ratings_matrix.columns or u not in ratings_matrix.index:
        return baseline

    # Mean-centred ratings of the item by every *other* user who rated it.
    neighbour_ratings = normalized_ratings_matrix[i].drop(index=u).dropna()

    # Similarity of u to exactly those users; a neighbour with no similarity
    # entry is treated like an undefined (NaN) similarity.
    similarity_scores = (
        similarity_matrix[u].drop(labels=u).reindex(neighbour_ratings.index)
    )

    # Another user may have rated the item without sharing enough rated items
    # with u for a correlation to exist - those neighbours are unusable.
    usable = similarity_scores.notna()
    if not usable.any():
        return baseline

    weights = similarity_scores[usable]
    total_score = (neighbour_ratings[usable] * weights).sum()
    total_weight = weights.abs().sum()

    # Only this user's mean is needed, so avoid averaging the whole matrix.
    avg_user_rating = ratings_matrix.loc[u].mean()

    # All usable neighbours are uncorrelated with u: no adjustment is possible.
    if total_weight == 0:
        return avg_user_rating

    return avg_user_rating + total_score / total_weight