In [None]:
!pip install scikit-surprise

In [2]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

# Sample data: user_id, item_id, rating (could be purchase/view count)
data_dict = {
    'user_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
    'item_id': [10, 20, 10, 30, 40, 20, 10, 30, 20, 10],
    'rating': [5, 4, 3, 2, 5, 4, 3, 5, 2, 4]
}



In [3]:
# Convert to DataFrame
df = pd.DataFrame(data_dict)

# Define a reader with the expected format
reader = Reader(rating_scale=(1, 5))

# Load data from DataFrame
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Split data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)


In [4]:
# Use KNNBasic algorithm for collaborative filtering
algo = KNNBasic()

# Train the algorithm on the trainset
algo.fit(trainset)

# Test the algorithm on the testset
predictions = algo.test(testset)

# Calculate and print RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0607
RMSE: 1.0606601717798212


In [21]:
# Function to get top N recommendations for a given user
def get_top_n_recommendations(user_id, n):
    # Get a list of all item_ids
    item_ids = df['item_id'].unique()
    
    # Predict ratings for all items not yet rated by the user
    user_rated_items = df[df['user_id'] == user_id]['item_id']
    items_to_predict = [item for item in item_ids if item not in user_rated_items]
    
    # Handle case where there are not enough items to recommend
    if len(items_to_predict) < n:
        print(f"Warning: Only {len(items_to_predict)} items available to recommend.")
    
    # Predict ratings for the items
    predictions = [algo.predict(user_id, item_id) for item_id in items_to_predict]
    
    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Get the top n recommendations
    top_n_recommendations = predictions[:n]
    
    return [(pred.iid, pred.est) for pred in top_n_recommendations]

# Get top 10 recommendations for user_id 1
user_id = 1
recommendations = get_top_n_recommendations(user_id, n=10)

# Print recommendations in table format
recommendations_df = pd.DataFrame(recommendations, columns=['Item ID', 'Estimated Rating'])
print(f"For user_id_{user_id}: \n", recommendations_df)

For user_id_1: 
    Item ID  Estimated Rating
0       40          5.000000
1       10          4.315789
2       20          3.666667
3       30          2.000000


## At this point we have a proof-of-concept recommendation model complete. We now need to simply improve on it and get it functionally available for end-users to utilize with an API layer, etc. The DevOps or machine learning or data engineering team will get it into a productionalized / test environment to obtain feedback and the data science team team can start working on ways to improve the model (i.e obtaining more data, different algorithms, hyperparameter tuning, grid search, feature engineering)