# Assignment 3

In this assignment I use user-to-user collaborative filtering to recommend items to users.

# 0. Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import itertools
from tqdm import tqdm
import math
import scipy.sparse as sparse
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules



# 1. Load Data

In [None]:
# Load the train and test grocery transactions from the course repository.
df_train = pd.read_csv("https://raw.githubusercontent.com/LDSamuelGuo/ANLP/main/Groceries%20data%20train.csv")
df_test = pd.read_csv("https://raw.githubusercontent.com/LDSamuelGuo/ANLP/main/Groceries%20data%20test.csv")

print('Data Train Head')
display(df_train.head())

print('Data Test Head')
display(df_test.head())

print(f'Train shape: {df_train.shape}')
print(f'Test shape: {df_test.shape}\n')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only append a stray "None" to the output.
print('Train info')
df_train.info()
print('Test info')
df_test.info()


# 2. Exploratory Data Analysis
In this part, we will explore the data on the user and item side. We also want to know the distribution of the items and users in the dataset. The aim of the EDA is to understand the data better and to know the distribution of the data.

## 2.1 Check Missing Value

In [None]:
# Count the missing values per column, first for the train split and then
# for the test split.
for split_label, split_frame in (('Data Train', df_train), ('Data Test', df_test)):
    print(split_label)
    display(split_frame.isna().sum())

## 2.2 Data Cleaning

From the previous part, we can see that the `Member_number` column does not have a string type, so it will be converted to string. The `Date` column will also be converted to datetime type.

In [None]:
def clean_data(dataframe):
    """
    Normalises the column types of a transactions dataframe in place.

    Two conversions are applied:
        1. 'Member_number' is cast to string, so member ids are treated as
           labels rather than numbers.
        2. 'Date' is parsed into datetime values using the day/month/year
           format '%d/%m/%Y'.

    Parameters:
        - dataframe (pandas.DataFrame): The dataframe to convert. It must
          contain the 'Member_number' and 'Date' columns.

    Returns:
        - None. The dataframe is modified in-place.
    """
    # member ids -> strings
    member_ids = dataframe['Member_number']
    dataframe['Member_number'] = member_ids.astype(str)

    # day/month/year text -> datetime
    parsed_dates = pd.to_datetime(dataframe['Date'], format='%d/%m/%Y')
    dataframe['Date'] = parsed_dates


In [None]:
# Shapes are printed before and after the call: clean_data only reassigns two
# columns, so both pairs of shapes are expected to match.
print(f"Original Train Shape: {df_train.shape}")
print(f'Original Test Shape: {df_test.shape}\n')

for split_frame in (df_train, df_test):
    clean_data(split_frame)  # converts the columns in place

print(f"Clean Train Shape: {df_train.shape}")
print(f"Clean Test Shape: {df_test.shape}\n")

display(df_train.head())

## 2.3 Min and Max Date

In [None]:
# Report the earliest and latest transaction date of each split.
train_dates = df_train.Date
test_dates = df_test.Date

print('Train Data')
print(f"Min Date: {train_dates.min()}")
print(f"Max Date: {train_dates.max()}\n")

print('Test Data')
print(f"Min Date: {test_dates.min()}")
print(f"Max Date: {test_dates.max()}")

As we can see, train and test dataset have the same date range

## 2.4 User Analysis
The goal of this analysis is to understand the number of unique members in each dataset, as well as the distribution of transactions per customer.

### 2.4.1 Count Unique Member

In [None]:
# Distinct member counts per split and for both splits combined.
n_train_members = df_train.Member_number.nunique()
n_test_members = df_test.Member_number.nunique()
n_all_members = pd.concat([df_train, df_test]).Member_number.nunique()

print('Train Data')
print(f"Num of Unique Member: {n_train_members}\n")

print("Test Data")
print(f"Num of Unique Member: {n_test_members}\n")

print("All Data")
print(f"Num of Unique Member: {n_all_members}")

In [None]:
# Members that appear in the test split but never in the train split.
train_cust = df_train.Member_number.str.lower().unique()
test_cust = df_test.Member_number.str.lower().unique()

# Set difference instead of `x not in train_cust`, which rescans the whole
# array once per test member.
test_not_in_train_cust = len(set(test_cust) - set(train_cust))
print(f"Number of Member in test but not in train: {test_not_in_train_cust}")

## 2.5 Item Analysis

### 2.5.1 Number of Unique Items

In [None]:
# Distinct item names (case-insensitive) per split and for both combined.
n_train_items = df_train.itemDescription.str.lower().nunique()
n_test_items = df_test.itemDescription.str.lower().nunique()
n_all_items = pd.concat([df_train, df_test]).itemDescription.str.lower().nunique()

print('Train Data')
print(f"Num of Unique Items by Name: {n_train_items}\n")

print("Test Data")
print(f"Num of Unique Items by Name: {n_test_items}\n")

print("All Data")
print(f"Num of Unique Items by Name: {n_all_items}")

In [None]:
# Items that appear in the test split but never in the train split.
train_items = df_train.itemDescription.str.lower().unique()
test_items = df_test.itemDescription.str.lower().unique()

# Set difference instead of `x not in train_items`, which rescans the whole
# array once per test item.
test_not_in_train_items = len(set(test_items) - set(train_items))
# The value counts items, so the message says "items" (it used to say "Member").
print(f"Number of items in test but not in train: {test_not_in_train_items}")

### 2.5.2 Clean Item Name

In [None]:
import re

def remove_color(string):
    """
    Normalises an item name: removes special characters, collapses runs of
    whitespace into a single space, trims the ends and upper-cases the result.

    Despite its name, the function does not look for colour words; it only
    performs the character-level clean-up described above.

    Parameters:
        - string (str): The raw item name.

    Returns:
        - str: The cleaned, upper-cased item name.
    """
    # keep only ASCII letters, digits and whitespace
    string = re.sub(r'[^a-zA-Z0-9\s]', '', string)

    # collapse whitespace runs; the raw string keeps `\s` from being treated
    # as an (invalid) Python string escape
    string = re.sub(r'\s+', ' ', string)

    return string.upper().strip()

# Normalise the item names of both splits by applying remove_color to every
# value of the itemDescription column.
df_train['itemDescription'] = df_train['itemDescription'].apply(remove_color)
df_test['itemDescription'] = df_test['itemDescription'].apply(remove_color)

In [None]:
# Same distinct-item counts as before, recomputed on the cleaned item names.
n_train_items = df_train.itemDescription.str.lower().nunique()
n_test_items = df_test.itemDescription.str.lower().nunique()
n_all_items = pd.concat([df_train, df_test]).itemDescription.str.lower().nunique()

print('After Cleaning')
print('Train Data')
print(f"Num of Unique Items by Name: {n_train_items}\n")

print("Test Data")
print(f"Num of Unique Items by Name: {n_test_items}\n")

print("All Data")
print(f"Num of Unique Items by Name: {n_all_items}")

In [None]:
# Cleaned item names that appear in the test split but never in the train split.
train_items = df_train.itemDescription.str.lower().unique()
test_items = df_test.itemDescription.str.lower().unique()

# Set difference instead of `x not in train_items`, which rescans the whole
# array once per test item.
test_not_in_train_items = len(set(test_items) - set(train_items))
print(f"Number of items in test but not in train: {test_not_in_train_items}")

In [None]:
# Count the rows of the train dataset that repeat an earlier row in every column.
df_train.duplicated().sum()

In [None]:

# Count the rows of the test dataset that repeat an earlier row in every column.
df_test.duplicated().sum()

In [None]:
# Drop the exact duplicate rows from both splits; the result is rebound to
# the same names, so every later cell works on the de-duplicated frames.
df_train = df_train.drop_duplicates()
df_test= df_test.drop_duplicates()

In [None]:
# Lower-case every string cell of both frames; non-string cells are returned
# unchanged by the lambda. This also lower-cases the item names that
# remove_color upper-cased earlier.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map - switch once the minimum supported pandas version allows it.
df_train = df_train.applymap(lambda x: x.lower() if isinstance(x, str) else x)
df_test = df_test.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
# Keep only the three columns the rest of the notebook uses: who bought
# (Member_number), when (Date) and what (itemDescription).
df_train = df_train[['Member_number', 'Date', 'itemDescription']]
df_test = df_test[['Member_number', 'Date', 'itemDescription']]

### 2.5.3 Long tail problems

In [None]:
# Number of sales per item in the train split: take the 167 best-selling
# items, then order them from least to most sold.
sales_per_item = df_train.groupby(["itemDescription"])["itemDescription"].count()
df = sales_per_item.sort_values(axis=0, ascending=False)
df = df.iloc[:167].sort_values(ascending=True)

# Print as a table
print(df.to_string())


In [None]:
# Summary statistics of whatever `df` currently holds (when the cells run in
# order: the per-item sales counts computed in the previous cell).
df.describe()

In [None]:
# Long-tail chart: one bar per item in ascending order of sales, with a dashed
# reference line at 10 sales.
fig = plt.figure(figsize=(10, 6))

# Reset index to use integers instead of item names. The bars are drawn once,
# after the reset; drawing them before and after would stack two bar charts on
# the same axes.
df = df.reset_index(drop=True)
ax = df.plot(kind='bar')
plt.axhline(y=10, color='red', linestyle='dashed', label='Q2 = 10')

# Only label the first and the last bar; one tick per item would be unreadable.
ax.set_xticks([0, len(df)-1])
plt.ylabel('Number of Sold Items')
plt.xlabel('Item Index')
plt.title('Long Tail Bar Chart of Item Sales')
# Without a legend the label of the reference line is never shown.
plt.legend()
plt.show()

In [None]:
# Items sold fewer than `threshold` times in the train split form the long tail.
threshold = 10

# Number of purchases per item
buy_counts = df_train['itemDescription'].value_counts()

# Names of the items whose purchase count is below the threshold
below_threshold = buy_counts < threshold
long_tail_items = buy_counts.loc[below_threshold].index.tolist()

print(f'Number of Long Tail Items: {len(long_tail_items)}')

### 2.6 Data Visualization

In [None]:
# Top 10 Popular Sold Items: count the sales per item, keep the ten largest
# counts and sort them ascending for the horizontal bar chart.
df = df_train.groupby(["itemDescription"])["itemDescription"].count().sort_values(axis=0, ascending=False)
df = df.iloc[:10].sort_values(ascending=True)
df.plot(kind="barh")
plt.ylabel('Items')  # axis label used to be misspelled "Itmes"
plt.xlabel('Number of Sold Items')
plt.title('Top 10 Popular Sold Items')

In [None]:
# Top 10 members by number of purchased items in the train split, sorted
# ascending for the horizontal bar chart.
purchases_per_member = df_train.groupby(["Member_number"])["Member_number"].count()
df = purchases_per_member.sort_values(axis=0, ascending=False)
df = df.iloc[:10].sort_values(ascending=True)
df.plot(kind="barh")
plt.ylabel('Member Number')
plt.xlabel('Number of Purchased Items')
plt.title('Top 10 Member number')

## Frequent pattern mining

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

from mlxtend.preprocessing import TransactionEncoder
# One "transaction" per member: the list of all item names that member bought
# in the respective split (a list of lists, one inner list per member).
test_transactions = df_test.groupby('Member_number')['itemDescription'].apply(list).tolist()
train_transactions = df_train.groupby('Member_number')['itemDescription'].apply(list).tolist()

In [None]:
# One-hot encode the transactions. The encoder is fitted on the train
# transactions only, so both frames share the columns learned from train.
te = TransactionEncoder()
train_ary = te.fit(train_transactions).transform(train_transactions)
test_ary = te.transform(test_transactions)

train_df = pd.DataFrame(train_ary, columns=te.columns_)
test_df = pd.DataFrame(test_ary, columns=te.columns_)

In [None]:
from mlxtend.frequent_patterns import apriori
# Mine the itemsets bought by at least 1% of the members in each split.
# use_colnames=True makes the itemsets carry item names; with False they
# contain bare column indices, which is what the rules derived from them
# (and the exported CSV) would then show instead of item names.
test_frequentitemsets = apriori(test_df, min_support=0.01, use_colnames=True)
train_frequentitemsets = apriori(train_df, min_support=0.01, use_colnames=True)
print(test_frequentitemsets.head(10))

In [None]:
# Member x item table for the test split: each cell is the number of times
# the member bought the item, 0 where the member never bought it.
test_item_counts = df_test.groupby(['Member_number', 'itemDescription'])['itemDescription'].count()
test_basket = test_item_counts.unstack().reset_index().fillna(0).set_index('Member_number')

In [None]:
# Member x item table for the train split: each cell is the number of times
# the member bought the item, 0 where the member never bought it.
train_item_counts = df_train.groupby(['Member_number', 'itemDescription'])['itemDescription'].count()
train_basket = train_item_counts.unstack().reset_index().fillna(0).set_index('Member_number')

In [None]:
# Binary member x item table (1 = bought at least once), computed with one
# vectorised comparison instead of a per-cell applymap.
test_basket_sets = (test_basket > 0).astype(int)

# Derive rules with confidence >= 0.1 from the frequent itemsets of the test split.
test_rules = association_rules(test_frequentitemsets, metric="confidence", min_threshold=0.1)
print(test_frequentitemsets.sort_values(by='support', ascending=False).head())
print(test_rules.sort_values(by='confidence', ascending=False).head())

# Keep the rule column names (antecedents, consequents, confidence, ...):
# rebuilding the frame from bare tuples replaces them with 0..n-1, which then
# become the header of the exported CSV.
test_results_df = test_rules.reset_index(drop=True)
test_results = list(test_results_df.itertuples(index=False, name=None))

print(test_results_df)
data_chunk = test_results_df
data_chunk.to_csv('association_rules.csv', index=False)

## Collaborative filtering

#### Create user-item matrix

In [None]:
# Binary user-item matrix for the train split: one row per member, one column
# per item, 1 if the member bought the item at least once and 0 otherwise.
unique_users = df_train['Member_number'].unique()
unique_items = df_train['itemDescription'].unique()

# A single vectorised crosstab replaces the row-by-row iterrows()/.loc fill.
# reindex keeps rows and columns in first-appearance order (crosstab sorts
# them), so positional lookups into the matrix are unaffected.
user_item_matrix = (
    pd.crosstab(df_train['Member_number'], df_train['itemDescription'])
    .gt(0)
    .astype(int)
    .reindex(index=unique_users, columns=unique_items, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# check the user-item matrix
user_item_matrix.iloc[:10, :20]

In [None]:
# Plain array of the matrix values (rows follow user_item_matrix.index,
# columns follow user_item_matrix.columns); the second line displays it.
user_item_matrix_array = user_item_matrix.values
user_item_matrix_array

In [None]:
# Row labels of the matrix, i.e. the member number behind each row position.
user_item_matrix_customers = user_item_matrix.index
user_item_matrix_customers


Implementation algorithm reference: the movie recommender system solution in week 10.

In [None]:
def similarity(user1, user2):
    """
    Cosine similarity between two user vectors.

    Parameters:
        - user1, user2 (array-like): Rating/purchase vectors of equal length.

    Returns:
        - float: dot(user1, user2) / (|user1| * |user2|), or 0.0 when either
          vector is all zeros (the ratio would otherwise be 0/0 = NaN).
    """
    user1 = np.asarray(user1, dtype=float)
    user2 = np.asarray(user2, dtype=float)

    magnitude = np.sqrt(np.sum(user1 ** 2) * np.sum(user2 ** 2))
    if magnitude == 0:
        return 0.0
    return float(np.dot(user1, user2) / magnitude)


def _neighbour_similarities(user_ratings, target_user):
    """Similarity of target_user's row to every row; the user's own entry is set to 0."""
    target_row = user_ratings[target_user]
    similarities = np.array(
        [similarity(target_row, other_row) for other_row in user_ratings],
        dtype=float,
    )
    # a user must not act as their own neighbour
    similarities[target_user] = 0.0
    return similarities


def predict_rating(user_ratings, item_ratings, target_user=0, similarities=None):
    """
    Predict how strongly target_user is expected to want one item.

    The score is the similarity-weighted mean of the other users' values for
    the item: sum(sim * rating) / sum(sim). Users who did not buy the item
    take part with a rating of 0. If only the buyers were averaged, a 0/1
    purchase matrix would give every item the same score of 1.

    Parameters:
        - user_ratings (2-D array): Users x items matrix.
        - item_ratings (1-D array): The column of user_ratings for the item.
        - target_user (int): Row position of the user to predict for.
          Defaults to 0.
        - similarities (1-D array, optional): Precomputed similarities of
          target_user to every row; computed from user_ratings when omitted.

    Returns:
        - float: The predicted score, or 0.0 when the target user has no
          similar neighbour at all.
    """
    if similarities is None:
        similarities = _neighbour_similarities(user_ratings, target_user)
    similarities = np.asarray(similarities, dtype=float)

    total_similarity = similarities.sum()
    if total_similarity == 0:
        return 0.0

    weighted_sum = np.dot(similarities, np.asarray(item_ratings, dtype=float))
    return float(weighted_sum / total_similarity)


def recommend_items(user_ratings, target_user, top_n=15, item_labels=None):
    """
    Recommend the items target_user has not bought yet, best score first.

    Parameters:
        - user_ratings (2-D array): Users x items matrix (0 = not bought).
        - target_user (int): Row position of the user to recommend for.
        - top_n (int): Maximum number of items to return. Defaults to 15.
        - item_labels (sequence, optional): Label of each matrix column.
          Defaults to the columns of the global `user_item_matrix`.

    Returns:
        - list: Labels of up to top_n not-yet-bought items, ordered by
          descending predicted score (ties keep column order).
    """
    user_ratings = np.asarray(user_ratings, dtype=float)

    # candidate items: everything the target user has not bought
    unbought_indices = np.where(user_ratings[target_user] == 0)[0]

    # similarities depend only on the user, so compute them once for all items
    similarities = _neighbour_similarities(user_ratings, target_user)
    predicted_ratings = np.array(
        [
            predict_rating(user_ratings, user_ratings[:, item_index], target_user, similarities)
            for item_index in unbought_indices
        ],
        dtype=float,
    )

    # argsort ranks positions *within* the candidate list; map them back to
    # real column indices before looking up the labels
    ranking = np.argsort(-predicted_ratings, kind="stable")[:top_n]
    top_items = unbought_indices[ranking]

    if item_labels is None:
        item_labels = user_item_matrix.columns
    return [item_labels[i] for i in top_items]

###   Evaluation

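Each user receives a ranked list of k recommended items, and two ranking metrics are reported for it. Mean Average Precision (MAP) is the mean, over users, of the average precision at k, where the average precision is built from the precision at each rank at which a relevant item appears. Normalized Discounted Cumulative Gain (NDCG) is a separate metric that is also used to measure the quality of ranking systems: using the position of the relevant items, it rewards hits near the top of the list and normalizes the result by the best achievable ordering (shaped.ai, n.d.) (Dhinakaran, 2023).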

In [None]:
def average_precision(actual, predicted, k=10):
    """
    Average Precision at k (AP@k) for a single user.

    Parameters:
        - actual (sequence): The relevant items (list, tuple or NumPy array).
        - predicted (sequence): The recommended items, best first.
        - k (int): Only the first k predictions are scored. Defaults to 10.

    Returns:
        - float: Sum of precision@i over every rank i that holds a relevant
          item seen for the first time, divided by min(len(actual), k).
          0.0 when `actual` is empty or k is not positive.
    """
    # len() instead of .size so that plain lists are accepted as well
    if len(actual) == 0 or k <= 0:
        return 0.0

    actual_set = set(actual)  # set for fast membership tests
    score = 0.0
    num_hits = 0.0
    already_scored = set()  # a repeated recommendation must not count twice

    for rank, item in enumerate(predicted[:k], start=1):
        if item in actual_set and item not in already_scored:
            already_scored.add(item)
            num_hits += 1.0
            score += num_hits / rank  # precision at this rank

    return score / min(len(actual), k)

def dcg(relevance, k=10):
    """
    Discounted Cumulative Gain of the first k relevance scores.

    Parameters:
        - relevance (sequence of numbers): Relevance per rank, best rank first.
        - k (int): Number of leading ranks to score. Defaults to 10.

    Returns:
        - float: sum(relevance[i] / log2(i + 2)) over the first k ranks
          (i counted from 0), or 0.0 when there is nothing to score.
    """
    relevance = np.asarray(relevance)[:k]
    if relevance.size:
        # rank 1 is divided by log2(2) = 1, rank 2 by log2(3), and so on
        return np.sum(relevance / np.log2(np.arange(2, relevance.size + 2)))
    return 0.0


def ndcg(actual, predicted, k=10):
    """
    Normalized Discounted Cumulative Gain at k with binary relevance.

    Parameters:
        - actual (sequence): The relevant items.
        - predicted (sequence): The recommended items, best first.
        - k (int): Number of leading recommendations to score. Defaults to 10.

    Returns:
        - float: DCG of the recommendations divided by the DCG of an ideal
          list, or 0 when no ideal DCG exists (no relevant items).
    """
    actual_set = set(actual)

    # 1 for a relevant item on its first appearance, 0 otherwise; a repeated
    # recommendation is not rewarded a second time
    relevance = []
    already_scored = set()
    for item in predicted[:k]:
        is_new_hit = item in actual_set and item not in already_scored
        relevance.append(1 if is_new_hit else 0)
        already_scored.add(item)

    # The ideal list puts every relevant item (at most k of them) first. It
    # must come from `actual`, not from the hits in `predicted`: otherwise any
    # list whose hits happen to come first scores 1.0, however few it found.
    ideal_relevance = [1] * min(len(actual_set), max(k, 0))

    ideal_dcg = dcg(ideal_relevance, k)
    actual_dcg = dcg(relevance, k)

    return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0

In [None]:
# Restrict the test split to its ten most purchased items: count the
# purchases per item, keep the ten largest counts and filter the rows.
purchases_per_item = df_test.groupby(["itemDescription"])["itemDescription"].count()
item_total_purchase = purchases_per_item.sort_values(axis=0, ascending=False)

popular_items = item_total_purchase.iloc[:10].sort_values(ascending=True)

is_popular_row = df_test['itemDescription'].isin(popular_items.index)
df_test = df_test[is_popular_row]

In [None]:
# Print the column/dtype summary of the test frame after the filtering above.
df_test.info()

In [None]:
# Binary user-item matrix for the (filtered) test split: one row per member,
# one column per item, 1 if the member bought the item at least once, else 0.
# This rebinds user_item_matrix, replacing the matrix built from the train split.
unique_users = df_test['Member_number'].unique()
unique_items = df_test['itemDescription'].unique()

# A single vectorised crosstab replaces the row-by-row iterrows()/.loc fill.
# reindex keeps rows and columns in first-appearance order (crosstab sorts
# them), so positional lookups into the matrix are unaffected.
user_item_matrix = (
    pd.crosstab(df_test['Member_number'], df_test['itemDescription'])
    .gt(0)
    .astype(int)
    .reindex(index=unique_users, columns=unique_items, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# check the user-item matrix
user_item_matrix.iloc[:10, :20]

In [None]:
# Plain array of the matrix values (rows follow user_item_matrix.index,
# columns follow user_item_matrix.columns); the second line displays it.
user_item_matrix_array = user_item_matrix.values
user_item_matrix_array

In [None]:
# Row labels of the matrix, i.e. the member number behind each row position.
user_item_matrix_customers = user_item_matrix.index
user_item_matrix_customers

In [None]:
# Draw 5 distinct row positions of the user-item matrix. The values are
# positions into the matrix, not member numbers; the fixed seed makes the
# draw repeatable.
np.random.seed(42)
n_matrix_rows = user_item_matrix_array.shape[0]
random_users = np.random.choice(range(n_matrix_rows), 5, replace=False)
print(f"Random Member_number from test dataset:\n{random_users}")

## Collaborative Filtering — Recommendation

In [None]:
# Evaluate the recommender on the sampled users: precision, recall, AP and
# NDCG per user, then their means over the evaluated users.
#
# NOTE(review): user_item_matrix_array is built from df_test and actual_items
# is read from df_test as well, while recommend_items is meant to suggest
# items the user has NOT bought in that matrix. Hits against actual_items
# therefore say little about predictive quality - consider recommending from
# a train-based matrix and scoring against the test purchases.
total_precision = 0
total_recall = 0
total_map = 0
total_ndcg = 0
total_users = 0


for user in random_users:
    be_recommended_user = user_item_matrix_customers[user]
    actual_items = df_test[df_test['Member_number'] == be_recommended_user]['itemDescription'].unique()

    # users without any actual item cannot be scored
    if len(actual_items) == 0:
        continue

    recommended_items = recommend_items(user_item_matrix_array, user)

    # hits = recommended items the user really bought
    tp = len(set(recommended_items) & set(actual_items))

    # recall = hits / actual items, precision = hits / recommended items
    recall = tp / len(actual_items)
    precision = tp / len(recommended_items) if len(recommended_items) > 0 else 0.0
    total_precision += precision
    total_recall += recall

    # calculate MAP
    ap = average_precision(actual_items, recommended_items)
    total_map += ap

    # calculate NDCG
    ndcg_score = ndcg(actual_items, recommended_items)
    total_ndcg += ndcg_score

    total_users += 1

    print("User ID:", be_recommended_user)
    print("Recommended items:", recommended_items)
    print("Actual items:", actual_items)
    print("AP:", ap)
    print("NDCG:", ndcg_score)
    print()

# The mean precision gets its own name: assigning it to `average_precision`
# would overwrite the function of that name and break a re-run of this cell.
mean_precision = total_precision / total_users if total_users > 0 else 0
average_recall = total_recall / total_users if total_users > 0 else 0
mean_average_precision = total_map / total_users if total_users > 0 else 0
mean_ndcg = total_ndcg / total_users if total_users > 0 else 0

print("Average Precision:", mean_precision)
print("Average Recall:", average_recall)
print("Mean Average Precision (MAP):", mean_average_precision)
print("Mean NDCG:", mean_ndcg)

## Pattern — Recommendations

In [None]:
# Load the association rules prepared by groupmate 1 from the shared CSV file;
# the recommendations below are generated from these rules. The last line
# displays the column names of the loaded table.
rules= pd.read_csv('https://raw.githubusercontent.com/LDSamuelGuo/ANLP/main/formatted_association_rules.csv')

rules.columns

In [None]:
# Second copy of the same rules file, in which antecedents equal to the
# empty-item marker "frozenset({''})" are replaced by an empty string.
# NOTE(review): the cells below query `rules`, not `test_rules`, and this
# assignment overwrites the test_rules computed in the pattern-mining section.
test_rules =pd.read_csv("https://raw.githubusercontent.com/LDSamuelGuo/ANLP/main/formatted_association_rules.csv")
test_rules["antecedents"]=test_rules["antecedents"].replace("frozenset({''})", '')
test_rules

In [None]:
def rule_recommandation(rules, user_itemset, n):
    """
    Look up the n most confident rules whose antecedent equals user_itemset.

    Parameters:
        - rules (pandas.DataFrame): Rule table with the columns
          'antecedents', 'consequents' and 'confidence'.
        - user_itemset: Antecedent to match; it is compared with == against
          the 'antecedents' column, so it must be in the same representation
          (e.g. the text "frozenset({'whole milk'})" for rules read from CSV).
        - n (int): Maximum number of rules to return.

    Returns:
        - tuple: (confidence, top_recommendations). With at least one match
          both are lists of up to n entries ordered by descending confidence;
          without a match they are 0 and "" (callers filter on Confidence != 0).
    """
    # Filter first, then sort only the matching rows: the function is called
    # once per item, and sorting the whole table each time is wasted work.
    matching_rules = rules[rules['antecedents'] == user_itemset]
    if len(matching_rules) == 0:
        return 0, ""

    best_rules = matching_rules.sort_values(by='confidence', ascending=False).head(n)
    confidence = best_rules['confidence'].tolist()
    top_recommendations = best_rules['consequents'].tolist()
    return confidence, top_recommendations

In [None]:
# Example query: the (up to) five most confident rules whose antecedent is
# exactly 'whole milk', collected into a two-column table.
confidence,rec= rule_recommandation(rules,"frozenset({'whole milk'})",5)
# The columns are assigned one by one to an initially empty frame.
result=pd.DataFrame(columns=["Recommandation","Confidence"])
result["Recommandation"]=rec
result["Confidence"]=confidence

In [None]:
# Display the recommendations the association rules give for 'whole milk'.
result

In [None]:
# For every purchase row of the TRAIN split, look up the single most
# confident rule whose antecedent is the purchased item.
new_test_data = df_train.reset_index(drop=True)

# The best rule depends only on the item, so each distinct item is resolved
# once and the frame is assembled in one go - instead of querying the rules
# and re-copying `result` with pd.concat for every single row.
best_rule_by_item = {}
for item in new_test_data["itemDescription"].unique():
    frozen_set = str(frozenset([item]))
    confidence, rec = rule_recommandation(rules, frozen_set, 1)
    # on a match the confidence is a one-element list, otherwise the scalar 0
    if isinstance(confidence, list):
        confidence = confidence[0]
    best_rule_by_item[item] = (str(rec), confidence)

result = pd.DataFrame(
    [(item, *best_rule_by_item[item]) for item in new_test_data["itemDescription"]],
    columns=["itemDescription", "predicted_item", "Confidence"],
)

In [None]:
# Display the per-row rule predictions, including rows without a matching rule.
result

In [None]:
# Keep only the rows for which a rule was found: rule_recommandation reports
# a confidence of 0 when no rule matches the item.
result_df = result[result['Confidence'] != 0]
result_df

In [None]:
# Top 10 most frequently predicted items: count how often each prediction
# occurs, keep the ten largest counts and sort them ascending for the chart.
df = result_df.groupby(["predicted_item"])["predicted_item"].count().sort_values(axis=0, ascending=False)
df = df.iloc[:10].sort_values(ascending=True)
df.plot(kind="barh")
plt.ylabel('Items')  # axis label used to be misspelled "Itmes"
plt.xlabel('Number of Predicted Items')
plt.title('Top 10 Popular Predicted Items')

In [None]:
# Test-data recommendation: for every purchase row of the test split, look up
# the single most confident rule whose antecedent is the purchased item.
new_test_data = df_test.reset_index(drop=True)

# The best rule depends only on the item, so each distinct item is resolved
# once and the frame is assembled in one go - instead of querying the rules
# and re-copying `result` with pd.concat for every single row.
best_rule_by_item = {}
for item in new_test_data["itemDescription"].unique():
    frozen_set = str(frozenset([item]))
    confidence, rec = rule_recommandation(rules, frozen_set, 1)
    # on a match the confidence is a one-element list, otherwise the scalar 0
    if isinstance(confidence, list):
        confidence = confidence[0]
    best_rule_by_item[item] = (str(rec), confidence)

result = pd.DataFrame(
    [(item, *best_rule_by_item[item]) for item in new_test_data["itemDescription"]],
    columns=["itemDescription", "predicted_item", "Confidence"],
)

# rows without a matching rule carry a confidence of 0
result_df = result[result['Confidence'] != 0]

# Top 10 most frequently predicted items
df = result_df.groupby(["predicted_item"])["predicted_item"].count().sort_values(axis=0, ascending=False)
df = df.iloc[:10].sort_values(ascending=True)
df.plot(kind="barh")
plt.ylabel('Items')  # axis label used to be misspelled "Itmes"
plt.xlabel('Number of Predicted Items')
plt.title('Top 10 Popular Predicted Items')

In [None]:
# Display the test-split predictions that matched a rule.
result_df

## References

* www.shaped.ai. (n.d.). Evaluating recommendation systems (mAP, MMR, NDCG) | Shaped Blog. [online] Available at: https://www.shaped.ai/blog/evaluating-recommendation-systems-map-mmr-ndcg.
* Dhinakaran, A. (2023). Demystifying NDCG. [online] Medium. Available at: https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0.