## 1. Read the dataset

In [176]:
import pandas as pd

train_data_original = pd.read_csv("basket_data_by_date_train.csv")
test_data_original = pd.read_csv("basket_data_by_date_test.csv")


Check the structure and content of the dataset

In [177]:
print("Train Data Info:")
print(train_data_original.info())

print("\nTest Data Info:")
print(test_data_original.info())

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BillNo      40000 non-null  int64  
 1   Itemname    40000 non-null  object 
 2   Quantity    40000 non-null  int64  
 3   Date        40000 non-null  object 
 4   Price       40000 non-null  float64
 5   CustomerID  40000 non-null  int64  
 6   cost        40000 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 2.1+ MB
None

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BillNo      10000 non-null  float64
 1   Itemname    10000 non-null  object 
 2   Quantity    10000 non-null  float64
 3   Date        10000 non-null  object 
 4   Price       10000 non-null  float64
 5   CustomerID  10000 non-

In [178]:
test_data_original.describe()

Unnamed: 0,BillNo,Quantity,Price,CustomerID,cost
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,545209.692,3.6721,3.971454,15433.0829,11.840678
std,492.638789,2.639788,6.777596,1616.660727,13.086198
min,544398.0,1.0,0.16,12352.0,0.19
25%,544795.0,2.0,1.65,14227.0,3.75
50%,545181.0,3.0,2.95,15358.0,9.9
75%,545664.0,6.0,4.95,16931.0,16.5
max,546084.0,10.0,195.0,18283.0,290.0


In [179]:
train_data_original.describe()

Unnamed: 0,BillNo,Quantity,Price,CustomerID,cost
count,40000.0,40000.0,40000.0,40000.0,40000.0
mean,540254.879225,3.4877,3.732165,15577.606525,11.097411
std,2380.444952,2.611766,5.71163,1730.347123,13.298301
min,536365.0,1.0,0.1,12347.0,0.14
25%,538093.0,1.0,1.65,14224.0,3.3
50%,540373.0,2.0,2.95,15570.0,7.95
75%,542360.0,6.0,4.65,17220.0,15.8
max,544398.0,10.0,295.0,18283.0,527.7


## 2. Pre-process the data

#### 2.1 Clean the missing value

In [180]:
# check if there are missing values in the train set
print("Train Data Missing Values:")
print(train_data_original.isnull().sum())

# check if there are missing values in the test set
print("\nTest Data Missing Values:")
print(test_data_original.isnull().sum())

Train Data Missing Values:
BillNo        0
Itemname      0
Quantity      0
Date          0
Price         0
CustomerID    0
cost          0
dtype: int64

Test Data Missing Values:
BillNo        30000
Itemname      30000
Quantity      30000
Date          30000
Price         30000
CustomerID    30000
cost          30000
dtype: int64


In [181]:
test_data_clean = test_data_original

In [182]:
# 删除从第10001行开始的缺失值
test_data_clean = test_data_original.iloc[:10000]

# 检查删除缺失值后的数据集大小
print("The size after cleaned the missing values", test_data_clean.shape)

The size after cleaned the missing values (10000, 7)


In [183]:
test_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   BillNo      10000 non-null  float64
 1   Itemname    10000 non-null  object 
 2   Quantity    10000 non-null  float64
 3   Date        10000 non-null  object 
 4   Price       10000 non-null  float64
 5   CustomerID  10000 non-null  float64
 6   cost        10000 non-null  float64
dtypes: float64(5), object(2)
memory usage: 547.0+ KB


#### 2.2 Remove not related columns

In [184]:
# delete unrelated columns
train_data_drop = train_data_original.drop(columns=['BillNo', 'Date', 'Price', 'cost'])
test_data = test_data_clean.drop(columns=['BillNo', 'Date', 'Price', 'cost'])

In [185]:
train_data_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Itemname    40000 non-null  object
 1   Quantity    40000 non-null  int64 
 2   CustomerID  40000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 937.6+ KB


In [155]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Itemname    10000 non-null  object 
 1   Quantity    10000 non-null  float64
 2   CustomerID  10000 non-null  float64
dtypes: float64(2), object(1)
memory usage: 234.5+ KB


## 新添加：需要找出test和training中没有交集的customerID和itemname，并且把他们给drop掉

traindata太长了所以要缩短

In [186]:
train_data = train_data_drop

In [187]:
import pandas as pd

# 找到测试数据和训练数据中共同出现的商品名称和顾客ID
common_item_names = set(test_data['Itemname']).intersection(set(train_data['Itemname']))
common_customer_ids = set(test_data['CustomerID']).intersection(set(train_data['CustomerID']))

test_data = test_data[test_data['Itemname'].isin(common_item_names)]
test_data = test_data[test_data['CustomerID'].isin(common_customer_ids)]

train_data = train_data[train_data['Itemname'].isin(common_item_names)]
train_data = train_data[train_data['CustomerID'].isin(common_customer_ids)]

In [188]:
# 找出train_data中存在但test_data中不存在的商品名称和顾客ID
train_item_names = set(train_data['Itemname'])
train_customer_ids = set(train_data['CustomerID'])

extra_item_names_train = train_item_names - common_item_names
extra_customer_ids_train = train_customer_ids - common_customer_ids

# 找出test_data中存在但train_data中不存在的商品名称和顾客ID
test_item_names = set(test_data['Itemname'])
test_customer_ids = set(test_data['CustomerID'])

extra_item_names_test = test_item_names - common_item_names
extra_customer_ids_test = test_customer_ids - common_customer_ids

print("Train data中存在但test data中不存在的商品名称:", extra_item_names_train)
print("Train data中存在但test data中不存在的顾客ID:", extra_customer_ids_train)

print("Test data中存在但train data中不存在的商品名称:", extra_item_names_test)
print("Test data中存在但train data中不存在的顾客ID:", extra_customer_ids_test)


Train data中存在但test data中不存在的商品名称: set()
Train data中存在但test data中不存在的顾客ID: set()
Test data中存在但train data中不存在的商品名称: set()
Test data中存在但train data中不存在的顾客ID: set()


#### 2.3 customer-item matrix

Now I will convert the data into the form of customer-item matrix

In [189]:
len(train_data.CustomerID.unique()), len(train_data.Itemname.unique())

(248, 1397)

In [193]:
import pandas as pd

# 使用 pivot_table 构建用户-商品矩阵
user_item_matrix = pd.pivot_table(train_data, index='CustomerID', columns='Itemname', aggfunc=len, fill_value=0)

# 打印用户-商品矩阵
user_item_matrix

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Itemname,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE RED RETROSPOT,...,YELLOW FLOWERS FELT HANDBAG KIT,YELLOW GIANT GARDEN THERMOMETER,YELLOW METAL CHICKEN HEART,YELLOW SHARK HELICOPTER,YOU'RE CONFUSING ME METAL SIGN,ZINC FINISH 15CM PLANTER POTS,ZINC HEART LATTICE CHARGER LARGE,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC WILLIE WINKIE CANDLE STICK
CustomerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18118,0,0,0,0,2,1,2,1,1,0,...,2,0,0,0,0,1,0,0,0,0
18198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18223,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. Calculate the similarity

In [194]:
from sklearn.metrics.pairwise import cosine_similarity

# compute the cosine similarity matrix
user_similarity_matrix = cosine_similarity(user_item_matrix)

# convert the similarity matrix into a DataFrame
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

print(user_similarity_df.iloc[:10, :7])

CustomerID     12352  12415     12417    12423     12431     12437     12464
CustomerID                                                                  
12352       1.000000    0.0  0.074536  0.00000  0.000000  0.000000  0.000000
12415       0.000000    1.0  0.000000  0.00000  0.000000  0.000000  0.000000
12417       0.074536    0.0  1.000000  0.00000  0.000000  0.000000  0.105409
12423       0.000000    0.0  0.000000  1.00000  0.081650  0.000000  0.000000
12431       0.000000    0.0  0.000000  0.08165  1.000000  0.000000  0.103280
12437       0.000000    0.0  0.000000  0.00000  0.000000  1.000000  0.000000
12464       0.000000    0.0  0.105409  0.00000  0.103280  0.000000  1.000000
12471       0.000000    0.0  0.030429  0.00000  0.000000  0.000000  0.057735
12474       0.000000    0.0  0.055556  0.00000  0.027217  0.000000  0.000000
12523       0.000000    0.0  0.000000  0.00000  0.000000  0.707107  0.000000


In [195]:
user_item_matrix = pd.pivot_table(train_data, index='CustomerID', columns='Itemname', aggfunc=len)
user_item_matrix

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Itemname,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE RED RETROSPOT,...,YELLOW FLOWERS FELT HANDBAG KIT,YELLOW GIANT GARDEN THERMOMETER,YELLOW METAL CHICKEN HEART,YELLOW SHARK HELICOPTER,YOU'RE CONFUSING ME METAL SIGN,ZINC FINISH 15CM PLANTER POTS,ZINC HEART LATTICE CHARGER LARGE,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC WILLIE WINKIE CANDLE STICK
CustomerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12352,,,,,,,,,,,...,,,,,,,,,,
12415,,,,,,,,,,,...,,,,,,,,,,
12417,,,,,,,,,,,...,,,,,,,,,,
12423,,,,,,,,,,,...,,,,,,,,,,
12431,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,,,,,,,,,,,...,,,,,,,,,,
18118,,,,,2.0,1.0,2.0,1.0,1.0,,...,2.0,,,,,1.0,,,,
18198,,,,,,,,,,,...,,,,,,,,,,
18223,,,,,,,,,,,...,,,,,,,,,,


## 4. Methods for making recommendation

#### 4.1 get the top similar customers
Here we will use the number of purchases made by the customer as a rating, the more purchases made, the higher the rating.

In [196]:
def top_similar_users(user_similarity_matrix, user_id, n):
    similar_users = user_similarity_matrix.loc[user_id]
    
    # get the index of the top n similar users
    top_n_similar_users = similar_users.sort_values(ascending=False).iloc[1:n+1].index
    return top_n_similar_users


#### 4.2 predict the rating for each product
For each item not rated by the target user, the rating is predicted by calculating the weighted average of the ratings given by the most similar users, where the weight is the similarity between the user and the target user.

In [197]:
def predict_user_based(user_item_matrix, user_similarity_matrix, user_id, item_id):
    # get the top 20 similar users
    top_sim_users = top_similar_users(user_similarity_matrix, user_id, 20)
    
    user_ratings = user_item_matrix.loc[user_id]
    
    mean_rating = user_ratings.mean()
    
    # compute the predicted rating
    pred_rating = 0
    sim_sum = 0
    
    for other_user in top_sim_users:
        other_user_rating = user_item_matrix.loc[other_user, item_id]
        if other_user_rating > 0:
            other_user_mean = user_item_matrix.loc[other_user].mean()
            sim_user = user_similarity_matrix.loc[user_id, other_user]
            pred_rating += sim_user * (other_user_rating - other_user_mean)
            sim_sum += sim_user
    
    if sim_sum > 0:
        pred_rating = mean_rating + pred_rating / sim_sum
    else:
        pred_rating = mean_rating
    
    return pred_rating

#### 4.3 Recommends the top N items with the highest predictive scores

In [198]:
def recommend_top_n_items(train_user_item_matrix, user_similarity_matrix, test_set, N):
    recommendations = pd.DataFrame(index=test_set.index, columns=['Top_Items'])
    
    # 使用训练集中的用户项目矩阵和用户相似性矩阵
    for user_id in test_set.index:
        # 获取该用户在训练集中尚未评分的物品
        unrated_items = train_user_item_matrix.loc[user_id][train_user_item_matrix.loc[user_id] == 0].index
        
        user_predictions = {}
        for item_id in unrated_items:
            user_predictions[item_id] = predict_user_based(train_user_item_matrix, user_similarity_matrix, user_id, item_id)
        
        # 按预测评分排序
        sorted_predictions = sorted(user_predictions.items(), key=lambda x: x[1], reverse=True)
        
        # 获取前 N 个物品
        top_n_items = [item[0] for item in sorted_predictions[:N]]
        
        recommendations.loc[user_id, 'Top_Items'] = top_n_items
    
    return recommendations


## 6. Five recommended examples of what users actually buy

In [203]:
print(user_item_matrix.index)  # 打印 DataFrame df 的索引值
print(user_similarity_df.index)  # 打印 Series series 的索引值
user_item_matrix.index.equals(user_similarity_df.index)



Index([12352, 12415, 12417, 12423, 12431, 12437, 12464, 12471, 12474, 12523,
       ...
       18041, 18044, 18055, 18061, 18065, 18075, 18118, 18198, 18223, 18283],
      dtype='int64', name='CustomerID', length=248)
Index([12352, 12415, 12417, 12423, 12431, 12437, 12464, 12471, 12474, 12523,
       ...
       18041, 18044, 18055, 18061, 18065, 18075, 18118, 18198, 18223, 18283],
      dtype='int64', name='CustomerID', length=248)


True

In [204]:
# 重新设置索引并删除第一个索引位置
user_item_matrix = user_item_matrix.reset_index(drop=True)
user_similarity_df = user_similarity_df.reset_index(drop=True)


In [205]:
test_recommendations = recommend_top_n_items(user_item_matrix, user_similarity_df, test_data, N=20)

KeyError: 248

In [412]:
for user_id, row in test_recommendations.iterrows():
    print("Top 5 recommended items for user", user_id, ":", row['Top_Items'])

Top 5 recommended items for user 15311.0 : ['RIBBON REEL HEARTS DESIGN', 'RIBBON REEL LACE DESIGN', 'JUMBO BAG SCANDINAVIAN PAISLEY', 'RED RETROSPOT WASHBAG', 'JUMBO STORAGE BAG SUKI']
Top 5 recommended items for user 12594.0 : ['BALLOON ART MAKE YOUR OWN FLOWERS', 'LUNCH BAG  BLACK SKULL.', 'LUNCH BAG WOODLAND', 'SKULL SHOULDER BAG', 'CHARLOTTE BAG SUKI DESIGN']
Top 5 recommended items for user 16401.0 : ['JUMBO STORAGE BAG SUKI', 'TRAVEL CARD WALLET KEEP CALM', 'TRAVEL CARD WALLET RETROSPOT', 'GRAND CHOCOLATECANDLE', 'HANGING HEN ON NEST DECORATION']
Top 5 recommended items for user 16814.0 : ['FROSTED WHITE BASE', 'JUMBO BAG PINK POLKADOT', 'TRAVEL SEWING KIT', 'MAGIC DRAWING SLATE SPACEBOY', 'WATERING CAN GARDEN MARKER']
Top 5 recommended items for user 16726.0 : ['TEA TIME OVEN GLOVE', 'TOILET METAL SIGN', 'LUNCH BAG SUKI  DESIGN', 'RIDGED GLASS T-LIGHT HOLDER', 'GRAND CHOCOLATECANDLE']
Top 5 recommended items for user 16265.0 : ['DOORMAT RED RETROSPOT', 'HEN HOUSE W CHICK STANDIN

## Evaluation

In [None]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5112 entries, 0 to 9982
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Itemname    5112 non-null   object 
 1   Quantity    5112 non-null   float64
 2   CustomerID  5112 non-null   float64
dtypes: float64(2), object(1)
memory usage: 159.8+ KB


In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10140 entries, 61 to 39999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Itemname    10140 non-null  object
 1   Quantity    10140 non-null  int64 
 2   CustomerID  10140 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 316.9+ KB


In [419]:
def evaluate_recommendations(recommendations, test_data):
    total_correct_predictions = 0
    total_predictions = 0

    for user_id, row in recommendations.iterrows():
        recommended_items = row['Top_Items']
        actual_items = test_data[test_data['CustomerID'] == user_id]['Itemname'].tolist()
        
        # print("User ID:", user_id)
        # print("Actual Items:", actual_items)
        # print("Recommended Items:", recommended_items)

        correct_predictions = len(set(recommended_items) & set(actual_items))
        
        total_correct_predictions += correct_predictions
        total_predictions += len(recommended_items)
    
    accuracy = total_correct_predictions / total_predictions
    return accuracy

# 调用评估函数
accuracy = evaluate_recommendations(test_recommendations, test_data)
print("Accuracy:", accuracy)

Accuracy: 0.0


In [417]:
test_recommendations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 15311.0 to 14534.0
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Top_Items  250 non-null    object
dtypes: object(1)
memory usage: 12.0+ KB


In [418]:
def compute_precision(test_set, recommendations):
    total_users = len(test_set)
    correct_predictions = 0
    
    for user_id in test_set.index:
        actual_items = test_set.loc[user_id, 'Itemname']  # 假设 'Itemname' 列包含实际物品
        
        recommended_items = recommendations.loc[user_id, 'Top_Items']
        
        # 计算推荐物品与实际物品的交集数量
        intersection = len(set(actual_items) & set(recommended_items))
        
        # 更新正确预测的数量
        correct_predictions += intersection
        
    # 计算准确率
    precision = correct_predictions / (total_users * len(recommended_items))
    
    return precision

# 计算准确率
precision = compute_precision(test_data, test_recommendations)
print("Precision:", precision)


ZeroDivisionError: division by zero