### 用来将csv文件制作成透视表

In [2]:
# 过滤csv的标签
import pandas as pd

# 指定你的CSV文件路径
csv_file_path = 'bert_reduce_merged_file_title.csv'

# 读取CSV文件
df = pd.read_csv(csv_file_path, usecols=['user_id', 'bert_rating', 'title_without_series'])

# 指定新CSV文件的保存路径
new_csv_file_path = 'pivot_use.csv'

# 导出DataFrame为CSV
df.to_csv(new_csv_file_path, index=False)



In [2]:
import pandas as pd

# 替换为你的文件路径
file_path = 'pivot_use.csv'

# 使用pandas读取前几行
df = pd.read_csv(file_path, nrows=5)  # 这里的5表示读取前5行，你可以根据需要调
print(df)



# 初始化计数器
row_count = 0

# 以块的方式读取文件
for chunk in pd.read_csv(file_path, chunksize=10000):  # 这里的10000是每个块的大小，你可以根据需要和内存限制调整
    row_count += len(chunk)

print(f'Total number of rows: {row_count}')


                            user_id                      title_without_series  \
0  cfa37f8ff40f4c05fe58462a0344aa27        Scarlet (The Lunar Chronicles, #2)   
1  6a3a8c4abf9dc036d2a652fd364a4556                   New Moon (Twilight, #2)   
2  3d7107325f0bd2bd27a34904aededac3         Winter (The Lunar Chronicles, #4)   
3  e7a00ca7bc7dc46217540ffce134f573  Angelfall (Penryn & the End of Days, #1)   
4  f05cc5cae2060dbb24fa88313b28783f                      Let the Right One In   

   bert_rating  
0            5  
1            4  
2            3  
3            5  
4            4  
Total number of rows: 1141536


In [1]:
# 创建csr矩阵做协同过滤
import pandas as pd
from scipy.sparse import csr_matrix

# 替换为你的CSV文件路径
file_path = 'pivot_use.csv'

# 使用pandas读取CSV文件
df = pd.read_csv(file_path)

# 将用户ID和书名映射为整数索引
df['user_id_idx'] = df['user_id'].astype('category').cat.codes
df['book_title_idx'] = df['title_without_series'].astype('category').cat.codes

# 确保所有索引都是非负的
assert df['user_id_idx'].min() >= 0, "user_id_idx contains negative values"
assert df['book_title_idx'].min() >= 0, "book_title_idx contains negative values"

# 创建CSR矩阵
ratings_csr = csr_matrix((df['bert_rating'], (df['user_id_idx'], df['book_title_idx'])), shape=(df['user_id_idx'].max() + 1, df['book_title_idx'].max() + 1))



In [2]:
ratings_csr

<174980x119868 sparse matrix of type '<class 'numpy.int64'>'
	with 1140550 stored elements in Compressed Sparse Row format>

In [6]:
# # 建立csr_matrix
# from scipy.sparse import csr_matrix
# import pandas as pd

# # 假设df是你的DataFrame

# # 将用户ID和书名映射为整数索引
# user_ids = pd.Series(df['user_id'].unique()).index.to_series().to_dict()
# book_titles = pd.Series(df['title_without_series'].unique()).index.to_series().to_dict()

# df['user_id_idx'] = df['user_id'].map(user_ids)
# df['book_title_idx'] = df['title_without_series'].map(book_titles)

# # 创建CSR矩阵
# ratings_csr = csr_matrix((df['bert_rating'], (df['user_id_idx'], df['book_title_idx'])), shape=(len(user_ids), len(book_titles)))


### 训练模型

In [7]:
# ! pip install surprise


In [8]:
# 爆内存
# from surprise import Dataset, Reader
# from surprise import KNNBasic
# from surprise.model_selection import train_test_split
# from surprise import accuracy
# import pickle

# # 为Surprise创建数据集
# reader = Reader(rating_scale=(1, 5))  # 假设评分在1到5之间
# data = Dataset.load_from_df(df[['user_id', 'title_without_series', 'bert_rating']], reader)

# # 划分数据集为训练集和测试集
# trainset, testset = train_test_split(data, test_size=0.25)

# # 使用KNN算法
# algo = KNNBasic()

# # 训练模型
# algo.fit(trainset)

# # 在测试集上评估模型
# predictions = algo.test(testset)
# accuracy.rmse(predictions)

# # 保存模型
# file_name = 'collaborative_filtering_model.pkl'
# with open(file_name, 'wb') as file:
#     pickle.dump(algo, file)



In [9]:
# 还是不行
# from surprise import SVD
# from surprise.model_selection import train_test_split
# from surprise import accuracy

# # 使用SVD算法
# algo = SVD()

# # 划分数据集为训练集和测试集
# trainset, testset = train_test_split(data, test_size=0.25)

# # 训练模型
# algo.fit(ratings_csr)

# # 在测试集上评估模型
# predictions = algo.test(testset)
# print(f"RMSE: {accuracy.rmse(predictions)}")


In [6]:
# 验证可用性

from sklearn.neighbors import NearestNeighbors

# 初始化NearestNeighbors模型
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

# 训练模型
model_knn.fit(ratings_csr)

# 定义一个函数来为给定的user_id_idx进行推荐
def recommend_for_user(user_id_idx, ratings_csr, model_knn, n_recommendations=10):
    # 查询这个用户的k个最近邻居
    distances, indices = model_knn.kneighbors(ratings_csr[user_id_idx], n_neighbors=n_recommendations+1)
    
    # 返回最近邻居的索引和距离
    # 注意我们跳过第一个最近邻（即用户自己）
    return indices[0][1:], distances[0][1:]

# 假设我们要为user_id_idx=0的用户推荐书籍
user_id_to_idx = pd.Series(df['user_id_idx'].values, index=df['user_id']).to_dict()
user_id_str = '3d7107325f0bd2bd27a34904aededac3'
user_id_idx = user_id_to_idx[user_id_str]  # 将用户ID字符串转换为整数索引

indices, distances = recommend_for_user(user_id_idx, ratings_csr, model_knn)

print("Recommended Indices:", indices)
print("Distances:", distances)

# 你可以使用indices来获取书籍的标题或ID，这取决于你如何将索引映射回你的数据

# 将indices转为book_title
book_idx_to_title = pd.Series(df['title_without_series'].values, index=df['book_title_idx'].values).to_dict()

recommended_book_titles = [book_idx_to_title.get(idx, "Unknown Book") for idx in indices]

print("Recommended Book Titles:")
for title in recommended_book_titles:
    print(title)



Recommended Indices: [100536  56074  49050 142207  16732  32003   6833  69627   2237  69767]
Distances: [0.58040932 0.58040932 0.58462642 0.58462642 0.58462642 0.58462642
 0.58462642 0.58462642 0.58462642 0.59371986]
Recommended Book Titles:
The Secret Horses of Briar Hill
Madly, Deeply
Kerjäläisprinsessa (Gigi ja Henry, #1)
Unknown Book
Chaos (Čísla, #2)
Even Villains Have Interns (Heroes and Villains #3)
Any Price
Rainha das Trevas (Trilogia das Jóias Negras, #3)
A Place Without Shadows (Deadlock Trilogy, #2)
Rapunzel


In [5]:
! pip install streamlit joblib

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting streamlit
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a9/56/b6879cab429dd9d907d624766e049ee286219751e3cd8611ebdf7fb66cc5/streamlit-1.32.1-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting altair<6,>=4.0 (from streamlit)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c5/e4/7fcceef127badbb0d644d730d992410e4f3799b295c9964a172f92a469c7/altair-5.2.0-py3-none-any.whl (996 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.9/996.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting blinker<2,>=1.0.0 (from streamlit)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/fa/2a/7f3714cbc6356a0efec525ce7a0613d581072ed6eb53eb7b9754f33db807/blinker-1.7.0-py3-none-any.whl (13 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downl

In [7]:
# 训练模型
import joblib

# 保存模型
joblib.dump(model_knn, 'model_knn.pkl')

# 保存用户ID到索引的映射
joblib.dump(user_id_to_idx, 'user_id_to_idx.pkl')

# 保存书籍索引到书名的映射
joblib.dump(book_idx_to_title, 'book_idx_to_title.pkl')



['book_idx_to_title.pkl']

In [1]:
import streamlit as st
import joblib

# 加载模型和映射
model_knn = joblib.load('model_knn.pkl')
user_id_to_idx = joblib.load('user_id_to_idx.pkl')
book_idx_to_title = joblib.load('book_idx_to_title.pkl')

# Streamlit界面代码
st.title('书籍推荐系统')

# 假设ratings_csr也需要被加载或在这里以某种方式被构建

user_id_input = st.text_input('请输入您的用户ID:', '')

if user_id_input:
    user_id_idx = user_id_to_idx.get(user_id_input, None)
    if user_id_idx is not None:
        indices, distances = recommend_for_user(user_id_idx, ratings_csr, model_knn)
        recommended_books = [book_idx_to_title.get(idx, "Unknown Book") for idx in indices]
        st.write('为您推荐的书籍:')
        for book in recommended_books:
            st.write(book)
    else:
        st.write("未找到用户ID，请确保输入正确。")



ModuleNotFoundError: No module named 'streamlit'

In [2]:
! pip install streamlit

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting packaging<24,>=16.8 (from streamlit)
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/ec/1a/610693ac4ee14fcdf2d9bf3c493370e4f2ef7ae2e19217d7a237ff42367d/packaging-23.2-py3-none-any.whl (53 kB)
Installing collected packages: packaging
  Attempting uninstall: packaging
    Found existing installation: packaging 24.0
    Uninstalling packaging-24.0:
      Successfully uninstalled packaging-24.0
Successfully installed packaging-23.2


In [3]:
! python --version

zsh:1: command not found: python
