In [31]:
import json
import gzip
from collections import defaultdict

# Load one Goodreads genre review dump (gzip-compressed JSON Lines).
def load_reviews(file_path):
    """Read every review record from a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Returns:
        list: The decoded JSON objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # Decode explicitly as UTF-8: text mode otherwise falls back to the
    # platform's locale encoding, which is not UTF-8 on every machine.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank / whitespace-only lines instead of crashing on them.
            if not line.strip():
                continue
            reviews.append(json.loads(line))
    return reviews

# Summarize basic dataset statistics.
def analyze_reviews(reviews):
    """Print totals and per-user / per-book review-count statistics.

    Args:
        reviews: Sized collection of dicts, each with ``user_id`` and
            ``book_id`` keys.

    Returns:
        None. All results are printed.
    """
    user_count = defaultdict(int)
    book_count = defaultdict(int)

    # Count how many reviews each user wrote and each book received.
    for review in reviews:
        user_count[review['user_id']] += 1
        book_count[review['book_id']] += 1

    print(f"总评论数: {len(reviews)}")
    print(f"总用户数: {len(user_count)}")
    print(f"总书籍数: {len(book_count)}")

    # With no reviews there is no distribution to describe; stopping here
    # avoids a ZeroDivisionError and min()/max() on empty sequences.
    if not user_count:
        return

    user_review_counts = list(user_count.values())
    book_review_counts = list(book_count.values())

    print("\n用户评论数分析:")
    print(f"平均每个用户的评论数: {sum(user_review_counts) / len(user_review_counts):.2f}")
    print(f"最少评论数: {min(user_review_counts)}, 最多评论数: {max(user_review_counts)}")

    print("\n书籍评论数分析:")
    print(f"平均每本书的评论数: {sum(book_review_counts) / len(book_review_counts):.2f}")
    print(f"最少评论数: {min(book_review_counts)}, 最多评论数: {max(book_review_counts)}")

# Main flow: load the history/biography review dump, then print summary statistics.
file_path = '/workspace/goodreads/goodreads_reviews_history_biography.json.gz'

print("加载数据中...")
# load_reviews returns the whole dump as one in-memory list.
reviews = load_reviews(file_path)

print("分析数据中...")
analyze_reviews(reviews)


加载数据中...
分析数据中...
总评论数: 2066193
总用户数: 238450
总书籍数: 302346

用户评论数分析:
平均每个用户的评论数: 8.67
最少评论数: 1, 最多评论数: 3368

书籍评论数分析:
平均每本书的评论数: 6.83
最少评论数: 1, 最多评论数: 11300


In [3]:
import json
import gzip

# Peek at a single record of a Goodreads genre dump.
def print_first_review(file_path):
    """Pretty-print the first JSON object of a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Raises:
        json.JSONDecodeError: If the first line is empty or not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        first_line = f.readline()  # only the first record is needed
    first_review = json.loads(first_line)
    print(json.dumps(first_review, indent=2))  # indented for readability

# Main flow
file_path = '/workspace/goodreads/goodreads_reviews_history_biography.json.gz'

# Print the first JSON object to see which fields a review record carries.
print("打印第一个 JSON 对象中的特征:")
print_first_review(file_path)


打印第一个 JSON 对象中的特征:
{
  "user_id": "8842281e1d1347389f2ab93d60773d4d",
  "book_id": "29893493",
  "review_id": "c23406fb584d6304d1dd4c75ce26ea3a",
  "rating": 5,
  "review_text": "I haven't read a non-fiction book this engaging in some time. This was an amazingly well written autobiography. It read like a fast paced novel through much of it. Many autobiographies are too long - this one if anything is too short! Like many startup companies, the story of Nike (or Blue Ribbon Shoes as it was initially called) is one of trials, tribulations, and lots of passion and grit. I ate it up, and highly recommend it. \n The most defining moment of the story is Knights ballsy move while backpacking through Japan at the age of 24, to walk into a Japanese shoe manufacturer and say he has a shoe distribution company, and get a exclusive deal for the western US. Gutsy. He \"just did it\" (sorry, but apt). I love learning examples of this kind of \"Do things that don't scale\" start to successful companie

In [32]:
import json
import gzip
from collections import defaultdict

# Load one Goodreads genre review dump (gzip-compressed JSON Lines).
def load_reviews(file_path):
    """Read every review record from a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Returns:
        list: The decoded JSON objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # Explicit UTF-8: the text-mode default is the platform's locale encoding.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank / whitespace-only lines rather than failing to parse them.
            if not line.strip():
                continue
            reviews.append(json.loads(line))
    return reviews

# Pick the most-reviewed books (default: the top 2608).
def select_top_books(reviews, top_n=2608):
    """Rank books by review count and keep the ``top_n`` most reviewed.

    Args:
        reviews: Iterable of dicts with a ``book_id`` key.
        top_n: Maximum number of books to keep.

    Returns:
        tuple: ``(top_books, top_book_ids)`` where ``top_books`` is a list of
        ``(book_id, review_count)`` pairs in descending count order and
        ``top_book_ids`` is the set of those book ids.
    """
    tally = {}
    for entry in reviews:
        key = entry['book_id']
        tally[key] = tally.get(key, 0) + 1

    # Stable descending sort: books with equal counts keep first-seen order.
    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    top_books = ranked[:top_n]
    top_book_ids = {pair[0] for pair in top_books}
    return top_books, top_book_ids

# Count, per user, how many reviews fall on the selected popular books.
def analyze_user_reading(reviews, top_book_ids):
    """Count each user's reviews that target a book in ``top_book_ids``.

    Args:
        reviews: Iterable of dicts with ``user_id`` and ``book_id`` keys.
        top_book_ids: Container of book ids that count as popular.

    Returns:
        collections.defaultdict: user id -> number of matching reviews. Users
        with no matching review do not appear.
    """
    hits = defaultdict(int)
    for entry in reviews:
        reader, book = entry['user_id'], entry['book_id']
        if book not in top_book_ids:
            continue
        hits[reader] += 1
    return hits

# Main flow
file_path = '/workspace/goodreads/goodreads_reviews_history_biography.json.gz'

# Single source of truth for how many books count as "popular". Every label
# printed below is derived from it, so the messages cannot drift away from the
# value actually used (they previously said 4000 while 1608 was passed).
TOP_N_BOOKS = 1608
# How many rows of each listing to preview.
PREVIEW_ROWS = 10

print("加载数据中...")
reviews = load_reviews(file_path)

# Select the most-reviewed books.
top_books, top_book_ids = select_top_books(reviews, top_n=TOP_N_BOOKS)

# Preview the most popular books and their review counts.
print(f"最受欢迎的 {TOP_N_BOOKS} 本书的评论次数分布:")
for book_id, count in top_books[:PREVIEW_ROWS]:
    print(f"书籍ID: {book_id}, 评论次数: {count}")

# Count how many of the popular books each user reviewed.
user_top_book_count = analyze_user_reading(reviews, top_book_ids)

# Preview a few users (in first-seen order, not ranked).
print(f"\n用户在前{TOP_N_BOOKS}本书中的阅读情况:")
for user_id, count in list(user_top_book_count.items())[:PREVIEW_ROWS]:
    print(f"用户ID: {user_id}, 阅读涉及的{TOP_N_BOOKS}本书中的数量: {count}")


加载数据中...
最受欢迎的 1608 本书的评论次数分布:
书籍ID: 19063, 评论次数: 11300
书籍ID: 4667024, 评论次数: 8239
书籍ID: 18143977, 评论次数: 7419
书籍ID: 2657, 评论次数: 7342
书籍ID: 43641, 评论次数: 6091
书籍ID: 10964, 评论次数: 5775
书籍ID: 21853621, 评论次数: 4564
书籍ID: 10644930, 评论次数: 4267
书籍ID: 2728527, 评论次数: 4257
书籍ID: 7445, 评论次数: 3937

用户在前1608本书中的阅读情况:
用户ID: 8842281e1d1347389f2ab93d60773d4d, 阅读涉及的4000本书中的数量: 23
用户ID: 72fb0d0087d28c832f15776b0d936598, 阅读涉及的4000本书中的数量: 1
用户ID: d986f354a045ffb91234e4af4d1b12fd, 阅读涉及的4000本书中的数量: 1
用户ID: 704eb93a316aff687a93d5215882eb21, 阅读涉及的4000本书中的数量: 1
用户ID: f4d16ea4ac59af59d257631398af39f4, 阅读涉及的4000本书中的数量: 1
用户ID: 4b3636a043e5c99fa27ac897ccfa1151, 阅读涉及的4000本书中的数量: 1
用户ID: 903d4b859e86a1dd6d7640849cc7067c, 阅读涉及的4000本书中的数量: 2
用户ID: afc070543f19028dc7e7f084a0079f72, 阅读涉及的4000本书中的数量: 6
用户ID: d92c94bda1ca3ec254f3aa95757c7831, 阅读涉及的4000本书中的数量: 1
用户ID: 7b2e5fe9fd353fecf3eeebb4850b88d3, 阅读涉及的4000本书中的数量: 3


In [30]:
# Rank users by how many popular books they reviewed and keep the leaders.
def get_top_n_users(user_top_book_count, top_n=1120):
    """Return the ``top_n`` users with the highest counts.

    Args:
        user_top_book_count: Mapping of user id -> count.
        top_n: Maximum number of users to return.

    Returns:
        list: ``(user_id, count)`` pairs in descending count order; users with
        equal counts keep the mapping's iteration order.
    """
    ranking = list(user_top_book_count.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[:top_n]

# Show the counts of the heaviest and lightest reader in a ranked list.
def display_min_max_user_reading(top_users):
    """Print the first and last entry of a ranked ``(user_id, count)`` list.

    Args:
        top_users: Sequence of ``(user_id, count)`` pairs sorted in descending
            count order, so index 0 is the maximum and index -1 the minimum.

    Returns:
        None. Results are printed; an empty sequence prints a short notice
        instead of raising IndexError.
    """
    if not top_users:
        print("没有可显示的用户。")
        return

    max_read_user = top_users[0]
    min_read_user = top_users[-1]

    # The labels no longer hard-code "4000": this function does not know how
    # many popular books were selected, and the old number did not match it.
    print(f"读最多书的用户ID: {max_read_user[0]}, 阅读的热门书中的数量: {max_read_user[1]}")
    print(f"读最少书的用户ID: {min_read_user[0]}, 阅读的热门书中的数量: {min_read_user[1]}")

# Keep the most active users. NOTE: `user_top_book_count` is not defined in
# this cell; it must already exist from the cell that calls analyze_user_reading.
top_n = 1120
top_users = get_top_n_users(user_top_book_count, top_n=top_n)

# Show the counts of the heaviest and the lightest reader that made the cut.
display_min_max_user_reading(top_users)

# Preview the tail of the ranking (the 10 lowest-ranked users that were kept).
# A negative slice keeps this correct for any top_n, and for the case where
# fewer than top_n users exist; the old [1110:1120] slice only fit top_n=1120.
print(f"\n前 {top_n} 用户在热门书中的阅读情况:")
for user_id, count in top_users[-10:]:
    print(f"用户ID: {user_id}, 阅读涉及的热门书中的数量: {count}")


读最多书的用户ID: 7b82d02a42678fbdaaee5e119981bdb8, 阅读的4000本热门书中的数量: 254
读最少书的用户ID: c0bf4b6a130946fe3f1a8e45ec372816, 阅读的4000本热门书中的数量: 46

前 1120 用户在4000本热门书中的阅读情况:
用户ID: cf0bacb5c3718016ce15c43943401b20, 阅读涉及的4000本书中的数量: 46
用户ID: 6792b018843768c7e72a7975127289b2, 阅读涉及的4000本书中的数量: 46
用户ID: 4add8c6a4c47e3eeebf93bfba72848b3, 阅读涉及的4000本书中的数量: 46
用户ID: 098f71df8945d3b3284dc2d0d0505724, 阅读涉及的4000本书中的数量: 46
用户ID: 00cdd405cd06c9d64ac273624252f00b, 阅读涉及的4000本书中的数量: 46
用户ID: 3057b4ba46a03796e64853a8682efbdb, 阅读涉及的4000本书中的数量: 46
用户ID: 559e950b58080e646b733dd2d305a0e2, 阅读涉及的4000本书中的数量: 46
用户ID: 735ad9df127bb15ba1c59fd2f5c00480, 阅读涉及的4000本书中的数量: 46
用户ID: 31b825f2b542020399e2e9706779cbe8, 阅读涉及的4000本书中的数量: 46
用户ID: c0bf4b6a130946fe3f1a8e45ec372816, 阅读涉及的4000本书中的数量: 46


In [36]:
import json
import gzip
from collections import Counter, defaultdict
from datetime import datetime
import heapq

def parse_date(date_string):
    """Convert a timestamp string such as ``Wed Jul 04 03:08:28 +0000 2012`` to epoch seconds.

    Args:
        date_string: Text in ``%a %b %d %H:%M:%S %z %Y`` format, or a falsy
            value (``None`` / empty string).

    Returns:
        float | None: POSIX timestamp, or ``None`` when the input is falsy or
        does not match the expected format.
    """
    fmt = "%a %b %d %H:%M:%S %z %Y"
    parsed = None
    if date_string:
        try:
            parsed = datetime.strptime(date_string, fmt).timestamp()
        except ValueError:
            # Unparseable text is treated the same as a missing date.
            parsed = None
    return parsed

def get_top_books(file_path, top_n=2608):
    """Stream a review dump and return the ``top_n`` most-reviewed books.

    Args:
        file_path: Path to a gzip-compressed JSON Lines file whose records
            carry a ``book_id`` key.
        top_n: Maximum number of books to keep.

    Returns:
        dict: book id -> review count, inserted in descending count order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    book_counts = Counter()
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank / whitespace-only lines instead of crashing on them.
            if not line.strip():
                continue
            book_counts[json.loads(line)['book_id']] += 1
    return dict(book_counts.most_common(top_n))

def get_user_reading_counts(file_path, top_books):
    """Stream a review dump and count each user's reviews of the given books.

    Args:
        file_path: Path to a gzip-compressed JSON Lines file whose records
            carry ``user_id`` and ``book_id`` keys.
        top_books: Container of book ids; only reviews of these are counted.

    Returns:
        collections.defaultdict: user id -> number of matching reviews.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    user_reading_counts = defaultdict(int)
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank / whitespace-only lines instead of crashing on them.
            if not line.strip():
                continue
            review = json.loads(line)
            if review['book_id'] in top_books:
                user_reading_counts[review['user_id']] += 1
    return user_reading_counts

def get_top_users(user_reading_counts, top_n=1120):
    """Keep the ``top_n`` users with the highest counts.

    Args:
        user_reading_counts: Mapping of user id -> count.
        top_n: Maximum number of users to keep.

    Returns:
        dict: user id -> count, inserted in descending count order; users with
        equal counts keep the input mapping's iteration order.
    """
    ranking = list(user_reading_counts.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranking[:top_n])

def load_book_titles(file_path):
    """Build a book-id -> title lookup from a book metadata dump.

    Args:
        file_path: Path to a gzip-compressed JSON Lines file whose records
            carry ``book_id`` and ``title`` keys.

    Returns:
        dict: book id -> title. If a book id repeats, the last record wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``book_id`` or ``title``.
    """
    book_titles = {}
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank / whitespace-only lines instead of crashing on them.
            if not line.strip():
                continue
            book = json.loads(line)
            book_titles[book['book_id']] = book['title']
    return book_titles

def process_and_save_data(reviews_file, books_file, output_file, top_books, top_users):
    """Collect each selected user's reading history and write it to a JSON file.

    Only reviews whose user is in ``top_users`` AND whose book is in
    ``top_books`` are kept. Each kept review becomes a
    ``{'book_id', 'title', 'read_at'}`` record; ``read_at`` comes from the
    review's ``read_at`` field, falling back to ``date_added``, and is ``None``
    when neither parses.

    Args:
        reviews_file: Path to the gzip-compressed JSON Lines review dump.
        books_file: Path to the gzip-compressed JSON Lines book metadata dump.
        output_file: Destination path for the JSON output.
        top_books: Container of book ids to keep.
        top_users: Container of user ids to keep.

    Raises:
        json.JSONDecodeError: If a non-blank review line is not valid JSON.
    """
    book_titles = load_book_titles(books_file)
    user_data = defaultdict(list)

    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with gzip.open(reviews_file, 'rt', encoding='utf-8') as f:
        for line in f:
            # Skip blank / whitespace-only lines instead of crashing on them.
            if not line.strip():
                continue
            review = json.loads(line)
            user_id = review['user_id']
            book_id = review['book_id']

            if user_id in top_users and book_id in top_books:
                read_at = parse_date(review.get('read_at')) or parse_date(review.get('date_added'))
                # Books missing from the metadata dump get a shared placeholder title.
                title = book_titles.get(book_id, "Unknown Title")
                user_data[user_id].append({
                    'book_id': book_id,
                    'title': title,
                    'read_at': read_at
                })

    # Sort each user's reading list by date. Records without a date use 0 as
    # their key, so they come first in the list.
    for books in user_data.values():
        books.sort(key=lambda x: x['read_at'] or 0)

    with open(output_file, 'w') as f:
        json.dump(user_data, f)

# Main processing
reviews_file = '/workspace/goodreads/goodreads_reviews_history_biography.json.gz'
books_file = '/workspace/goodreads/goodreads_books.json.gz'
output_file = '/workspace/history/processed_ba_user_sessions.json'

# Selection sizes. They are passed explicitly and reused in every message so
# the printed labels always match the values actually used (the messages
# previously said 4000 books / 1000 users while 2608 / 1120 were in effect).
TOP_N_BOOKS = 2608
TOP_N_USERS = 1120

print(f"Identifying top {TOP_N_BOOKS} books...")
top_books = get_top_books(reviews_file, top_n=TOP_N_BOOKS)

print("Analyzing user reading patterns...")
user_reading_counts = get_user_reading_counts(reviews_file, top_books)

print(f"Selecting top {TOP_N_USERS} users...")
top_users = get_top_users(user_reading_counts, top_n=TOP_N_USERS)

print("Processing and saving data...")
process_and_save_data(reviews_file, books_file, output_file, top_books, top_users)

# Print statistics
print(f"\nTop {TOP_N_BOOKS} books:")
print(f"Most reviewed book: {max(top_books.values())} reviews")
print(f"Least reviewed book among top {TOP_N_BOOKS}: {min(top_books.values())} reviews")

print(f"\nTop {TOP_N_USERS} users:")
print(f"User with most books read: {max(top_users.values())} books")
print(f"User with least books read among top {TOP_N_USERS}: {min(top_users.values())} books")

print(f"\nProcessed data saved to {output_file}")

Identifying top 4000 books...
Analyzing user reading patterns...
Selecting top 1000 users...
Processing and saving data...

Top 4000 books:
Most reviewed book: 11300 reviews
Least reviewed book among top 4000: 92 reviews

Top 1000 users:
User with most books read: 320 books
User with least books read among top 1000: 55 books

Processed data saved to /workspace/history/processed_ba_user_sessions.json


In [37]:
import json
import random
from collections import Counter
from datetime import datetime

def load_data(file_path):
    """Parse the JSON document stored at ``file_path`` and return it."""
    with open(file_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def analyze_data(data):
    """Derive per-user record counts, reading spans and title frequencies.

    Args:
        data: Mapping of user id -> list of records with ``title`` and
            ``read_at`` keys.

    Returns:
        tuple: ``(user_book_counts, reading_spans, books_counter)`` where
        ``user_book_counts`` maps user -> number of records,
        ``reading_spans`` maps user -> ``max(read_at) - min(read_at)`` over
        records with a truthy ``read_at`` (users with none are omitted), and
        ``books_counter`` is a ``Counter`` of titles across all users.
    """
    user_book_counts = {}
    reading_spans = {}
    books_counter = Counter()

    for user, books in data.items():
        user_book_counts[user] = len(books)

        # Only records that actually carry a date contribute to the span.
        known_dates = [entry['read_at'] for entry in books if entry['read_at']]
        if known_dates:
            reading_spans[user] = max(known_dates) - min(known_dates)

        for entry in books:
            books_counter[entry['title']] += 1

    return user_book_counts, reading_spans, books_counter

def print_statistics(user_book_counts, reading_spans, books_counter, data=None):
    """Print summary statistics plus a sample of one random user's history.

    Args:
        user_book_counts: Mapping of user id -> number of reading records.
        reading_spans: Mapping of user id -> reading span in seconds.
        books_counter: ``collections.Counter`` of book titles.
        data: Optional mapping of user id -> list of records with ``title``
            and ``read_at`` keys, used for the sample section. When omitted,
            the module-level ``data`` variable is used if it exists, which
            keeps existing three-argument calls working.

    Returns:
        None. All results are printed.
    """
    if data is None:
        # Backward compatibility: this function used to read the notebook-level
        # `data` variable directly instead of taking it as an argument.
        data = globals().get('data') or {}

    print(f"用户数: {len(user_book_counts)}")
    if user_book_counts:
        counts = sorted(user_book_counts.values())
        print(f"平均阅读量: {sum(counts) / len(counts):.2f}")
        # Element at index len // 2: the upper of the two middle values when
        # the number of users is even.
        print(f"中位数阅读量: {counts[len(counts) // 2]}")
        print(f"最小阅读量: {counts[0]}")
        print(f"最大阅读量: {counts[-1]}")

    if reading_spans:
        avg_span = sum(reading_spans.values()) / len(reading_spans) / (24 * 3600)  # seconds -> days
        print(f"\n平均阅读时间跨度: {avg_span:.2f} 天")
    else:
        # No user has a measurable span; avoid dividing by zero.
        print("\n平均阅读时间跨度: 无数据")

    print("\n最受欢迎的10本书:")
    for book, count in books_counter.most_common(10):
        print(f"{book}: {count} 次阅读")

    # Spot-check a randomly chosen user, if any are available.
    if not data:
        return
    sample_user = random.choice(list(data.keys()))
    print(f"\n随机用户 {sample_user} 的前5条阅读记录:")
    for book in data[sample_user][:5]:
        read_at = book['read_at']
        # read_at can be None (no parseable date); fromtimestamp(None) would raise.
        when = datetime.fromtimestamp(read_at) if read_at is not None else "未知"
        print(f"书名: {book['title']}, 阅读时间: {when}")

# Main flow: reload the per-user sessions file, compute the summaries, print them.
file_path = '/workspace/history/processed_ba_user_sessions.json'
# Keep the name `data`: print_statistics looks it up at module level for its
# random-user sample section when it is not handed the mapping explicitly.
data = load_data(file_path)
user_book_counts, reading_spans, books_counter = analyze_data(data)
print_statistics(user_book_counts, reading_spans, books_counter)

用户数: 1120
平均阅读量: 82.03
中位数阅读量: 72
最小阅读量: 55
最大阅读量: 320

平均阅读时间跨度: 4488.00 天

最受欢迎的10本书:
The Help: 557 次阅读
All the Light We Cannot See: 541 次阅读
The Book Thief: 537 次阅读
The Guernsey Literary and Potato Peel Pie Society: 462 次阅读
The Immortal Life of Henrietta Lacks: 421 次阅读
The Nightingale: 405 次阅读
Unbroken: A World War II Story of Survival, Resilience, and Redemption: 395 次阅读
The Light Between Oceans: 393 次阅读
Bossypants: 366 次阅读
Water for Elephants: 363 次阅读

随机用户 df2578d3aee06a554fd389c55dcf435a 的前5条阅读记录:
书名: Katherine, 阅读时间: 2011-03-18 07:00:00
书名: The Bonesetter's Daughter, 阅读时间: 2011-04-06 03:29:39
书名: The Pillars of the Earth (Kingsbridge, #1), 阅读时间: 2011-04-06 03:32:22
书名: The 19th Wife, 阅读时间: 2011-04-06 03:32:54
书名: The Russian Concubine (The Russian Concubine, #1), 阅读时间: 2011-04-07 00:46:26


In [38]:
import json

def load_data(file_path):
    """Read ``file_path`` and return the JSON document it contains."""
    with open(file_path, 'r') as source:
        document = json.load(source)
    return document

def remap_ids(data):
    """Replace user ids and book ids with dense integers starting at 0.

    Users are numbered in iteration order of ``data``. Books are numbered by
    TITLE in order of first appearance, so records that share a title share
    one new id regardless of their original ``book_id``.

    Args:
        data: Mapping of user id -> list of records with ``title`` and
            ``read_at`` keys.

    Returns:
        tuple: ``(remapped_data, user_id_map, book_title_map)`` where
        ``remapped_data`` maps new user id -> list of
        ``{'book_id', 'title', 'read_at'}`` records carrying the new book id,
        ``user_id_map`` maps old user id -> new id, and ``book_title_map``
        maps title -> new book id.
    """
    user_id_map = {}
    book_title_map = {}  # keyed by book title
    remapped_data = {}

    for user, books in data.items():
        # The next free id always equals the current size of the map.
        new_uid = user_id_map.setdefault(user, len(user_id_map))

        converted = []
        for book in books:
            title = book['title']
            new_bid = book_title_map.setdefault(title, len(book_title_map))
            converted.append({
                'book_id': new_bid,
                'title': title,
                'read_at': book['read_at']
            })

        remapped_data[new_uid] = converted

    # Debug output: the first five users and up to five records for each.
    print("\n调试信息：前5个用户和其书籍映射结果")
    for new_uid, converted in list(remapped_data.items())[:5]:
        print(f"新用户ID: {new_uid}, 阅读记录数: {len(converted)}")
        for record in converted[:5]:
            print(f"  新书籍ID: {record['book_id']}, 书名: {record['title']}, 阅读时间: {record['read_at']}")

    return remapped_data, user_id_map, book_title_map

def generate_id2name(book_title_map, output_file):
    """Write one ``<new_id>:: <title>`` line per book, ordered by new id.

    Args:
        book_title_map: Mapping of title -> new book id.
        output_file: Destination path; written as UTF-8 text.
    """
    ordered = list(book_title_map.items())
    ordered.sort(key=lambda pair: pair[1])
    rows = [f"{new_id}:: {title}\n" for title, new_id in ordered]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(rows)

# Main flow: remap ids in the processed sessions file and persist the results.
file_path = '/workspace/history/processed_ba_user_sessions.json'
data = load_data(file_path)

remapped_data, user_id_map, book_title_map = remap_ids(data)

# remapped_data is keyed by integer user ids; JSON object keys are always
# strings, so they are written (and later read back) as strings.
with open('/workspace/history/remapped_ya_user_sessions.json', 'w') as f:
    json.dump(remapped_data, f, indent=2)

# Title lookup table: one "<new_id>:: <title>" line per book.
generate_id2name(book_title_map, '/workspace/history/id2name.txt')

# Old user id -> new user id lookup table.
with open('/workspace/history/user_id_map.json', 'w') as f:
    json.dump(user_id_map, f, indent=2)

# Debug output: first five user id mappings.
print("\n调试信息：前5个用户ID映射")
for i, (old_id, new_id) in enumerate(user_id_map.items()):
    if i >= 5:
        break
    print(f"旧用户ID: {old_id}, 新用户ID: {new_id}")

# Debug output: first five title mappings.
print("\n调试信息：前5个书籍标题映射")
for i, (title, new_id) in enumerate(book_title_map.items()):
    if i >= 5:
        break
    print(f"书名: {title}, 新书籍ID: {new_id}")

print("处理完成，用户和书籍ID已更新，id2name.txt 文件已生成。")


调试信息：前5个用户和其书籍映射结果
新用户ID: 0, 阅读记录数: 90
  新书籍ID: 0, 书名: Affinity, 阅读时间: 1341371308.0
  新书籍ID: 1, 书名: Elizabeth Street, 阅读时间: 1342162800.0
  新书籍ID: 2, 书名: A Spy in the House (The Agency, #1), 阅读时间: 1351580400.0
  新书籍ID: 3, 书名: The Body at the Tower (The Agency, #2), 阅读时间: 1351839600.0
  新书籍ID: 4, 书名: The Traitor in the Tunnel (The Agency, #3), 阅读时间: 1352012400.0
新用户ID: 1, 阅读记录数: 55
  新书籍ID: 90, 书名: Atonement, 阅读时间: 1248850800.0
  新书籍ID: 91, 书名: Roll of Thunder, Hear My Cry (Logans, #4), 阅读时间: 1251788400.0
  新书籍ID: 92, 书名: Out of the Dust, 阅读时间: 1259740800.0
  新书籍ID: 93, 书名: The Guernsey Literary and Potato Peel Pie Society, 阅读时间: 1273042800.0
  新书籍ID: 94, 书名: Prayers for Sale, 阅读时间: 1273906800.0
新用户ID: 2, 阅读记录数: 60
  新书籍ID: 141, 书名: The Wedding (Lairds' Fiancées, #2), 阅读时间: 874306800.0
  新书籍ID: 142, 书名: sTORI Telling, 阅读时间: 1239260400.0
  新书籍ID: 143, 书名: A Reliable Wife, 阅读时间: 1246690800.0
  新书籍ID: 144, 书名: The Seance, 阅读时间: 1247554800.0
  新书籍ID: 145, 书名: To Tame a Highland Warrior (Hig

In [21]:
# import json

# # 读取数据
# def load_data(file_path):
#     with open(file_path, 'r') as f:
#         return json.load(f)

# # 重新映射用户ID和书籍ID
# def remap_ids(data):
#     user_id_map = {}
#     book_id_map = {}
#     new_user_id = 0
#     new_book_id = 0
    
#     remapped_data = {}

#     for user, books in data.items():
#         # 如果用户ID没有映射过，分配新的ID
#         if user not in user_id_map:
#             user_id_map[user] = new_user_id
#             new_user_id += 1
        
#         new_books = []
#         for book in books:
#             book_id = book['book_id']
#             title = book['title']
#             # 如果书籍ID没有映射过，分配新的ID
#             if book_id not in book_id_map:
#                 book_id_map[book_id] = (new_book_id, title)  # 保存book_id和title
#                 new_book_id += 1
            
#             # 替换书籍ID
#             new_books.append({
#                 'book_id': book_id_map[book_id][0],  # 使用新ID
#                 'title': title,
#                 'read_at': book['read_at']
#             })
        
#         # 使用新的用户ID和书籍ID
#         remapped_data[user_id_map[user]] = new_books

#     # 调试：打印前5个用户和书籍映射
#     print("\n调试信息：前5个用户和其书籍映射结果")
#     for i, (user, books) in enumerate(remapped_data.items()):
#         if i >= 5:
#             break
#         print(f"新用户ID: {user}, 阅读记录数: {len(books)}")
#         for book in books[:5]:  # 打印前5条阅读记录
#             print(f"  新书籍ID: {book['book_id']}, 书名: {book['title']}, 阅读时间: {book['read_at']}")
    
#     return remapped_data, user_id_map, book_id_map

# # 生成 id2name.txt 文件
# def generate_id2name(book_id_map, output_file):
#     # 按书籍ID排序
#     sorted_books = sorted(book_id_map.items(), key=lambda x: x[1][0])  # 按照新的ID排序
#     with open(output_file, 'w') as f:
#         for book_id, (new_id, title) in sorted_books:
#             f.write(f"{new_id}:: {title}\n")

# # 主流程
# file_path = '/workspace/goodreads/processed_ba_user_sessions.json'
# data = load_data(file_path)

# # 重新映射用户ID和书籍ID
# remapped_data, user_id_map, book_id_map = remap_ids(data)

# # 将处理后的数据保存回 JSON 文件
# with open('/workspace/goodreads/remapped_ya_user_sessions.json', 'w') as f:
#     json.dump(remapped_data, f, indent=2)

# # 生成 id2name.txt 文件
# generate_id2name(book_id_map, '/workspace/goodreads/id2name.txt')

# # 保存用户ID映射表
# with open('/workspace/goodreads/user_id_map.json', 'w') as f:
#     json.dump(user_id_map, f, indent=2)

# # 调试：检查映射表
# print("\n调试信息：前5个用户ID映射")
# for i, (old_id, new_id) in enumerate(user_id_map.items()):
#     if i >= 5:
#         break
#     print(f"旧用户ID: {old_id}, 新用户ID: {new_id}")

# print("\n调试信息：前5个书籍ID映射")
# for i, (old_id, (new_id, title)) in enumerate(book_id_map.items()):
#     if i >= 5:
#         break
#     print(f"旧书籍ID: {old_id}, 新书籍ID: {new_id}, 书名: {title}")

# print("处理完成，用户和书籍ID已更新，id2name.txt 文件已生成。")



调试信息：前5个用户和其书籍映射结果
新用户ID: 0, 阅读记录数: 69
  新书籍ID: 0, 书名: Affinity, 阅读时间: 1341371308.0
  新书籍ID: 1, 书名: A Spy in the House (The Agency, #1), 阅读时间: 1351580400.0
  新书籍ID: 2, 书名: The Body at the Tower (The Agency, #2), 阅读时间: 1351839600.0
  新书籍ID: 3, 书名: The Cater Street Hangman (Charlotte & Thomas Pitt, #1), 阅读时间: 1360170255.0
  新书籍ID: 4, 书名: The Apothecary's Daughter, 阅读时间: 1360742400.0
新用户ID: 1, 阅读记录数: 50
  新书籍ID: 69, 书名: Atonement, 阅读时间: 1248850800.0
  新书籍ID: 70, 书名: Roll of Thunder, Hear My Cry (Logans, #4), 阅读时间: 1251788400.0
  新书籍ID: 71, 书名: Out of the Dust, 阅读时间: 1259740800.0
  新书籍ID: 72, 书名: The Guernsey Literary and Potato Peel Pie Society, 阅读时间: 1273042800.0
  新书籍ID: 73, 书名: Prayers for Sale, 阅读时间: 1273906800.0
新用户ID: 2, 阅读记录数: 49
  新书籍ID: 116, 书名: The Wedding (Lairds' Fiancées, #2), 阅读时间: 874306800.0
  新书籍ID: 117, 书名: sTORI Telling, 阅读时间: 1239260400.0
  新书籍ID: 118, 书名: A Reliable Wife, 阅读时间: 1246690800.0
  新书籍ID: 119, 书名: To Tame a Highland Warrior (Highlander, #2), 阅读时间: 12567132

In [40]:
import json
import pandas as pd
import numpy as np

# Read the remapped sessions file.
def load_data(file_path):
    """Return the JSON document stored at ``file_path``."""
    with open(file_path, 'r') as stream:
        contents = json.load(stream)
    return contents

# Fit a history list to a fixed length: keep the most recent items, or append padding.
def pad_history(itemlist, length, pad_item):
    """Return a new list of exactly ``length`` items built from ``itemlist``.

    Args:
        itemlist: History of item ids, oldest first.
        length: Target length (non-negative).
        pad_item: Filler appended at the END when the history is too short.

    Returns:
        list: The last ``length`` items of ``itemlist`` when it is long
        enough, otherwise ``itemlist`` followed by padding. The input list is
        never modified.
    """
    overflow = len(itemlist) - length
    if overflow >= 0:
        # Slice from an explicit start index. The equivalent
        # ``itemlist[-length:]`` returns the WHOLE list when length == 0.
        return itemlist[overflow:]
    return itemlist + [pad_item] * (-overflow)

# Build training samples: every prefix of a user's history predicts the next book.
def generate_train_sequences(data, length=10, pad_item=2359):
    """Expand each user's ordered reading list into (history, next-book) rows.

    For every book after a user's first one, a row is emitted whose ``seq`` is
    the preceding history fitted to ``length`` via ``pad_history``, whose
    ``len_seq`` is the unpadded history length capped at ``length``, and whose
    ``next`` is that book's id. A user's first book has no history and
    produces no row, so ``len_seq`` is never 0.

    Args:
        data: Mapping of user id -> ordered list of records with a ``book_id`` key.
        length: Fixed history length per row.
        pad_item: Id used to pad short histories.

    Returns:
        pandas.DataFrame: Columns ``seq``, ``len_seq`` and ``next``.
    """
    state, len_state, action = [], [], []

    for books in data.values():
        seen = []
        for book in books:
            target = book['book_id']
            if seen:  # skip the first book: it has no history to learn from
                len_state.append(min(len(seen), length))
                state.append(pad_history(list(seen), length, pad_item))
                action.append(target)
            seen.append(target)

    train_df = pd.DataFrame({'seq': state, 'len_seq': len_state, 'next': action})
    train_df.reset_index(drop=True, inplace=True)

    # Debug output.
    print(f"训练集生成完成，总记录数: {len(train_df)}")
    print(f"前5条训练集记录: \n{train_df.head()}")

    return train_df

# # 生成训练集的函数，并过滤掉 len_seq = 0 的记录，且打乱顺序
# def generate_train_sequences(data, length=10, pad_item=2359):
#     state, len_state, action = [], [], []
    
#     for user_id, books in data.items():
#         history = []
#         for index, book in enumerate(books):
#             s = list(history)  # 复制当前的历史记录
#             if len(history) > 0:  # 只生成有效的历史序列
#                 len_state.append(len(s) if len(s) < length else length)  # 保存历史序列的长度
#                 s = pad_history(s, length, pad_item)  # 填充或截取历史序列

#                 state.append(s)
#                 action.append(book['book_id'])  # 预测的下一本书

#             # 更新历史记录
#             history.append(book['book_id'])
    
#     # 创建 DataFrame 并确保索引从 0 开始
#     train_df = pd.DataFrame({'seq': state, 'len_seq': len_state, 'next': action})
#     train_df.reset_index(drop=True, inplace=True)
    
#     # 打乱数据
#     train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

#     # 打印一些调试信息
#     print(f"训练集生成完成，总记录数: {len(train_df)}")
#     print(f"前5条训练集记录: \n{train_df.head()}")

#     return train_df

# Build validation/test samples: one row per user, predicting their final book.
def generate_test_sequences(data, length=10, pad_item=2359):
    """Create a single (history, last-book) row for each user with 2+ records.

    ``seq`` is everything before the user's last book, fitted to ``length``
    via ``pad_history``; ``len_seq`` is the unpadded history length capped at
    ``length``; ``next`` is the last book's id. Users with fewer than two
    records produce no row, so ``len_seq`` is never 0.

    Args:
        data: Mapping of user id -> ordered list of records with a ``book_id`` key.
        length: Fixed history length per row.
        pad_item: Id used to pad short histories.

    Returns:
        pandas.DataFrame: Columns ``seq``, ``len_seq`` and ``next``.
    """
    state, len_state, action = [], [], []

    for books in data.values():
        history = [book['book_id'] for book in books]

        # Everything but the final book; empty when the user has 0 or 1 records.
        context = history[:-1]
        if not context:
            continue

        len_state.append(min(len(context), length))
        state.append(pad_history(context, length, pad_item))
        action.append(history[-1])  # the held-out target

    test_df = pd.DataFrame({'seq': state, 'len_seq': len_state, 'next': action})
    test_df.reset_index(drop=True, inplace=True)

    # Debug output.
    print(f"验证/测试集生成完成，总记录数: {len(test_df)}")
    print(f"前5条验证/测试集记录: \n{test_df.head()}")

    return test_df

# Sanity-check a generated dataset.
def check_data(df, name):
    """Print a short summary of ``df`` and assert every row has a non-empty history.

    Args:
        df: DataFrame with a ``len_seq`` column.
        name: Label used in the printed messages.

    Raises:
        AssertionError: If the smallest ``len_seq`` is not greater than 0.
    """
    print(f"{name} 集合长度: {len(df)}")
    print(f"{name} 集合中前 3 条记录:\n{df.head(3)}")
    shortest = df['len_seq'].min()
    print(f"{name} 中 len_seq 的最小值: {shortest}")
    assert shortest > 0, f"{name} 集合中有 len_seq = 0 的记录！"

# Main flow: split users into train/validation/test and build one dataset per split.
file_path = '/workspace/history/remapped_ya_user_sessions.json'
data = load_data(file_path)

# Split users 8:1:1 into train, validation and test. The split is positional:
# users are taken in the key order of `data`, with no shuffling.
total_users = list(data.keys())
# NOTE(review): `fractions` is not read by the split below, which hard-codes
# 0.8 and 0.9 as its cut points.
fractions = [0.8, 0.1, 0.1]
train_users, val_users, test_users = np.split(total_users, [int(0.8*len(total_users)), int(0.9*len(total_users))])

# Select each split's users from the full mapping.
train_data = {user: data[user] for user in train_users}
val_data = {user: data[user] for user in val_users}
test_data = {user: data[user] for user in test_users}

# Build and save the training set.
train_df = generate_train_sequences(train_data)
train_df.to_pickle('/workspace/history/train_data.df')

# Build and save the validation set.
val_df = generate_test_sequences(val_data)
val_df.to_pickle('/workspace/history/val_data.df')

# Build and save the test set.
test_df = generate_test_sequences(test_data)
test_df.to_pickle('/workspace/history/test_data.df')

# Sanity-check the generated DataFrames.
check_data(train_df, "训练集")
check_data(val_df, "验证集")
check_data(test_df, "测试集")

print("数据集生成并保存完成。")


训练集生成完成，总记录数: 72697
前5条训练集记录: 
                                                 seq  len_seq  next
0  [1666, 895, 728, 283, 1799, 263, 1237, 1111, 5...       10  1394
1  [375, 1362, 2009, 561, 1560, 1595, 1378, 1663,...       10   239
2  [1484, 1318, 319, 736, 704, 99, 1019, 913, 136...       10  1988
3  [979, 25, 29, 1739, 2359, 2359, 2359, 2359, 23...        4   658
4  [1049, 1172, 907, 1061, 1594, 1033, 151, 489, ...       10   172
验证/测试集生成完成，总记录数: 112
前5条验证/测试集记录: 
                                                 seq  len_seq  next
0  [184, 135, 1432, 1475, 1120, 478, 1286, 193, 2...       10  1434
1  [932, 2056, 2101, 1479, 1643, 1477, 93, 1668, ...       10    73
2  [59, 224, 32, 643, 1851, 506, 174, 518, 807, 519]       10  1801
3  [2172, 799, 800, 806, 1130, 808, 1784, 1185, 1...       10    83
4  [475, 128, 530, 59, 1135, 607, 454, 951, 136, ...       10   224
验证/测试集生成完成，总记录数: 112
前5条验证/测试集记录: 
                                                 seq  len_seq  next
0  [2137, 208, 

In [39]:
import re

def clean_id2name_remove_brackets(file_path, output_path):
    """Strip parenthesised text from the titles of an ``id::title`` file.

    Each input line is split on its FIRST ``::`` into an id and a title.
    Parenthesised segments (with any whitespace before them) are removed from
    the title, the result is trimmed, and ``<id>::<title>`` is written out.
    Lines without ``::`` or with an empty title are skipped.

    Args:
        file_path: Input text file (UTF-8), one ``id::title`` entry per line.
        output_path: Destination text file (UTF-8).
    """
    cleaned_data = []

    # Matches a bracketed segment together with the whitespace before it.
    pattern = re.compile(r'\s*\(.*?\)')

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Partition on the first '::' only. Splitting on every '::' and
            # requiring exactly two parts silently dropped any title that
            # itself contains '::'.
            book_id, separator, book_name = line.strip().partition('::')
            if not separator or not book_name:
                continue
            book_name_cleaned = pattern.sub('', book_name).strip()
            cleaned_data.append(f"{book_id}::{book_name_cleaned}")

    with open(output_path, 'w', encoding='utf-8') as f:
        for line in cleaned_data:
            f.write(f"{line}\n")

    print(f"清洗完成，已保存到 {output_path}")
    print(f"清洗后条目数: {len(cleaned_data)}")

# Run the bracket-stripping pass: read id2name.txt, write id2name_clean.txt.
file_path = '/workspace/history/id2name.txt'
output_path = '/workspace/history/id2name_clean.txt'

clean_id2name_remove_brackets(file_path, output_path)


清洗完成，已保存到 /workspace/history/id2name_clean.txt
清洗后条目数: 2359


In [41]:
import re
import pandas as pd

# Input: the bracket-stripped title file.
file_path = '/workspace/history/id2name_clean.txt'

# Read every line of the file.
with open(file_path, 'r', encoding='utf-8') as f:
    titles = f.readlines()

# strip() removes the trailing newline and any other leading/trailing whitespace.
titles = [title.strip() for title in titles]

# Drop everything after the first colon inside the title part of an "id::title" line.
def clean_title(title):
    """Shorten the title of an ``id::title`` line to the text before its first colon.

    Args:
        title: One line in ``<id>::<title>`` form.

    Returns:
        str: ``<id>::<short title>``, where the short title is the text before
        the title's first ``:``, trimmed. A line without ``::`` is returned
        unchanged instead of raising IndexError.
    """
    book_id, separator, name = title.partition("::")
    if not separator:
        # Not an id::title line (for example a blank line): nothing to clean.
        return title
    # Cut at the first colon, which removes a subtitle such as ": A Story of ...".
    cleaned_title = re.sub(r':.*', '', name).strip()
    return f"{book_id}::{cleaned_title}"

# Clean every title.
cleaned_titles = [clean_title(title) for title in titles]

# Wrap the result in a DataFrame for display.
cleaned_titles_df = pd.DataFrame(cleaned_titles, columns=["Cleaned Titles"])

# Save the cleaned result. NOTE(review): this path is id2name.txt, the same
# file the bracket-stripping cell reads as its input, so the original titles
# are overwritten. The lines are joined with '\n', so no trailing newline is written.
output_file_path = '/workspace/history/id2name.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_titles))

# Display the cleaned DataFrame (last expression of the cell).
cleaned_titles_df


Unnamed: 0,Cleaned Titles
0,0::Affinity
1,1::Elizabeth Street
2,2::A Spy in the House
3,3::The Body at the Tower
4,4::The Traitor in the Tunnel
...,...
2354,2354::Nikola Tesla
2355,2355::When Nietzsche Wept
2356,2356::This Blinding Absence of Light
2357,2357::Assata
