In [1]:
import json
import gzip
from collections import defaultdict

# 读取 Goodreads 子类数据
def load_reviews(file_path):
    """Load every record from a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Returns:
        list: The decoded JSON objects, in file order.

    Raises:
        OSError: If the file cannot be opened or read as gzip.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines instead of failing inside json.loads.
            if not line.strip():
                continue
            reviews.append(json.loads(line))
    return reviews

# 统计基本信息
def analyze_reviews(reviews):
    """Print review totals and per-user / per-book review-count statistics.

    Args:
        reviews: Sequence of dicts, each with ``'user_id'`` and ``'book_id'`` keys.

    Returns:
        None. All results are written to stdout. When ``reviews`` is empty only
        the three totals are printed, because the averages and min/max are
        undefined for an empty collection.
    """
    user_count = defaultdict(int)
    book_count = defaultdict(int)

    # Tally how many reviews each user wrote and each book received.
    for review in reviews:
        user_count[review['user_id']] += 1
        book_count[review['book_id']] += 1

    print(f"总评论数: {len(reviews)}")
    print(f"总用户数: {len(user_count)}")
    print(f"总书籍数: {len(book_count)}")

    # Without reviews the divisions and min()/max() below would raise.
    if not user_count:
        return

    user_review_counts = list(user_count.values())
    book_review_counts = list(book_count.values())

    print(f"\n用户评论数分析:")
    print(f"平均每个用户的评论数: {sum(user_review_counts) / len(user_review_counts):.2f}")
    print(f"最少评论数: {min(user_review_counts)}, 最多评论数: {max(user_review_counts)}")

    print(f"\n书籍评论数分析:")
    print(f"平均每本书的评论数: {sum(book_review_counts) / len(book_review_counts):.2f}")
    print(f"最少评论数: {min(book_review_counts)}, 最多评论数: {max(book_review_counts)}")

# Main flow
file_path = '/workspace/goodreads/goodreads_reviews_young_adult.json.gz'

print("加载数据中...")
# load_reviews returns every record in one list, so the whole file is held in memory.
reviews = load_reviews(file_path)

print("分析数据中...")
# Prints the totals and the per-user / per-book review-count statistics.
analyze_reviews(reviews)


加载数据中...
分析数据中...
总评论数: 2389900
总用户数: 209152
总书籍数: 93267

用户评论数分析:
平均每个用户的评论数: 11.43
最少评论数: 1, 最多评论数: 2438

书籍评论数分析:
平均每本书的评论数: 25.62
最少评论数: 1, 最多评论数: 20756


In [4]:
import json
import gzip

# 读取 Goodreads 子类数据的一个 JSON 对象
def print_first_review(file_path):
    """Pretty-print the first JSON object of a gzip-compressed JSON Lines file.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Raises:
        json.JSONDecodeError: If the first line is empty or not valid JSON.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        first_line = f.readline()  # only the first record is read
    first_review = json.loads(first_line)
    print(json.dumps(first_review, indent=2))  # indented for readability

# Main flow
file_path = '/workspace/goodreads/goodreads_reviews_young_adult.json.gz'

# Print the first JSON object to inspect which fields a record carries
print("打印第一个 JSON 对象中的特征:")
print_first_review(file_path)


打印第一个 JSON 对象中的特征:
{
  "user_id": "8842281e1d1347389f2ab93d60773d4d",
  "book_id": "2767052",
  "review_id": "248c011811e945eca861b5c31a549291",
  "rating": 5,
  "review_text": "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama 

In [2]:
import json
import gzip
from collections import defaultdict

# 读取 Goodreads 子类数据
def load_reviews(file_path):
    """Read a gzip-compressed JSON Lines file into a list of dicts.

    Args:
        file_path: Path to a gzip file holding one JSON object per line.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read as gzip.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # UTF-8 is requested explicitly so decoding does not depend on the locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # Blank lines are skipped; json.loads would reject them.
        return [json.loads(line) for line in f if line.strip()]

# 选出最受欢迎的前4000本书
def select_top_books(reviews, top_n=4000):
    """Rank books by number of reviews and keep the ``top_n`` most reviewed.

    Args:
        reviews: Iterable of dicts carrying a ``'book_id'`` key.
        top_n: How many of the most-reviewed books to keep.

    Returns:
        tuple: ``(top_books, top_book_ids)`` where ``top_books`` is a list of
        ``(book_id, review_count)`` pairs in descending count order and
        ``top_book_ids`` is the set of those book ids.
    """
    tally = defaultdict(int)
    for entry in reviews:
        tally[entry['book_id']] += 1

    # Stable descending sort: books with equal counts keep first-seen order.
    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_books = ranked[:top_n]

    top_book_ids = {bid for bid, _ in top_books}
    return top_books, top_book_ids

# 统计每个用户在前4000本书中的阅读情况
def analyze_user_reading(reviews, top_book_ids):
    """Count, per user, the reviews whose book is in ``top_book_ids``.

    Args:
        reviews: Iterable of dicts with ``'user_id'`` and ``'book_id'`` keys.
        top_book_ids: Container of book ids to count against.

    Returns:
        defaultdict: Maps user id to the number of matching reviews. Users with
        no matching review do not appear.
    """
    hits = defaultdict(int)
    for entry in reviews:
        reader, volume = entry['user_id'], entry['book_id']
        if volume not in top_book_ids:
            continue
        hits[reader] += 1
    return hits

# Main flow
file_path = '/workspace/goodreads/goodreads_reviews_young_adult.json.gz'

print("加载数据中...")
reviews = load_reviews(file_path)

# Pick the 4000 most-reviewed books
top_books, top_book_ids = select_top_books(reviews, top_n=4000)

# Show the most-reviewed books together with their review counts
print(f"最受欢迎的 4000 本书的评论次数分布:")
for book_id, count in top_books[:10]:  # only the first 10 books are printed
    print(f"书籍ID: {book_id}, 评论次数: {count}")

# Count, per user, how many of their reviews fall on those 4000 books
user_top_book_count = analyze_user_reading(reviews, top_book_ids)

# Show a sample of the per-user counts
print(f"\n用户在前4000本书中的阅读情况:")
for user_id, count in list(user_top_book_count.items())[:10]:  # only the first 10 users
    print(f"用户ID: {user_id}, 阅读涉及的4000本书中的数量: {count}")


加载数据中...
最受欢迎的 4000 本书的评论次数分布:
书籍ID: 11870085, 评论次数: 20756
书籍ID: 2767052, 评论次数: 18617
书籍ID: 7260188, 评论次数: 13536
书籍ID: 6148028, 评论次数: 11904
书籍ID: 13335037, 评论次数: 10743
书籍ID: 41865, 评论次数: 10535
书籍ID: 15745753, 评论次数: 9590
书籍ID: 11235712, 评论次数: 9585
书籍ID: 9460487, 评论次数: 9557
书籍ID: 11735983, 评论次数: 9207

用户在前4000本书中的阅读情况:
用户ID: 8842281e1d1347389f2ab93d60773d4d, 阅读涉及的4000本书中的数量: 1
用户ID: 7504b2aee1ecb5b2872d3da381c6c91e, 阅读涉及的4000本书中的数量: 1
用户ID: f8a89075dc6de14857561522e729f82c, 阅读涉及的4000本书中的数量: 1
用户ID: 704eb93a316aff687a93d5215882eb21, 阅读涉及的4000本书中的数量: 3
用户ID: 012515e5802b2e0f42915118c90fa04b, 阅读涉及的4000本书中的数量: 31
用户ID: f4d16ea4ac59af59d257631398af39f4, 阅读涉及的4000本书中的数量: 3
用户ID: 01ec1a320ffded6b2dd47833f2c8e4fb, 阅读涉及的4000本书中的数量: 19
用户ID: 4b3636a043e5c99fa27ac897ccfa1151, 阅读涉及的4000本书中的数量: 8
用户ID: 903d4b859e86a1dd6d7640849cc7067c, 阅读涉及的4000本书中的数量: 1
用户ID: afc070543f19028dc7e7f084a0079f72, 阅读涉及的4000本书中的数量: 2


In [3]:
# 按照用户阅读的热门书籍数量排序，并选出前 1000 个用户
def get_top_n_users(user_top_book_count, top_n=1000):
    """Return the ``top_n`` users with the highest counts.

    Args:
        user_top_book_count: Mapping of user id to a count.
        top_n: Maximum number of users to return.

    Returns:
        list: ``(user_id, count)`` pairs in descending count order. Users with
        equal counts keep the mapping's iteration order.
    """
    ranking = list(user_top_book_count.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    return ranking[:top_n]

# 显示读最多和读最少的用户的阅读量
def display_min_max_user_reading(top_users):
    """Print the first and last entries of a descending-sorted user list.

    Args:
        top_users: List of ``(user_id, count)`` pairs. The first element is
            reported as the heaviest reader and the last as the lightest, so
            the list is expected to be sorted by count in descending order.

    Returns:
        None. Output goes to stdout; an empty list prints a short notice
        instead of raising ``IndexError``.
    """
    if not top_users:
        print("没有可显示的用户。")
        return

    max_read_user = top_users[0]
    min_read_user = top_users[-1]

    print(f"读最多书的用户ID: {max_read_user[0]}, 阅读的4000本热门书中的数量: {max_read_user[1]}")
    print(f"读最少书的用户ID: {min_read_user[0]}, 阅读的4000本热门书中的数量: {min_read_user[1]}")

# Keep the top 1000 users
top_n = 1000
# NOTE: user_top_book_count is not defined in this cell; it is produced by the
# cell that calls analyze_user_reading, which must have been run first.
top_users = get_top_n_users(user_top_book_count, top_n=top_n)

# Show the counts of the users at both ends of the selection
display_min_max_user_reading(top_users)

# Print a sample of the selected users
print(f"\n前 {top_n} 用户在4000本热门书中的阅读情况:")
for user_id, count in top_users[:10]:  # only the first 10 users are shown
    print(f"用户ID: {user_id}, 阅读涉及的4000本书中的数量: {count}")


读最多书的用户ID: aca760854b57ce2ec981df32e46dc96c, 阅读的4000本热门书中的数量: 1126
读最少书的用户ID: 7b4166acdefeb4adb7d12d9c1645c48f, 阅读的4000本热门书中的数量: 142

前 1000 用户在4000本热门书中的阅读情况:
用户ID: aca760854b57ce2ec981df32e46dc96c, 阅读涉及的4000本书中的数量: 1126
用户ID: 288dc8c9871098c8a1b680db829275b4, 阅读涉及的4000本书中的数量: 817
用户ID: aed35dbc626957174ebedf3c555b63d0, 阅读涉及的4000本书中的数量: 770
用户ID: d321b1bcf294bca33510816afd898eb3, 阅读涉及的4000本书中的数量: 748
用户ID: af9864c9e69963abb963fe2c90dd6f09, 阅读涉及的4000本书中的数量: 739
用户ID: 63eb5a9ea6fbce905e96dadf97e60c93, 阅读涉及的4000本书中的数量: 685
用户ID: 19ff136f47089904d689e69e36c991d0, 阅读涉及的4000本书中的数量: 684
用户ID: 667b94d4c7e0b014bb6ab3636999e712, 阅读涉及的4000本书中的数量: 658
用户ID: 0d344a261de9ab42e62cf9b3b7c52cc4, 阅读涉及的4000本书中的数量: 649
用户ID: 884719ebf7dbd2977768e179358f6758, 阅读涉及的4000本书中的数量: 646


In [16]:
import json
import gzip
from collections import Counter, defaultdict
from datetime import datetime
import heapq

def parse_date(date_string):
    """Convert a date string into a POSIX timestamp.

    Args:
        date_string: Text in the form ``"%a %b %d %H:%M:%S %z %Y"``, or a falsy
            value (``None`` / empty string).

    Returns:
        float or None: Seconds since the epoch, or ``None`` when the input is
        falsy or does not match the expected format.
    """
    if date_string:
        try:
            moment = datetime.strptime(date_string, "%a %b %d %H:%M:%S %z %Y")
        except ValueError:
            # Unparseable text is reported as "no date", not as an error.
            return None
        return moment.timestamp()
    return None

def get_top_books(file_path, top_n=4000):
    """Count reviews per book in a gzip JSON Lines file and keep the top ones.

    Args:
        file_path: Path to a gzip file with one JSON review per line; each
            review must carry a ``'book_id'`` key.
        top_n: Number of most-reviewed books to keep.

    Returns:
        dict: Maps book id to review count, ordered from most to least reviewed.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        book_counts = Counter(json.loads(line)['book_id'] for line in f)
    return dict(book_counts.most_common(top_n))

def get_user_reading_counts(file_path, top_books):
    """Count, per user, the reviews whose book is in ``top_books``.

    Args:
        file_path: Path to a gzip file with one JSON review per line; each
            review must carry ``'book_id'`` and ``'user_id'`` keys.
        top_books: Container of book ids; only membership is tested.

    Returns:
        defaultdict: Maps user id to the number of matching reviews.
    """
    user_reading_counts = defaultdict(int)
    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            review = json.loads(line)
            if review['book_id'] in top_books:
                user_reading_counts[review['user_id']] += 1
    return user_reading_counts

def get_top_users(user_reading_counts, top_n=1000):
    """Keep the ``top_n`` users with the highest counts.

    Args:
        user_reading_counts: Mapping of user id to a count.
        top_n: Maximum number of users to keep.

    Returns:
        dict: Maps user id to count, ordered from highest to lowest count.
    """
    ranking = list(user_reading_counts.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    return dict(ranking[:top_n])

def load_book_titles(file_path):
    """Build a book-id to title lookup from a gzip JSON Lines file.

    Args:
        file_path: Path to a gzip file with one JSON book record per line; each
            record must carry ``'book_id'`` and ``'title'`` keys.

    Returns:
        dict: Maps book id to title. If an id occurs more than once, the last
        record wins.
    """
    book_titles = {}
    # Titles may hold non-ASCII text, so decode as UTF-8 instead of the locale default.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            book = json.loads(line)
            book_titles[book['book_id']] = book['title']
    return book_titles

def process_and_save_data(reviews_file, books_file, output_file, top_books, top_users):
    """Collect per-user reading lists for selected users and books, and save them.

    Args:
        reviews_file: Gzip JSON Lines file of reviews (``'user_id'``,
            ``'book_id'`` and optionally ``'read_at'`` / ``'date_added'``).
        books_file: Gzip JSON Lines file passed to ``load_book_titles``.
        output_file: Path of the JSON file to write.
        top_books: Container of book ids to keep (membership test only).
        top_users: Container of user ids to keep (membership test only).

    Returns:
        None. Writes a JSON object mapping user id to a list of
        ``{'book_id', 'title', 'read_at'}`` records sorted by ``read_at``.
    """
    book_titles = load_book_titles(books_file)
    user_data = defaultdict(list)

    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    with gzip.open(reviews_file, 'rt', encoding='utf-8') as f:
        for line in f:
            review = json.loads(line)
            user_id = review['user_id']
            book_id = review['book_id']

            if user_id not in top_users or book_id not in top_books:
                continue

            # Prefer the read date; fall back to the date the review was added.
            read_at = parse_date(review.get('read_at')) or parse_date(review.get('date_added'))
            user_data[user_id].append({
                'book_id': book_id,
                'title': book_titles.get(book_id, "Unknown Title"),
                'read_at': read_at
            })

    # Sort each user's reading list by date; records without a date are keyed
    # as 0 and therefore sort before every dated record.
    for books in user_data.values():
        books.sort(key=lambda x: x['read_at'] or 0)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(user_data, f)

# Main processing
# NOTE: the reviews file is opened and scanned three times below, once each by
# get_top_books, get_user_reading_counts and process_and_save_data.
reviews_file = '/workspace/goodreads/goodreads_reviews_young_adult.json.gz'
books_file = '/workspace/goodreads/goodreads_books.json.gz'
output_file = 'processed_ya_user_sessions.json'

print("Identifying top 4000 books...")
# Uses the default top_n=4000 of get_top_books.
top_books = get_top_books(reviews_file)

print("Analyzing user reading patterns...")
user_reading_counts = get_user_reading_counts(reviews_file, top_books)

print("Selecting top 1000 users...")
# Uses the default top_n=1000 of get_top_users.
top_users = get_top_users(user_reading_counts)

print("Processing and saving data...")
process_and_save_data(reviews_file, books_file, output_file, top_books, top_users)

# Print statistics
print(f"\nTop 4000 books:")
print(f"Most reviewed book: {max(top_books.values())} reviews")
print(f"Least reviewed book among top 4000: {min(top_books.values())} reviews")

print(f"\nTop 1000 users:")
print(f"User with most books read: {max(top_users.values())} books")
print(f"User with least books read among top 1000: {min(top_users.values())} books")

print(f"\nProcessed data saved to {output_file}")

Identifying top 4000 books...
Analyzing user reading patterns...
Selecting top 1000 users...
Processing and saving data...

Top 4000 books:
Most reviewed book: 20756 reviews
Least reviewed book among top 4000: 82 reviews

Top 1000 users:
User with most books read: 1126 books
User with least books read among top 1000: 142 books

Processed data saved to processed_ya_user_sessions.json


In [17]:
import json
import random
from collections import Counter
from datetime import datetime

def load_data(file_path):
    """Read a JSON file and return the decoded object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as handle:
        payload = json.load(handle)
    return payload

def analyze_data(data):
    """Summarise per-user reading lists.

    Args:
        data: Mapping of user id to a list of records with ``'title'`` and
            ``'read_at'`` keys.

    Returns:
        tuple: ``(user_book_counts, reading_spans, books_counter)`` where
        ``user_book_counts`` maps user id to record count, ``reading_spans``
        maps user id to ``max(read_at) - min(read_at)`` over that user's truthy
        ``read_at`` values (users with none are omitted), and ``books_counter``
        is a ``Counter`` of titles.
    """
    user_book_counts = {}
    reading_spans = {}
    books_counter = Counter()

    for user, history in data.items():
        user_book_counts[user] = len(history)

        # Falsy read_at values (None or 0) are ignored when measuring the span.
        stamps = [entry['read_at'] for entry in history if entry['read_at']]
        if stamps:
            reading_spans[user] = max(stamps) - min(stamps)

        for entry in history:
            books_counter[entry['title']] += 1

    return user_book_counts, reading_spans, books_counter

def print_statistics(user_book_counts, reading_spans, books_counter):
    """Print summary statistics and a random user's first five records.

    Args:
        user_book_counts: Mapping of user id to number of records.
        reading_spans: Mapping of user id to a time span in seconds.
        books_counter: ``Counter`` of titles.

    Note:
        The random sample is drawn from the module-level ``data`` mapping,
        which is not a parameter of this function.
    """
    if not user_book_counts:
        # Every statistic below would divide by zero or index an empty list.
        print("用户数: 0")
        return

    counts = sorted(user_book_counts.values())
    print(f"用户数: {len(counts)}")
    print(f"平均阅读量: {sum(counts) / len(counts):.2f}")
    # Upper-middle element; for an even number of users this is not the averaged median.
    print(f"中位数阅读量: {counts[len(counts)//2]}")
    print(f"最小阅读量: {counts[0]}")
    print(f"最大阅读量: {counts[-1]}")

    if reading_spans:
        avg_span = sum(reading_spans.values()) / len(reading_spans) / (24 * 3600)  # seconds -> days
        print(f"\n平均阅读时间跨度: {avg_span:.2f} 天")
    else:
        # No user had a usable date, so an average span cannot be computed.
        print("\n平均阅读时间跨度: 无可用数据")

    print("\n最受欢迎的10本书:")
    for book, count in books_counter.most_common(10):
        print(f"{book}: {count} 次阅读")

    # Spot-check one randomly chosen user.
    sample_user = random.choice(list(data.keys()))
    print(f"\n随机用户 {sample_user} 的前5条阅读记录:")
    for book in data[sample_user][:5]:
        read_at = book['read_at']
        # read_at may be null in the saved JSON; fromtimestamp(None) would raise.
        when = datetime.fromtimestamp(read_at) if read_at is not None else "未知"
        print(f"书名: {book['title']}, 阅读时间: {when}")

# Main processing flow
file_path = 'processed_ya_user_sessions.json'
data = load_data(file_path)
user_book_counts, reading_spans, books_counter = analyze_data(data)
# print_statistics also reads the module-level `data` for its random sample.
print_statistics(user_book_counts, reading_spans, books_counter)

用户数: 1000
平均阅读量: 220.56
中位数阅读量: 188
最小阅读量: 142
最大阅读量: 1126

平均阅读时间跨度: 2607.09 天

最受欢迎的10本书:
Cinder (The Lunar Chronicles, #1): 661 次阅读
Divergent (Divergent, #1): 654 次阅读
The Fault in Our Stars: 603 次阅读
The Raven Boys (The Raven Cycle, #1): 558 次阅读
Throne of Glass (Throne of Glass, #1): 556 次阅读
Scarlet (The Lunar Chronicles, #2): 546 次阅读
Insurgent (Divergent, #2): 535 次阅读
Anna and the French Kiss (Anna and the French Kiss, #1): 531 次阅读
Fangirl: 525 次阅读
Shadow and Bone (The Grisha, #1): 523 次阅读

随机用户 711bafffa3a80f5829c55a47360c7864 的前5条阅读记录:
书名: Hexbound (Dark Elite, #2), 阅读时间: 2011-05-04 07:00:00
书名: The DUFF: Designated Ugly Fat Friend, 阅读时间: 2011-05-10 07:00:00
书名: The Summer I Turned Pretty (Summer, #1), 阅读时间: 2011-05-12 07:00:00
书名: Possess, 阅读时间: 2011-11-04 07:00:00
书名: Silence (Hush, Hush, #3), 阅读时间: 2011-11-17 08:00:00


In [24]:
import json
import random
from collections import Counter
from datetime import datetime

def load_data(file_path):
    """Decode the JSON document stored at ``file_path``.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as source:
        decoded = json.load(source)
    return decoded

def analyze_data(data):
    """Summarise per-user reading lists, including the set of distinct books.

    Args:
        data: Mapping of user id to a list of records with ``'book_id'``,
            ``'title'`` and ``'read_at'`` keys.

    Returns:
        tuple: ``(user_book_counts, reading_spans, books_counter, unique_books)``.
        ``user_book_counts`` maps user id to record count; ``reading_spans``
        maps user id to ``max(read_at) - min(read_at)`` over truthy ``read_at``
        values (users with none are omitted); ``books_counter`` counts titles;
        ``unique_books`` is the set of all book ids seen.
    """
    user_book_counts = {}
    reading_spans = {}
    books_counter = Counter()
    unique_books = set()

    for user, history in data.items():
        user_book_counts[user] = len(history)

        # Falsy read_at values (None or 0) are ignored when measuring the span.
        stamps = [entry['read_at'] for entry in history if entry['read_at']]
        if stamps:
            reading_spans[user] = max(stamps) - min(stamps)

        for entry in history:
            books_counter[entry['title']] += 1
            unique_books.add(entry['book_id'])

    return user_book_counts, reading_spans, books_counter, unique_books

def print_statistics(user_book_counts, reading_spans, books_counter, unique_books):
    """Print summary statistics and a random user's first five records.

    Args:
        user_book_counts: Mapping of user id to number of records.
        reading_spans: Mapping of user id to a time span in seconds.
        books_counter: ``Counter`` of titles.
        unique_books: Collection of distinct book ids; only its size is printed.

    Note:
        The random sample is drawn from the module-level ``data`` mapping,
        which is not a parameter of this function.
    """
    if not user_book_counts:
        # Every statistic below would divide by zero or index an empty list.
        print("用户数: 0")
        return

    counts = sorted(user_book_counts.values())
    print(f"用户数: {len(counts)}")
    print(f"平均阅读量: {sum(counts) / len(counts):.2f}")
    # Upper-middle element; for an even number of users this is not the averaged median.
    print(f"中位数阅读量: {counts[len(counts)//2]}")
    print(f"最小阅读量: {counts[0]}")
    print(f"最大阅读量: {counts[-1]}")

    if reading_spans:
        avg_span = sum(reading_spans.values()) / len(reading_spans) / (24 * 3600)  # seconds -> days
        print(f"\n平均阅读时间跨度: {avg_span:.2f} 天")
    else:
        # No user had a usable date, so an average span cannot be computed.
        print("\n平均阅读时间跨度: 无可用数据")

    print(f"\nunique book 总数: {len(unique_books)}")

    print("\n最受欢迎的10本书:")
    for book, count in books_counter.most_common(10):
        print(f"{book}: {count} 次阅读")

    # Spot-check one randomly chosen user.
    sample_user = random.choice(list(data.keys()))
    print(f"\n随机用户 {sample_user} 的前5条阅读记录:")
    for book in data[sample_user][:5]:
        read_at = book['read_at']
        # read_at may be null in the saved JSON; fromtimestamp(None) would raise.
        when = datetime.fromtimestamp(read_at) if read_at is not None else "未知"
        print(f"书名: {book['title']}, 阅读时间: {when}")

# Main processing flow
file_path = 'processed_ya_user_sessions.json'
data = load_data(file_path)
user_book_counts, reading_spans, books_counter, unique_books = analyze_data(data)
# print_statistics also reads the module-level `data` for its random sample.
print_statistics(user_book_counts, reading_spans, books_counter, unique_books)


用户数: 1000
平均阅读量: 220.56
中位数阅读量: 188
最小阅读量: 142
最大阅读量: 1126

平均阅读时间跨度: 2607.09 天

unique book 总数: 3934

最受欢迎的10本书:
Cinder (The Lunar Chronicles, #1): 661 次阅读
Divergent (Divergent, #1): 654 次阅读
The Fault in Our Stars: 603 次阅读
The Raven Boys (The Raven Cycle, #1): 558 次阅读
Throne of Glass (Throne of Glass, #1): 556 次阅读
Scarlet (The Lunar Chronicles, #2): 546 次阅读
Insurgent (Divergent, #2): 535 次阅读
Anna and the French Kiss (Anna and the French Kiss, #1): 531 次阅读
Fangirl: 525 次阅读
Shadow and Bone (The Grisha, #1): 523 次阅读

随机用户 ffa5094acb2bca8fc8655538e60c400e 的前5条阅读记录:
书名: Dreamhunter (The Dreamhunter Duet, #1), 阅读时间: 2009-01-01 08:00:00
书名: Magic Under Glass (Magic Under, #1), 阅读时间: 2010-05-01 07:00:00
书名: Matched (Matched, #1), 阅读时间: 2010-07-25 07:00:00
书名: Clockwork Angel (The Infernal Devices, #1), 阅读时间: 2010-07-28 07:00:00
书名: The Eternal Ones (Eternal Ones, #1), 阅读时间: 2010-07-31 07:00:00


In [26]:
import json

# 读取数据
def load_data(file_path):
    """Load and return the JSON content of ``file_path``.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as stream:
        content = json.load(stream)
    return content

# 重新映射用户ID和书籍ID
def remap_ids(data):
    """Replace user ids and book ids with consecutive integers starting at 0.

    Ids are assigned in first-seen order while iterating ``data``. For each
    book the title stored in ``book_id_map`` is the one from its first
    occurrence. A short debug sample of the result is printed.

    Args:
        data: Mapping of user id to a list of records with ``'book_id'``,
            ``'title'`` and ``'read_at'`` keys.

    Returns:
        tuple: ``(remapped_data, user_id_map, book_id_map)`` where
        ``remapped_data`` maps new user id to records carrying the new book id,
        ``user_id_map`` maps old user id to new user id, and ``book_id_map``
        maps old book id to ``(new_book_id, title)``.
    """
    user_id_map = {}
    book_id_map = {}
    remapped_data = {}

    for user, books in data.items():
        # Ids are handed out sequentially, so the next free id is the map size.
        if user not in user_id_map:
            user_id_map[user] = len(user_id_map)

        new_books = []
        for book in books:
            old_book_id, title = book['book_id'], book['title']
            if old_book_id not in book_id_map:
                book_id_map[old_book_id] = (len(book_id_map), title)

            new_books.append({
                'book_id': book_id_map[old_book_id][0],
                'title': title,
                'read_at': book['read_at']
            })

        remapped_data[user_id_map[user]] = new_books

    # Debug output: first five users, first five records each.
    print("\n调试信息：前5个用户和其书籍映射结果")
    for new_user, records in list(remapped_data.items())[:5]:
        print(f"新用户ID: {new_user}, 阅读记录数: {len(records)}")
        for record in records[:5]:
            print(f"  新书籍ID: {record['book_id']}, 书名: {record['title']}, 阅读时间: {record['read_at']}")

    return remapped_data, user_id_map, book_id_map

# 生成 id2name.txt 文件
def generate_id2name(book_id_map, output_file):
    """Write one ``"<new_id>:: <title>"`` line per book, ordered by new id.

    Args:
        book_id_map: Mapping of old book id to ``(new_id, title)``.
        output_file: Path of the text file to write.
    """
    sorted_books = sorted(book_id_map.items(), key=lambda x: x[1][0])  # order by new id
    # Written as UTF-8 so that a reader opening the file with encoding='utf-8'
    # gets the same text back regardless of the platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        for _old_id, (new_id, title) in sorted_books:
            f.write(f"{new_id}:: {title}\n")

# Main flow
file_path = 'processed_ya_user_sessions.json'
data = load_data(file_path)

# Remap user ids and book ids to consecutive integers
remapped_data, user_id_map, book_id_map = remap_ids(data)

# Save the remapped sessions to a JSON file
with open('remapped_ya_user_sessions.json', 'w') as f:
    json.dump(remapped_data, f, indent=2)

# Generate the id2name.txt file
# NOTE(review): this writes to the working directory, while the later cleaning
# cell reads '/workspace/teenager/id2name.txt' — confirm they are the same file.
generate_id2name(book_id_map, 'id2name.txt')

# Save the user id mapping table
with open('user_id_map.json', 'w') as f:
    json.dump(user_id_map, f, indent=2)

# Debug: inspect the first entries of both mapping tables
print("\n调试信息：前5个用户ID映射")
for i, (old_id, new_id) in enumerate(user_id_map.items()):
    if i >= 5:
        break
    print(f"旧用户ID: {old_id}, 新用户ID: {new_id}")

print("\n调试信息：前5个书籍ID映射")
for i, (old_id, (new_id, title)) in enumerate(book_id_map.items()):
    if i >= 5:
        break
    print(f"旧书籍ID: {old_id}, 新书籍ID: {new_id}, 书名: {title}")

print("处理完成，用户和书籍ID已更新，id2name.txt 文件已生成。")



调试信息：前5个用户和其书籍映射结果
新用户ID: 0, 阅读记录数: 268
  新书籍ID: 0, 书名: Matched (Matched, #1), 阅读时间: 1339225200.0
  新书籍ID: 1, 书名: Firelight (Firelight, #1), 阅读时间: 1339657200.0
  新书籍ID: 2, 书名: Thirteen Reasons Why, 阅读时间: 1340262000.0
  新书籍ID: 3, 书名: The Fault in Our Stars, 阅读时间: 1341903600.0
  新书籍ID: 4, 书名: City of Bones (The Mortal Instruments, #1), 阅读时间: 1355817600.0
新用户ID: 1, 阅读记录数: 250
  新书籍ID: 188, 书名: Vampire Academy (Vampire Academy, #1), 阅读时间: 1252631367.0
  新书籍ID: 268, 书名: After, 阅读时间: 1252738800.0
  新书籍ID: 210, 书名: Frostbite (Vampire Academy, #2), 阅读时间: 1259654400.0
  新书籍ID: 269, 书名: A Match Made in High School, 阅读时间: 1260141161.0
  新书籍ID: 270, 书名: Liar, 阅读时间: 1260141372.0
新用户ID: 2, 阅读记录数: 156
  新书籍ID: 490, 书名: The Looking Glass Wars (The Looking Glass Wars, #1), 阅读时间: 1191913200.0
  新书籍ID: 491, 书名: Seeing Redd (The Looking Glass Wars, #2), 阅读时间: 1200816000.0
  新书籍ID: 492, 书名: The Subtle Knife (His Dark Materials, #2), 阅读时间: 1204963200.0
  新书籍ID: 493, 书名: Lucinda's Secret (The Spiderwick Chr

In [38]:
import json
import pandas as pd

# 读取数据
def load_data(file_path):
    """Parse the JSON file at ``file_path`` and return its content.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as reader:
        parsed = json.load(reader)
    return parsed

# 填充历史序列的函数，将 pad_item 填充到序列末尾
def pad_history(itemlist, length, pad_item):
    """Return a new list of exactly ``length`` items built from ``itemlist``.

    Longer inputs are truncated to their last ``length`` items; shorter inputs
    are padded at the end with ``pad_item``.

    Args:
        itemlist: Source list; it is not modified.
        length: Desired output length. Values ``<= 0`` yield an empty list.
        pad_item: Filler appended when ``itemlist`` is too short.

    Returns:
        list: A list of ``length`` items (empty when ``length <= 0``).
    """
    # Guard first: itemlist[-0:] is the whole list, not an empty one.
    if length <= 0:
        return []
    if len(itemlist) >= length:
        return itemlist[-length:]
    return itemlist + [pad_item] * (length - len(itemlist))

# # 生成训练集的函数，并过滤掉 len_seq = 0 的记录
# def generate_train_sequences(data, length=10, pad_item=3934):
#     state, len_state, action = [], [], []
    
#     for user_id, books in data.items():
#         history = []
#         for index, book in enumerate(books):
#             s = list(history)  # 复制当前的历史记录
#             if len(history) > 0:  # 只生成有效的历史序列
#                 len_state.append(len(s) if len(s) < length else length)  # 保存历史序列的长度
#                 s = pad_history(s, length, pad_item)  # 填充或截取历史序列

#                 state.append(s)
#                 action.append(book['book_id'])  # 预测的下一本书

#             # 更新历史记录
#             history.append(book['book_id'])
    
#     # 创建 DataFrame 并确保索引从 0 开始
#     train_df = pd.DataFrame({'seq': state, 'len_seq': len_state, 'next': action})
#     train_df.reset_index(drop=True, inplace=True)

#     # 打印一些调试信息
#     print(f"训练集生成完成，总记录数: {len(train_df)}")
#     print(f"前5条训练集记录: \n{train_df.head()}")

#     return train_df

# 生成训练集的函数，并过滤掉 len_seq = 0 的记录，且打乱顺序
def generate_train_sequences(data, length=10, pad_item=3934):
    """Build shuffled next-item training rows from every prefix of each user's list.

    For each user, every record except the first becomes one row whose ``seq``
    is the preceding book ids (truncated or padded by ``pad_history``), whose
    ``len_seq`` is the unpadded history length capped at ``length``, and whose
    ``next`` is the record's own book id.

    Args:
        data: Mapping of user id to an ordered list of records with a
            ``'book_id'`` key.
        length: Fixed length of each ``seq``.
        pad_item: Filler id used for padding.

    Returns:
        pandas.DataFrame: Columns ``seq``, ``len_seq`` and ``next``, shuffled
        with ``random_state=42`` and re-indexed from 0.
    """
    sequences, lengths, targets = [], [], []

    for books in data.values():
        seen = []
        for book in books:
            # The first record of a user has no history, so it yields no row.
            if seen:
                lengths.append(min(len(seen), length))
                sequences.append(pad_history(list(seen), length, pad_item))
                targets.append(book['book_id'])

            seen.append(book['book_id'])

    train_df = pd.DataFrame({'seq': sequences, 'len_seq': lengths, 'next': targets})
    train_df = train_df.reset_index(drop=True)

    # Shuffle with a fixed seed so the row order is reproducible.
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"训练集生成完成，总记录数: {len(train_df)}")
    print(f"前5条训练集记录: \n{train_df.head()}")

    return train_df

# 生成验证集和测试集的函数，并过滤掉 len_seq = 0 的记录
def generate_test_sequences(data, length=10, pad_item=3934):
    """Build one evaluation row per user: all but the last book predict the last.

    Users with fewer than two records produce no row.

    Args:
        data: Mapping of user id to an ordered list of records with a
            ``'book_id'`` key.
        length: Fixed length of each ``seq``.
        pad_item: Filler id used for padding.

    Returns:
        pandas.DataFrame: Columns ``seq`` (history truncated or padded by
        ``pad_history``), ``len_seq`` (unpadded history length capped at
        ``length``) and ``next`` (the user's last book id), indexed from 0.
    """
    sequences, lengths, targets = [], [], []

    for books in data.values():
        book_ids = [book['book_id'] for book in books]

        # A non-empty history requires at least two records.
        if len(book_ids) <= 1:
            continue

        past = book_ids[:-1]
        lengths.append(min(len(past), length))
        sequences.append(pad_history(past, length, pad_item))
        targets.append(book_ids[-1])

    test_df = pd.DataFrame({'seq': sequences, 'len_seq': lengths, 'next': targets})
    test_df = test_df.reset_index(drop=True)

    print(f"验证/测试集生成完成，总记录数: {len(test_df)}")
    print(f"前5条验证/测试集记录: \n{test_df.head()}")

    return test_df

# 检查数据集
def check_data(df, name):
    """Print a short summary of ``df`` and assert that no ``len_seq`` is zero.

    Args:
        df: DataFrame with a ``len_seq`` column.
        name: Label used in the printed lines and in the assertion message.

    Raises:
        AssertionError: If the smallest ``len_seq`` is not greater than 0.
    """
    shortest = df['len_seq'].min()
    preview = df.head(3)
    print(f"{name} 集合长度: {len(df)}")
    print(f"{name} 集合中前 3 条记录:\n{preview}")
    print(f"{name} 中 len_seq 的最小值: {shortest}")
    assert shortest > 0, f"{name} 集合中有 len_seq = 0 的记录！"

# Main flow
file_path = 'remapped_ya_user_sessions.json'
data = load_data(file_path)

# Split users 8:1:1 into train / validation / test, in file order.
# Plain list slicing is used instead of np.split: numpy is never imported in
# this cell, so np.split raised NameError on a fresh kernel.
total_users = list(data.keys())
fractions = [0.8, 0.1, 0.1]
train_end = int(0.8 * len(total_users))
val_end = int(0.9 * len(total_users))
train_users = total_users[:train_end]
val_users = total_users[train_end:val_end]
test_users = total_users[val_end:]

# Build the per-split session dicts from the user split
train_data = {user: data[user] for user in train_users}
val_data = {user: data[user] for user in val_users}
test_data = {user: data[user] for user in test_users}

# Generate and save the training set
train_df = generate_train_sequences(train_data)
train_df.to_pickle('/workspace/teenager/train_data.df')

# Generate and save the validation set
val_df = generate_test_sequences(val_data)
val_df.to_pickle('/workspace/teenager/val_data.df')

# Generate and save the test set
test_df = generate_test_sequences(test_data)
test_df.to_pickle('/workspace/teenager/test_data.df')

# Sanity-check the generated DataFrames
check_data(train_df, "训练集")
check_data(val_df, "验证集")
check_data(test_df, "测试集")

print("数据集生成并保存完成。")


训练集生成完成，总记录数: 176176
前5条训练集记录: 
                                                 seq  len_seq  next
0   [17, 89, 573, 461, 96, 633, 634, 685, 445, 2036]       10   642
1  [100, 2373, 3432, 1428, 2388, 80, 81, 83, 3390...       10  3562
2  [621, 1121, 454, 692, 721, 1899, 2203, 151, 13...       10   188
3  [1071, 84, 471, 2270, 1326, 3413, 1163, 1414, ...       10  2503
4  [3354, 374, 1165, 2038, 1119, 922, 937, 3669, ...       10   813
验证/测试集生成完成，总记录数: 100
前5条验证/测试集记录: 
                                                 seq  len_seq  next
0  [3345, 1914, 1305, 2032, 2571, 258, 771, 246, ...       10  1292
1  [2463, 108, 2763, 736, 388, 238, 572, 1450, 70...       10   153
2  [1465, 2431, 250, 2430, 742, 251, 837, 2677, 7...       10  2532
3  [111, 2918, 2164, 439, 613, 1189, 709, 3195, 1...       10  2991
4  [2836, 881, 1469, 880, 2292, 891, 767, 2972, 1...       10  2535
验证/测试集生成完成，总记录数: 100
前5条验证/测试集记录: 
                                                 seq  len_seq  next
0  [1343, 875,

In [40]:
import re

def clean_id2name_remove_brackets(file_path, output_path):
    """Strip parenthesised text from the titles of an ``id::title`` file.

    Each input line is split on the first ``::`` only, so a title that itself
    contains ``::`` is kept rather than silently dropped. Lines without ``::``
    or with an empty title are skipped.

    Args:
        file_path: UTF-8 text file with one ``<id>::<title>`` entry per line.
        output_path: Path of the cleaned UTF-8 file; entries are written as
            ``<id>::<cleaned title>``.
    """
    # Non-greedy match of "(...)" together with any whitespace before it.
    bracket_re = re.compile(r'\s*\(.*?\)')
    cleaned_data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('::', 1)
            if len(parts) != 2:  # no separator on this line
                continue
            book_id, book_name = parts
            if not book_name:  # nothing after the separator
                continue
            book_name_cleaned = bracket_re.sub('', book_name).strip()
            cleaned_data.append(f"{book_id}::{book_name_cleaned}")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{entry}\n" for entry in cleaned_data)

    print(f"清洗完成，已保存到 {output_path}")
    print(f"清洗后条目数: {len(cleaned_data)}")

# Run the cleaning step
file_path = '/workspace/teenager/id2name.txt'
output_path = '/workspace/teenager/id2name_cleaned_no_brackets.txt'

clean_id2name_remove_brackets(file_path, output_path)


清洗完成，已保存到 /workspace/teenager/id2name_cleaned_no_brackets.txt
清洗后条目数: 3934
