# Bilibili Web Scraper

In [None]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output

# Browser-like HTTP headers handed to every requests.get call in this notebook.
# The long values are split across adjacent string literals purely for
# readability; Python joins them into the same single strings.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.106 Safari/537.36"
    ),
}

## Video

In [None]:
def video_url(page, oid):
    """Return the reply-list API URL for one page of a video's comments.

    ``page`` is placed in the ``next`` query parameter and ``oid`` in the
    ``oid`` parameter; both are rendered with ``str()``. ``type=1`` and
    ``mode=3`` are fixed.
    """
    # default page size is 20
    endpoint = "https://api.bilibili.com/x/v2/reply/main"
    return f"{endpoint}?next={page!s}&type=1&oid={oid!s}&mode=3"

# Scrape the top-level comments of one video, page by page, into `df`.
page = 0
oid = 712909579 # Genshin Impact character demo: "Zhongli: The Listener"

# Fetch the first page up front: its cursor carries the total comment count
# used by the progress display. The timeout keeps a stalled connection from
# hanging the cell forever.
response = requests.get(url = video_url(page, oid), headers = headers, timeout = 10)
result = json.loads(response.text)
total = result["data"]["cursor"]["all_count"]

start = time.time()
columns = ["name", "level", "datetime", "content", "like", "rcount"]
# Collect plain dicts and build the DataFrame once at the end; calling
# pd.concat for every reply copies the whole frame each time (quadratic).
rows = []
while True:
    # NOTE(review): guards against a null/missing "replies" on an empty page;
    # confirm against the API whether that actually occurs.
    for reply in result["data"].get("replies") or []:
        rows.append({
            "name": reply["member"]["uname"],
            "level": reply["member"]["level_info"]["current_level"],
            # ctime is formatted in the machine's local timezone
            "datetime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(reply["ctime"])),
            "content": reply["content"]["message"],
            "like": reply["like"],
            "rcount": reply["rcount"]
        })

        # Live progress: latest comment, rows collected so far, elapsed time.
        clear_output()
        print(reply["content"]["message"])
        print(str(len(rows)) + "/" + str(total) + ", currently on page " + str(page))

        whole_minutes, s = divmod(time.time() - start, 60)
        h, m = divmod(int(whole_minutes), 60)
        print("{:0>2}:{:0>2}:{:05.2f}".format(h, m, s))

    # Check is_end only after the page has been processed, so replies that
    # arrive on the final page are not dropped.
    if result["data"]["cursor"]["is_end"]:
        break

    page += 1
    response = requests.get(url = video_url(page, oid), headers = headers, timeout = 10)
    result = json.loads(response.text)

df = pd.DataFrame(rows, columns = columns)
# The same reply can be returned by more than one request; keep the first copy.
df.drop_duplicates(subset = ["name", "datetime", "content"], keep = "first", inplace = True)
df.sort_values(by = ["datetime"], inplace = True)
df.head()

In [None]:
# Write the current `df` (index column included) to a CSV file in the
# working directory.
video_csv_path = "Zhong_Li.csv"
df.to_csv(video_csv_path)

## Game Center

In [None]:
def game_url(page, game_id):
    """Return the game-center comment API URL for one page of a game's reviews.

    ``game_id`` is placed in the ``game_base_id`` query parameter and ``page``
    in ``page_num``; both are rendered with ``str()``. ``rank_type=2`` is fixed.
    """
    # default page size is 10
    endpoint = "https://line3-h5-pc-api.biligame.com/game/comment/page"
    return f"{endpoint}?game_base_id={game_id!s}&rank_type=2&page_num={page!s}"

# Scrape the game-center reviews of one game, page by page, into `df`.
game_id = 103496 # Genshin Impact

# The first request is only used to learn how many pages exist. The timeout
# keeps a stalled connection from hanging the cell forever.
response = requests.get(url = game_url(1, game_id), headers = headers, timeout = 10)
result = json.loads(response.text)
page_count = result["data"]["page_count"]

start = time.time()
columns = ["name", "level", "datetime", "content", "grade", "up", "down", "rcount"]
# Collect plain dicts and build the DataFrame once at the end; calling
# pd.concat for every review copies the whole frame each time (quadratic).
rows = []
# Pages are walked from the last one back to page 1.
for i in range(page_count, 0, -1):
    response = requests.get(url = game_url(i, game_id), headers = headers, timeout = 10)
    result = json.loads(response.text)
    # NOTE(review): guards against a null/missing "list" or "reply_list";
    # confirm against the API whether that actually occurs.
    for reply in result["data"].get("list") or []:
        rows.append({
            "name": reply["user_name"],
            "level": reply["user_level"],
            "datetime": reply["publish_time"],
            "content": reply["content"],
            "grade": reply["grade"],
            "up": reply["up_count"],
            "down": reply["down_count"],
            # rcount is the number of entries in the embedded reply list
            "rcount": len(reply.get("reply_list") or [])
        })

        # Live progress: latest review, rows collected so far, elapsed time.
        # The denominator is an upper-bound estimate (page_count * 10).
        clear_output()
        print(reply["content"])
        print(str(len(rows)) + "/" + str(page_count * 10) + ", currently on page " + str(i))

        whole_minutes, s = divmod(time.time() - start, 60)
        h, m = divmod(int(whole_minutes), 60)
        print("{:0>2}:{:0>2}:{:05.2f}".format(h, m, s))

df = pd.DataFrame(rows, columns = columns)
# The same review can be returned by more than one request; keep the first copy.
df.drop_duplicates(subset = ["name", "datetime", "content"], keep = "first", inplace = True)
df.sort_values(by = ["datetime"], inplace = True)
df.head()

In [None]:
# Write the current `df` (index column included) to a CSV file in the
# working directory.
game_csv_path = "Genshin_Impact.csv"
df.to_csv(game_csv_path)