### Gamer crawler
1. Crawl the latest 10 pages of search results
2. Collect each article's title and URL
3. Collect the like ("star") count and comment count of each article

### URL reference
* Get comment amount
    >- https://gnn.gamer.com.tw/ajax/gnn-html.php?sn=235369
    >- res.json()['data']['comment'] to get HTML then extract the comment amount number
* Get star amount
    >- https://wall.gamer.com.tw/api/link_post.php?url=https://gnn.gamer.com.tw/detail.php?sn=235369
    >- {"data":{"like":3,"isLike":0}}
    ```python
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "referer": "https://gnn.gamer.com.tw/detail.php?sn=235369",
    "cookie": "_gid=GA1.3.800346094.1658831722; __gads=ID=f03b6a46efed39d5:T=1658831737:S=ALNI_MZ0uW5i3TTNa5riIFAdxi5W1eSdhQ; buap_modr=p014; ckForumListOrder=post; ckAPP_VCODE=1328; ckBahamutCsrfToken=fa82599a4bbb0ff1; __gpi=UID=000008175adc3c2a:T=1658831737:RT=1658887731:S=ALNI_MbDpsVR6L5mfXM2uzY9StYVx2wcfQ; ckBahaAd=0------------------------; ckBH_lastBoard=[[%225191%22%2C%22%E6%96%B0%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A%EF%BC%88TalesWeaver%EF%BC%89%22]%2C[%227650%22%2C%22%E6%96%B0%E6%A5%93%E4%B9%8B%E8%B0%B7%22]]; buap_puoo=p301%20p103; _ga=GA1.1.1951735684.1658831722; _ga_2Q21791Y9D=GS1.1.1658904159.4.1.1658906035.60"
}
    ```

In [54]:
import pandas as pd
import requests
import random
import pprint
import json
import time
import re
import os
from bs4 import BeautifulSoup
from urllib.parse import unquote

In [47]:
class GamerCrawler:
    """Crawler for GNN news articles on gamer.com.tw, searched through Google CSE.

    Search goes through the Google Custom Search Engine endpoints
    (``cse.js`` for the token, ``cse/element/v1`` for the results); per-article
    comment count, like count and publish date are read from gamer.com.tw.
    """

    # Browser-like User-Agent sent with every request.
    _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"

    # Seconds to wait for a single HTTP response, so a stalled connection
    # cannot hang the crawl forever.
    _REQUEST_TIMEOUT = 30

    # NOTE(review): hard-coded cookie captured from a browser session (it
    # contains a ckBahamutCsrfToken value). Presumably the like API needs it —
    # TODO confirm, and consider loading it from configuration instead of
    # committing it with the code.
    _LIKE_API_COOKIE = "_gid=GA1.3.800346094.1658831722; __gads=ID=f03b6a46efed39d5:T=1658831737:S=ALNI_MZ0uW5i3TTNa5riIFAdxi5W1eSdhQ; buap_modr=p014; ckForumListOrder=post; ckAPP_VCODE=1328; ckBahamutCsrfToken=fa82599a4bbb0ff1; __gpi=UID=000008175adc3c2a:T=1658831737:RT=1658887731:S=ALNI_MbDpsVR6L5mfXM2uzY9StYVx2wcfQ; ckBahaAd=0------------------------; ckBH_lastBoard=[[%225191%22%2C%22%E6%96%B0%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A%EF%BC%88TalesWeaver%EF%BC%89%22]%2C[%227650%22%2C%22%E6%96%B0%E6%A5%93%E4%B9%8B%E8%B0%B7%22]]; buap_puoo=p301%20p103; _ga=GA1.1.1951735684.1658831722; _ga_2Q21791Y9D=GS1.1.1658904159.4.1.1658906035.60"

    def __init__(self):
        self.cxSeries = "partner-pub-9012069346306566:kd3hd85io9c"
        self.cseToken = None  # e.g. AB1-RNUa21L3zW3YjE3vq6xHeRVG:1658974378322
        self.cselibVersion = None  # e.g. 3e1664f444e6eb06
        self.cseTokenUsedCount = 0

    def _defaultHeaders(self) -> dict:
        """Return a fresh header dict carrying only the User-Agent."""
        return {"User-Agent": self._USER_AGENT}

    @staticmethod
    def _extractJsString(jsText: str, key: str) -> str:
        """Return the string value assigned to ``"key"`` inside the cse.js text.

        Raises:
            RuntimeError: if the key is not present in ``jsText``.
        """
        pattern = r'"{}":\s*"([0-9a-zA-Z\-_:]*)"'.format(re.escape(key))
        match = re.search(pattern, jsText)
        if match is None:
            raise RuntimeError("'{}' not found in the cse.js response".format(key))
        return match.group(1)

    def genCseToken(self, force=False) -> tuple:
        """Return ``(cseToken, cselibVersion)``, fetching fresh values when needed.

        Fresh values are downloaded when no token is cached yet, when the
        cached token has already been handed out more than 5 times, or when
        ``force`` is true. Otherwise the cached pair is returned and the usage
        counter is incremented.

        Raises:
            RuntimeError: if the downloaded script lacks either value.
        """
        cacheUsable = (
            self.cseToken is not None
            and self.cseTokenUsedCount <= 5
            and not force
        )
        if cacheUsable:
            self.cseTokenUsedCount += 1
            return (self.cseToken, self.cselibVersion)

        url = "https://cse.google.com/cse.js?cx={}".format(self.cxSeries)
        res = requests.get(
            url, headers=self._defaultHeaders(), timeout=self._REQUEST_TIMEOUT
        )

        # Extract both values before assigning, so a failed extraction never
        # leaves a new token paired with a stale library version.
        newToken = self._extractJsString(res.text, "cse_token")
        newVersion = self._extractJsString(res.text, "cselibVersion")
        self.cseToken = newToken
        self.cselibVersion = newVersion
        self.cseTokenUsedCount = 0

        return (self.cseToken, self.cselibVersion)

    def extractCseText(self, q:str, start = 0, sort = 'date') -> str:
        """Run one CSE search restricted to news and return the raw response text.

        Args:
            q: search keyword; ``' more:找新聞'`` is appended to it.
            start: result offset; only sent when it is not 0.
            sort: value of the ``sort`` query parameter.

        Returns:
            The response body, a JSONP-wrapped JSON document
            (see ``cseTextToJson``).
        """
        self.genCseToken()
        params = {
            'rsz': '10',
            'num': '10',
            'hl': 'zh-TW',
            'source': 'gcsc',
            'gss': '.tw',
            'cselibv': self.cselibVersion,
            'cx': self.cxSeries,
            'q': q + ' more:找新聞',
            'safe': 'active',
            'cse_tok': self.cseToken,
            'sort': sort,
            'exp': 'csqr,cc',
            'rsToken': 'undefined',
            'afsExperimentId': 'undefined',
            'callback': 'google.search.cse.api{}'.format(random.randint(148, 17963))
        }
        if start != 0:
            params["start"] = str(start)

        url = "https://cse.google.com/cse/element/v1"
        res = requests.get(
            url,
            params=params,
            headers=self._defaultHeaders(),
            timeout=self._REQUEST_TIMEOUT,
        )

        return res.text

    def cseTextToJson(self, cseText:str) -> dict:
        """Strip the JSONP wrapper from a CSE response and parse the JSON inside.

        The response looks like ``/*O_o*/\\ngoogle.search.cse.apiNNN({...});``.
        The payload is taken as everything between the first ``(`` and the last
        ``)``, so trailing whitespace or a missing ``;`` does not matter.

        Raises:
            ValueError: if no parenthesised payload is found, or the payload is
                not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        opening = cseText.find('(')
        closing = cseText.rfind(')')
        if opening == -1 or closing <= opening:
            raise ValueError("CSE response has no JSONP wrapper to unwrap")

        return json.loads(cseText[opening + 1:closing])

    def getCommentAmount(self, articleSn:int) -> int:
        """Return the number of comments on the article with serial ``articleSn``.

        Raises:
            ValueError: if the comment HTML contains no ``<p>`` element.
        """
        url = "https://gnn.gamer.com.tw/ajax/gnn-html.php?sn={}".format(articleSn)
        res = requests.get(
            url, headers=self._defaultHeaders(), timeout=self._REQUEST_TIMEOUT
        )

        # Explicit parser: results must not depend on which optional parser
        # happens to be installed.
        commentSoup = BeautifulSoup(res.json()['data']['comment'], 'html.parser')
        paragraphs = commentSoup.select('p')
        if not paragraphs:
            raise ValueError(
                "no <p> element in comment HTML for sn={}".format(articleSn)
            )

        # When only part of the comments are inlined, the first <p> is a
        # "show all" link, e.g. "顯示所有的 6 則評語" ("show all 6 comments").
        firstText = paragraphs[0].text
        if '顯示所有的' in firstText:
            return int(firstText.replace('顯示所有的', '').replace('則評語', ''))

        # Otherwise every comment is inlined as one <p>, plus one extra <p>
        # that wraps the comment input box.
        return len(paragraphs) - 1

    def getLikeAmount(self, articleSn:int) -> int:
        """Return the like count the wall API reports for the article."""
        articleUrl = "https://gnn.gamer.com.tw/detail.php?sn={}".format(articleSn)
        headers = {
            "User-Agent": self._USER_AGENT,
            "referer": articleUrl,
            "cookie": self._LIKE_API_COOKIE,
        }

        url = "https://wall.gamer.com.tw/api/link_post.php"
        res = requests.get(
            url,
            params={"url": articleUrl},
            headers=headers,
            timeout=self._REQUEST_TIMEOUT,
        )

        # Response shape: {"data": {"like": 3, "isLike": 0}}
        return res.json()['data']['like']

    def getPublishedDate(self, articleSn:int) -> "str | None":
        """Return the article's ``datePublished`` value from its ld+json block.

        Returns:
            The ``datePublished`` string (e.g. ``2022-07-27T11:22:58+08:00``),
            or ``None`` when the page has no ld+json script or none of its
            entries carries that key.
        """
        url = "https://gnn.gamer.com.tw/detail.php?sn={}".format(articleSn)
        res = requests.get(
            url, headers=self._defaultHeaders(), timeout=self._REQUEST_TIMEOUT
        )
        soup = BeautifulSoup(res.text, 'html.parser')

        ldJsonTag = soup.select_one('script[type="application/ld+json"]')
        if ldJsonTag is None:
            return None

        ldJson = json.loads(ldJsonTag.text)
        # The payload is normally a list of schema.org objects; accept a
        # single object too rather than iterating over its keys.
        entries = ldJson if isinstance(ldJson, list) else [ldJson]
        for entry in entries:
            if isinstance(entry, dict) and 'datePublished' in entry:
                return entry['datePublished']

        return None



In [38]:
# Smoke test: fetch the first page of search results for one keyword and
# dump the parsed JSON.
queryKeyWord = "天翼之鍊"
gc = GamerCrawler()

extractedCseText = gc.extractCseText(start=0, q=queryKeyWord)
resultData = gc.cseTextToJson(cseText=extractedCseText)
pprint.pprint(resultData)

{'context': {'facets': [{'anchor': '找新聞',
                         'count': '303',
                         'label': '找新聞',
                         'label_with_op': 'more:找新聞'},
                        {'anchor': '找文章',
                         'count': '0',
                         'label': '找文章',
                         'label_with_op': 'more:找文章'},
                        {'anchor': '找看板',
                         'count': '0',
                         'label': '找看板',
                         'label_with_op': 'more:找看板'},
                        {'anchor': '找公會',
                         'count': '0',
                         'label': '找公會',
                         'label_with_op': 'more:找公會'},
                        {'anchor': '找勇者',
                         'count': '0',
                         'label': '找勇者',
                         'label_with_op': 'more:找勇者'},
                        {'anchor': '找創作',
                         'count': '0',
                         'label'

In [66]:
# Print each result's Open Graph title next to the CSE title, which is
# truncated and may contain <b> highlight markup.
for articleObj in resultData['results']:
    ogTitle = articleObj['richSnippet']['metatags']['ogTitle']
    print(ogTitle)
    print("++++++++++++++++++")
    cseTitle = articleObj['title']
    print(cseTitle)
    print("========")

【G★2019】輕鬆放置《加油吧，突擊！仙境傳說 2》試玩影片 現場展出眾多波利周邊
++++++++++++++++++
【G  2019】輕鬆放置《加油吧，突擊！仙境傳說2》試玩影片現場展 ...
【試玩】趣味動作遊戲《準時下班 ONLINE》與同事們起合作拚下班！
++++++++++++++++++
【試玩】趣味動作遊戲《準時下班ONLINE》與同事們起合作拚下班 ...
像素風動作遊戲《非靈魂 Unsouled》亮相 預計 2020 年春季上市
++++++++++++++++++
像素風動作遊戲《非靈魂Unsouled》亮相預計2020 年春季上市 ...
融入中國妖怪文化解謎遊戲《山海旅人》近日公開 預計 2020 年上市
++++++++++++++++++
融入中國妖怪文化解謎遊戲《山海旅人》近日公開預計2020 年上市 ...
《靈魂之聲 Online》推出「今日攻擊隊」隨機系統與全新啤酒節慶典活動
++++++++++++++++++
《靈魂之聲Online》推出「今日攻擊隊」隨機系統與全新啤酒節慶典 ...
【G★2019】NEXON 14 年來將首度缺席韓國最大遊戲展 G-STAR
++++++++++++++++++
【G  2019】NEXON 14 年來將首度缺席韓國最大遊戲展G-STAR - 巴 ...
《天翼之鍊》韓版今日改版推出 EPISODE 4《變奏》第一章 開場動畫曝光
++++++++++++++++++
《<b>天翼之鍊</b>》韓版今日改版推出EPISODE 4《變奏》第一章開場動畫 ...
【電玩瘋】2019 下半年十大期待手機遊戲
++++++++++++++++++
【電玩瘋】2019 下半年十大期待手機遊戲- 巴哈姆特
《天翼之鍊 M》揭露更多遊戲細節 上市時程仍未確定
++++++++++++++++++
《<b>天翼之鍊</b>M》揭露更多遊戲細節上市時程仍未確定 ...
《天翼之鍊》韓版公開 EPISODE 4《變奏》宣傳影片 預告《天翼之鍊 M》即將登場
++++++++++++++++++
《<b>天翼之鍊</b>》韓版公開EPISODE 4《變奏》宣傳影片預告《<b>天翼之鍊</b> ...


In [37]:
gc.genCseToken()

('AB1-RNVnJ8j_uoCfB_H_JpZ-KrLQ:1658978375256', '3e1664f444e6eb06')

In [95]:
# for articleObj in resultData['results']:
#     title = articleObj['title']
#     articleUrl = articleObj['unescapedUrl']
#     sn = ''
#     commentAmount = -1
#     likeAmount = -1
    
#     # Get SN number
#     if 'sn=' in articleUrl:
#         sn = articleUrl.split('sn=')[1]
    
#     print(title)
#     print(articleUrl)
#     print("SN:", sn)
#     print("commentAmount:", commentAmount)
#     print("likeAmount:", likeAmount)
#     print("========")
    


《夢魘騎士團- REMASTER 版-》於手機平台推出可免費遊玩至第4 章 ...
https://gnn.gamer.com.tw/detail.php?sn=235369
SN: 235369
commentAmount: -1
likeAmount: -1
定位技術RPG《變形金剛聯盟》正式於日本推出在現實世界中召喚 ...
https://gnn.gamer.com.tw/detail.php?sn=235377
SN: 235377
commentAmount: -1
likeAmount: -1
《R2M：重燃戰火》今正式上線承襲《R2》經典玩法「據點攻城戰 ...
https://gnn.gamer.com.tw/detail.php?sn=235317
SN: 235317
commentAmount: -1
likeAmount: -1
《R2M：重燃戰火》正式上市倒數1 天提前開放事前下載《R2 Mobile ...
https://gnn.gamer.com.tw/detail.php?sn=235279
SN: 235279
commentAmount: -1
likeAmount: -1
GNN 新聞網- 巴哈姆特
https://gnn.gamer.com.tw/?k=1&p=1
SN: 
commentAmount: -1
likeAmount: -1
魔物娘RPG《四葉草劇場》確定8/3 在日本推出釋出角色介紹PV 第三彈
https://gnn.gamer.com.tw/detail.php?sn=235074
SN: 235074
commentAmount: -1
likeAmount: -1
《黑色沙漠MOBILE》仲夏之夜回饋活動「感謝之夜」第三波登場 ...
https://gnn.gamer.com.tw/detail.php?sn=234945
SN: 234945
commentAmount: -1
likeAmount: -1
《<b>天翼之鍊</b>：SecondRun》發表會過程與試玩記錄18 年前的經典劇情 ...
https://gnn.gamer.com.tw/detail.php?sn=234853
SN: 234853
commentAmount: -1
likeAmount: -1
承襲《R2》IP

In [118]:
# for articleObj in resultData['results']:
#     title = articleObj['title']
#     articleUrl = articleObj['unescapedUrl']
#     sn = ''
#     commentAmount = -1
#     likeAmount = -1
#     publishedDate = None
    
#     # Get SN number
#     if 'sn=' in articleUrl:
#         sn = articleUrl.split('sn=')[1]
#     else:
#         continue
    
#     try:
#         time.sleep(random.randint(2, 5))
#         commentAmount = getCommentAmount(sn)
#     except Exception as e:
#         print(e.args)
#         commentAmount = -1
        
#     try:
#         time.sleep(random.randint(2, 5))
#         likeAmount = getLikeAmount(sn)
#     except Exception as e:
#         print(e.args)
#         likeAmount = -1
        
#     try:
#         time.sleep(random.randint(2, 5))
#         publishedDate = getPublishedDate(sn)
#     except Exception as e:
#         print(e.args)
#         publishedDate = None
        
#     print(title)
#     print(articleUrl)
#     print("SN:", sn)
#     print("commentAmount:", commentAmount)
#     print("likeAmount:", likeAmount)
#     print("publishedDate:", publishedDate)
#     print("========")
    
#     time.sleep(random.randint(2, 20))

《夢魘騎士團- REMASTER 版-》於手機平台推出可免費遊玩至第4 章 ...
https://gnn.gamer.com.tw/detail.php?sn=235369
SN: 235369
commentAmount: 8
likeAmount: 3
publishedDate: 2022-07-27T11:22:58+08:00
定位技術RPG《變形金剛聯盟》正式於日本推出在現實世界中召喚 ...
https://gnn.gamer.com.tw/detail.php?sn=235377
SN: 235377
commentAmount: 14
likeAmount: 1
publishedDate: 2022-07-27T12:09:01+08:00
《R2M：重燃戰火》今正式上線承襲《R2》經典玩法「據點攻城戰 ...
https://gnn.gamer.com.tw/detail.php?sn=235317
SN: 235317
commentAmount: 16
likeAmount: 2
publishedDate: 2022-07-26T12:54:35+08:00
《R2M：重燃戰火》正式上市倒數1 天提前開放事前下載《R2 Mobile ...
https://gnn.gamer.com.tw/detail.php?sn=235279
SN: 235279
commentAmount: 8
likeAmount: 0
publishedDate: 2022-07-25T17:39:17+08:00
魔物娘RPG《四葉草劇場》確定8/3 在日本推出釋出角色介紹PV 第三彈
https://gnn.gamer.com.tw/detail.php?sn=235074
SN: 235074
commentAmount: 15
likeAmount: 9
publishedDate: 2022-07-21T14:17:58+08:00
《黑色沙漠MOBILE》仲夏之夜回饋活動「感謝之夜」第三波登場 ...
https://gnn.gamer.com.tw/detail.php?sn=234945
SN: 234945
commentAmount: 1
likeAmount: 8
publishedDate: 2022-07-19T17:45:40+0

In [48]:
# Crawl 10 result pages (10 results each) for one keyword and collect, per
# article: title, like count, comment count, publish date and URL.
queryKeyWord = "天翼之鍊"

gc = GamerCrawler()

columns = ["title", "like_amount", "comment_amount", "published_date", "article_url"]
data = []

for offSet in range(0, 10):
    extractedCseText = gc.extractCseText(q=queryKeyWord, start=offSet * 10)
    resultData = gc.cseTextToJson(extractedCseText)

    if 'results' not in resultData:
        # A response without 'results' is treated as a rejected cse_token:
        # force a new token, then request the same page again so the loop
        # below has something to read.
        print("-----------------------")
        print("Regenerate cse_token...")
        print(gc.genCseToken(force=True))
        print("-----------------------")
        extractedCseText = gc.extractCseText(q=queryKeyWord, start=offSet * 10)
        resultData = gc.cseTextToJson(extractedCseText)

    # If the retry also has no 'results', skip the page instead of crashing.
    for articleObj in resultData.get('results', []):
        # Prefer the full Open Graph title; fall back to the truncated CSE
        # title when the rich snippet is missing or has another shape.
        try:
            title = articleObj['richSnippet']['metatags']['ogTitle']
        except (KeyError, TypeError):
            title = articleObj['title']

        articleUrl = articleObj['unescapedUrl']
        commentAmount = -1
        likeAmount = -1
        publishedDate = None

        # Get SN number; results without one (e.g. listing pages) are skipped.
        # Matching digits only keeps trailing query parameters out of the SN.
        snMatch = re.search(r'[?&]sn=(\d+)', articleUrl)
        if snMatch is None:
            continue
        sn = snMatch.group(1)

        # Each lookup is best-effort: on failure the error is printed and the
        # default (-1 / None) set above is kept.
        try:
            time.sleep(random.randint(2, 5))
            commentAmount = gc.getCommentAmount(sn)
        except Exception as e:
            print(e.args)

        try:
            time.sleep(random.randint(2, 5))
            likeAmount = gc.getLikeAmount(sn)
        except Exception as e:
            print(e.args)

        try:
            time.sleep(random.randint(2, 5))
            publishedDate = gc.getPublishedDate(sn)
        except Exception as e:
            print(e.args)

        print(title)
        print(articleUrl)
        print("SN:", sn)
        print("commentAmount:", commentAmount)
        print("likeAmount:", likeAmount)
        print("publishedDate:", publishedDate)
        print("========")

        data.append(
            [title, likeAmount, commentAmount, publishedDate, articleUrl]
        )

        # Random pause between articles.
        time.sleep(random.randint(2, 20))

    # Longer random pause between result pages.
    time.sleep(random.randint(30, 60))

《靈魂潮汐》新版本「永恆的圓舞曲」上線人偶「蕖靈」登場《Soul ...
https://gnn.gamer.com.tw/detail.php?sn=235402
SN: 235402
commentAmount: 1
likeAmount: 6
publishedDate: 2022-07-27T16:08:16+08:00
《天堂2 M》推出「壺精融合系統」及多項活動《Lineage 2M》 - 巴哈 ...
https://gnn.gamer.com.tw/detail.php?sn=235391
SN: 235391
commentAmount: 7
likeAmount: 1
publishedDate: 2022-07-27T13:01:46+08:00
《Valor Legends：Idle RPG》在日本推出夏日祭限時活動即將開幕 ...
https://gnn.gamer.com.tw/detail.php?sn=235421
SN: 235421
commentAmount: 0
likeAmount: 1
publishedDate: 2022-07-27T17:10:00+08:00
定位技術RPG《變形金剛聯盟》正式於日本推出在現實世界中召喚 ...
https://gnn.gamer.com.tw/detail.php?sn=235377
SN: 235377
commentAmount: 16
likeAmount: 2
publishedDate: 2022-07-27T12:09:01+08:00
《R2M：重燃戰火》今正式上線承襲《R2》經典玩法「據點攻城戰 ...
https://gnn.gamer.com.tw/detail.php?sn=235317
SN: 235317
commentAmount: 17
likeAmount: 2
publishedDate: 2022-07-26T12:54:35+08:00
《R2M：重燃戰火》正式上市倒數1 天提前開放事前下載《R2 Mobile ...
https://gnn.gamer.com.tw/detail.php?sn=235279
SN: 235279
commentAmount: 8
likeAmount: 0
publishedDate: 2022-07-25T

WIT STUDIO《花園裡的吸血鬼》預定2022 年Netflix 獨佔推出 ...
https://gnn.gamer.com.tw/detail.php?sn=223656
SN: 223656
commentAmount: 3
likeAmount: 20
publishedDate: 2021-11-09T12:02:46+08:00
樂團敘事遊戲《We are OFK》預計2022 推出描繪4 位好友在音樂生活 ...
https://gnn.gamer.com.tw/detail.php?sn=223067
SN: 223067
commentAmount: 2
likeAmount: 8
publishedDate: 2021-10-28T11:23:50+08:00
《DJMax Respect V》將推出「NEXON DLC」 今起釋出《<b>天翼之鍊</b> ...
https://gnn.gamer.com.tw/detail.php?sn=222941
SN: 222941
commentAmount: 44
likeAmount: 13
publishedDate: 2021-10-25T19:18:40+08:00
恐怖遊戲《霧雨飄零之森》將推出重製版預計2022 年問世《Forest of ...
https://gnn.gamer.com.tw/detail.php?sn=222285
SN: 222285
commentAmount: 8
likeAmount: 15
publishedDate: 2021-10-12T17:59:27+08:00
【TGS 21】影之刃系列新作《影之刃：斷罪者》正式亮相預計2022 ...
https://gnn.gamer.com.tw/detail.php?sn=221932
SN: 221932
commentAmount: 36
likeAmount: 27
publishedDate: 2021-10-04T10:40:45+08:00
融入劇情導向與音樂雷射光戰鬥新作《奇妙逃脫》正式推出《The ...
https://gnn.gamer.com.tw/detail.php?sn=220835
SN: 220835
commentAmount: 1
likeAmount: 6
pub

【G  2019】NEXON 14 年來將首度缺席韓國最大遊戲展G-STAR - 巴 ...
https://gnn.gamer.com.tw/detail.php?sn=184099
SN: 184099
commentAmount: 68
likeAmount: 1
publishedDate: 2019-08-12T18:55:47+08:00
《<b>天翼之鍊</b>》韓版今日改版推出EPISODE 4《變奏》第一章開場動畫 ...
https://gnn.gamer.com.tw/detail.php?sn=182856
SN: 182856
commentAmount: 121
likeAmount: 2
publishedDate: 2019-07-18T16:12:18+08:00
【電玩瘋】2019 下半年十大期待手機遊戲- 巴哈姆特
https://gnn.gamer.com.tw/detail.php?sn=182646
SN: 182646
commentAmount: 59
likeAmount: 0
publishedDate: 2019-07-14T12:00:06+08:00
《<b>天翼之鍊</b>M》揭露更多遊戲細節上市時程仍未確定 ...
https://gnn.gamer.com.tw/detail.php?sn=181777
SN: 181777
commentAmount: 77
likeAmount: 5
publishedDate: 2019-06-27T18:29:14+08:00
《<b>天翼之鍊</b>》韓版公開EPISODE 4《變奏》宣傳影片預告《<b>天翼之鍊</b> ...
https://gnn.gamer.com.tw/detail.php?sn=181421
SN: 181421
commentAmount: 164
likeAmount: 0
publishedDate: 2019-06-20T16:55:41+08:00


In [49]:
# Number of article rows collected by the crawl loop.
print(len(data))

97


In [50]:
print(offSet)

9


In [52]:
df = pd.DataFrame(data=data, columns=columns)

In [53]:
df

Unnamed: 0,title,like_amount,comment_amount,published_date,article_url
0,《靈魂潮汐》新版本「永恆的圓舞曲」上線人偶「蕖靈」登場《Soul ...,6,1,2022-07-27T16:08:16+08:00,https://gnn.gamer.com.tw/detail.php?sn=235402
1,《天堂2 M》推出「壺精融合系統」及多項活動《Lineage 2M》 - 巴哈 ...,1,7,2022-07-27T13:01:46+08:00,https://gnn.gamer.com.tw/detail.php?sn=235391
2,《Valor Legends：Idle RPG》在日本推出夏日祭限時活動即將開幕 ...,1,0,2022-07-27T17:10:00+08:00,https://gnn.gamer.com.tw/detail.php?sn=235421
3,定位技術RPG《變形金剛聯盟》正式於日本推出在現實世界中召喚 ...,2,16,2022-07-27T12:09:01+08:00,https://gnn.gamer.com.tw/detail.php?sn=235377
4,《R2M：重燃戰火》今正式上線承襲《R2》經典玩法「據點攻城戰 ...,2,17,2022-07-26T12:54:35+08:00,https://gnn.gamer.com.tw/detail.php?sn=235317
...,...,...,...,...,...
92,【G 2019】NEXON 14 年來將首度缺席韓國最大遊戲展G-STAR - 巴 ...,1,68,2019-08-12T18:55:47+08:00,https://gnn.gamer.com.tw/detail.php?sn=184099
93,《<b>天翼之鍊</b>》韓版今日改版推出EPISODE 4《變奏》第一章開場動畫 ...,2,121,2019-07-18T16:12:18+08:00,https://gnn.gamer.com.tw/detail.php?sn=182856
94,【電玩瘋】2019 下半年十大期待手機遊戲- 巴哈姆特,0,59,2019-07-14T12:00:06+08:00,https://gnn.gamer.com.tw/detail.php?sn=182646
95,《<b>天翼之鍊</b>M》揭露更多遊戲細節上市時程仍未確定 ...,5,77,2019-06-27T18:29:14+08:00,https://gnn.gamer.com.tw/detail.php?sn=181777


In [56]:
# Write the collected rows to a CSV file; "utf-8-sig" prepends a BOM.
# NOTE(review): the file name is a different game title than the
# queryKeyWord ("天翼之鍊") used by the crawl cell — confirm which is intended.
outputPath = "./{}.csv".format("RO 仙境傳說：愛如初見")
df.to_csv(outputPath, index=False, encoding="utf-8-sig")

# TEST

In [4]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

In [17]:
# Query parameters for the CSE endpoint, one "key: value" pair per line.
paramsStr = """rsz: 10
num: 10
hl: zh-TW
source: gcsc
gss: .tw
start: 30
cselibv: 3e1664f444e6eb06
cx: partner-pub-9012069346306566:kd3hd85io9c
q: 天翼之鍊
safe: active
cse_tok: AB1-RNUPMMopHsk5zSF1fmGiJVHz:1658890650200
sort: 
exp: csqr,cc
rsToken: undefined
afsExperimentId: undefined
callback: google.search.cse.api11609"""

# Split each line on its FIRST colon only: values such as cx and cse_tok
# contain colons themselves, and an empty value ("sort:") must map to "".
params = {
    key.strip(): value.strip()
    for key, _, value in (line.partition(":") for line in paramsStr.splitlines())
}

In [18]:
"""
https://cse.google.com/cse/element/v1?rsz=10&num=10&hl=zh-TW&source=gcsc&gss=.tw&start=20&cselibv=3e1664f444e6eb06&cx=partner-pub-9012069346306566:kd3hd85io9c&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&safe=active&cse_tok=AB1-RNUPMMopHsk5zSF1fmGiJVHz:1658890650200&sort=&exp=csqr,cc&rsToken=undefined&afsExperimentId=undefined&callback=google.search.cse.api3968
"""
url = "https://cse.google.com/cse/element/v1"

In [19]:
res = requests.get(url, params=params, headers=headers)
res

<Response [200]>

In [46]:
params

{'rsz': '10',
 'num': '10',
 'hl': 'zh-TW',
 'source': 'gcsc',
 'gss': '.tw',
 'start': '40',
 'cselibv': '3e1664f444e6eb06',
 'cx': 'partner-pub-9012069346306566:kd3hd85io9c',
 'q': '天翼之鍊',
 'safe': 'active',
 'cse_tok': 'AB1-RNUPMMopHsk5zSF1fmGiJVHz:1658890650200',
 'sort': '',
 'exp': 'csqr,cc',
 'rsToken': 'undefined',
 'afsExperimentId': 'undefined',
 'callback': 'google.search.cse.api11609'}

In [20]:
print(res.text)

/*O_o*/
google.search.cse.api11609({
  "cursor": {
    "currentPageIndex": 3,
    "estimatedResultCount": "8020",
    "moreResultsUrl": "http://www.google.com/cse?oe=utf8&ie=utf8&source=uds&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&safe=active&start=30&sort=&cx=partner-pub-9012069346306566:kd3hd85io9c",
    "resultCount": "8,020",
    "searchResultTime": "0.37",
    "pages": [
      {
        "label": 1,
        "start": "0"
      },
      {
        "label": 2,
        "start": "10"
      },
      {
        "label": 3,
        "start": "20"
      },
      {
        "label": 4,
        "start": "30"
      },
      {
        "label": 5,
        "start": "40"
      },
      {
        "label": 6,
        "start": "50"
      },
      {
        "label": 7,
        "start": "60"
      },
      {
        "label": 8,
        "start": "70"
      },
      {
        "label": 9,
        "start": "80"
      },
      {
        "label": 10,
        "start": "90"
      }
    ]
  },
  "context": {
    "tit

In [21]:
params["start"] = "40"
res = requests.get(url, params=params, headers=headers)
res

<Response [200]>

In [22]:
print(res.text)

/*O_o*/
google.search.cse.api11609({
  "cursor": {
    "currentPageIndex": 4,
    "estimatedResultCount": "8020",
    "moreResultsUrl": "http://www.google.com/cse?oe=utf8&ie=utf8&source=uds&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&safe=active&start=40&sort=&cx=partner-pub-9012069346306566:kd3hd85io9c",
    "resultCount": "8,020",
    "searchResultTime": "0.36",
    "pages": [
      {
        "label": 1,
        "start": "0"
      },
      {
        "label": 2,
        "start": "10"
      },
      {
        "label": 3,
        "start": "20"
      },
      {
        "label": 4,
        "start": "30"
      },
      {
        "label": 5,
        "start": "40"
      },
      {
        "label": 6,
        "start": "50"
      },
      {
        "label": 7,
        "start": "60"
      },
      {
        "label": 8,
        "start": "70"
      },
      {
        "label": 9,
        "start": "80"
      },
      {
        "label": 10,
        "start": "90"
      }
    ]
  },
  "context": {
    "tit

In [45]:
import re

print(re.sub(r'/\*O_o\*/[\r\n]+google.search.cse.api[0-9]+\(', '', res.text)[:-2])

{
  "cursor": {
    "currentPageIndex": 4,
    "estimatedResultCount": "8020",
    "moreResultsUrl": "http://www.google.com/cse?oe=utf8&ie=utf8&source=uds&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&safe=active&start=40&sort=&cx=partner-pub-9012069346306566:kd3hd85io9c",
    "resultCount": "8,020",
    "searchResultTime": "0.36",
    "pages": [
      {
        "label": 1,
        "start": "0"
      },
      {
        "label": 2,
        "start": "10"
      },
      {
        "label": 3,
        "start": "20"
      },
      {
        "label": 4,
        "start": "30"
      },
      {
        "label": 5,
        "start": "40"
      },
      {
        "label": 6,
        "start": "50"
      },
      {
        "label": 7,
        "start": "60"
      },
      {
        "label": 8,
        "start": "70"
      },
      {
        "label": 9,
        "start": "80"
      },
      {
        "label": 10,
        "start": "90"
      }
    ]
  },
  "context": {
    "title": "巴哈姆特",
    "total_results": "

In [59]:
# The notebook's import cell only brings in `unquote`, so `urlparse` has to
# be imported here for this cell to run.
from urllib.parse import urlparse

# Split the Google search URL into its components.
urlparse("https://www.google.com/search?client=ms-google-coop&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&cx=partner-pub-9012069346306566:kd3hd85io9c")

ParseResult(scheme='https', netloc='www.google.com', path='/search', params='', query='client=ms-google-coop&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&cx=partner-pub-9012069346306566:kd3hd85io9c', fragment='')

In [61]:
unquote("https://www.google.com/search?client=ms-google-coop&q=%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A&cx=partner-pub-9012069346306566:kd3hd85io9c")

'https://www.google.com/search?client=ms-google-coop&q=天翼之鍊&cx=partner-pub-9012069346306566:kd3hd85io9c'

In [108]:
url = "https://gnn.gamer.com.tw/detail.php?sn=235369"
res = requests.get(url, headers=headers)
res

<Response [200]>

In [109]:
print(res.text)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#">
<head>
<meta name="viewport" content="width=1250">
<meta charset="UTF-8">
<title>《夢魘騎士團 - REMASTER 版 -》於手機平台推出 可免費遊玩至第 4 章《ナイツ・イン・ザ・ナイトメア-REMASTER版-》 - 巴哈姆特</title>

<meta property="og:site_name" content="巴哈姆特電玩資訊站">
<meta property="fb:app_id" content="668497826514848">
<meta name="description" content="Sting 宣布，《夢魘騎士團 - REMASTER 版 -（ナイツ・イン・ザ・ナイトメア-REMASTER 版-）》已於 App Store 及 Google Play 推出。 本作是 2008 年在 Nintendo DS 推出的《夢魘騎士團》的 Remaster 版，是一款融入即時戰鬥的緊湊感及多樣戰略性的「動態戰術角色扮演遊戲」，遊戲不僅畫質提昇，同時大幅度修改、強化、追加遊戲操作介面。本作能免費體驗">
<meta property="og:description" content="Sting 宣布，《夢魘騎士團 - REMASTER 版 -（ナイツ・イン・ザ・ナイトメア-REMASTER 版-）》已於 App Store 及 Google Play 推出。 本作是 2008 年在 Nintendo DS 推出的《夢魘騎士團》的 Remaster 版，是一款融入即時戰鬥的緊湊感及多樣戰略性的「動態戰術角色扮演遊戲」，遊戲不僅畫質提昇，同時大幅度修改、強化、追加遊戲操作介面。本作能免費體驗"/>
<link href="https://i2.bahamut.com.tw/cs

In [66]:
soup = BeautifulSoup(res.text, 'html.parser')
soup.select('span')

[<span class="nav-search-close-button" onclick="TopBar.hideSearchBar();" tabindex="0">✕</span>,
 <span class="GN-lbox3C">（GNN 記者 Jisho 報導） 2022-07-27 11:22:58</span>,
 <span></span>,
 <span>2022/03/18</span>,
 <span>2021/06/24</span>]

In [113]:
pprint.pprint(json.loads(soup.select_one('script[type="application/ld+json"]').text))

[{'@context': 'http://schema.org',
  '@type': 'WebSite',
  'alternateName': '巴哈姆特',
  'name': '巴哈姆特電玩資訊站',
  'url': 'https://www.gamer.com.tw/'},
 {'@context': 'http://schema.org',
  '@type': 'BreadcrumbList',
  'itemListElement': [{'@type': 'ListItem',
                       'item': {'@id': 'https://gnn.gamer.com.tw/',
                                'name': 'GNN 新聞'},
                       'position': 1},
                      {'@type': 'ListItem',
                       'item': {'@id': 'https://gnn.gamer.com.tw/detail.php?sn=235369',
                                'name': '《夢魘騎士團 - REMASTER 版 -》於手機平台推出 可免費遊玩至第 '
                                        '4 章'},
                       'position': 2}]},
 {'@context': 'http://schema.org',
  '@type': 'NewsArticle',
  'author': {'@type': 'Person', 'name': 'Jisho'},
  'dateModified': '2022-07-27T11:31:21+08:00',
  'datePublished': '2022-07-27T11:22:58+08:00',
  'description': 'Sting 宣布，《夢魘騎士團 - REMASTER 版 -（ナイツ・イン・ザ・ナイトメア-REMASTER '
     

In [114]:
for jsonData in json.loads(soup.select_one('script[type="application/ld+json"]').text):
    if 'datePublished' in jsonData:
        print(jsonData['datePublished'])

2022-07-27T11:22:58+08:00


In [67]:
soup.select('div[class="shareandmore"]')

[<div class="shareandmore">
 <div class="baha_btn-biglike" style="display: none;">
 <div class="likebtn-img" onclick="WallGp.like('235369', 'https://gnn.gamer.com.tw/detail.php?sn=235369');">
 <div class="backpic"></div>
 </div>
 <p><span></span> 個人說推</p>
 </div>
 <div class="sharebtn-container">
 <!-- 推與分享按鈕 開始 -->
 <div class="community_sharebtn"><ul>
 <li data-to="wall" data-url="https://wall.gamer.com.tw/share.php?url=https%3A%2F%2Fgnn.gamer.com.tw%2Fdetail.php%3Fsn%3D235369">
 <img class="share-icon icon-baha" src="https://i2.bahamut.com.tw/icon/share-icon_bh.svg"/>
 </li>
 <li data-to="fb" data-url="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fgnn.gamer.com.tw%2Fdetail.php%3Fsn%3D235369">
 <img class="share-icon icon-fb" src="https://i2.bahamut.com.tw/icon/share-icon_fb.svg"/>
 </li>
 <li data-to="line" data-url="https://social-plugins.line.me/lineit/share?url=https%3A%2F%2Fgnn.gamer.com.tw%2Fdetail.php%3Fsn%3D235369">
 <img class="share-icon icon-line" src="https:/

In [82]:
url = "https://gnn.gamer.com.tw/ajax/gnn-html.php?sn=235369"
res = requests.get(url, headers=headers)
res

<Response [200]>

In [83]:
print(res.json()['data']['comment'])

<p class="GN-lbox4A"><img src="https://i2.bahamut.com.tw/gnn/bit.gif" class="IMG-E25" /><a href="javascript:get_all_comment(235369)">顯示所有的 6 則評語</a></p><div class="GN-lbox6A"><a href="//home.gamer.com.tw/fredtseng1" target="_blank"><img src="https://avatar2.bahamut.com.tw/avataruserpic/f/r/fredtseng1/fredtseng1_s.png" /></a>
<p><a href="//home.gamer.com.tw/fredtseng1" target="_blank">視黑見白,惟人自縛</a>：<span class="comment-text">樓上+1</span><span>07-27 12:48:37</span></p>
<button type="button" onclick="accuse_comment(235369,'fredtseng1')">檢舉</button></div><div class="GN-lbox6A"><a href="//home.gamer.com.tw/k8990578" target="_blank"><img src="https://avatar2.bahamut.com.tw/avataruserpic/k/8/k8990578/k8990578_s.png" /></a>
<p><a href="//home.gamer.com.tw/k8990578" target="_blank">Inu汪汪</a>：<span class="comment-text">Sting 真希望能有新作啊...以前的公主同盟跟約束之地真的超讚 ! 音樂跟畫風都很優質</span><span>07-27 13:22:21</span></p>
<button type="button" onclick="accuse_comment(235369,'k8990578')">檢舉</button></div><div class="G

In [73]:
for i in BeautifulSoup(res.json()['data']['comment'], 'html.parser').select('p'):
    print(i)
    print("=====")

<p class="GN-lbox4A"><img class="IMG-E25" src="https://i2.bahamut.com.tw/gnn/bit.gif"/><a href="javascript:get_all_comment(235369)">顯示所有的 6 則評語</a></p>
=====
<p><a href="//home.gamer.com.tw/fredtseng1" target="_blank">視黑見白,惟人自縛</a>：<span class="comment-text">樓上+1</span><span>07-27 12:48:37</span></p>
=====
<p><a href="//home.gamer.com.tw/k8990578" target="_blank">Inu汪汪</a>：<span class="comment-text">Sting 真希望能有新作啊...以前的公主同盟跟約束之地真的超讚 ! 音樂跟畫風都很優質</span><span>07-27 13:22:21</span></p>
=====
<p><a href="//home.gamer.com.tw/s8913s" target="_blank">天使的左手惡魔的右手</a>：<span class="comment-text">玩不到就不用說好不好了，而且還沒中文</span><span>07-27 13:41:52</span></p>
=====
<p><a href="//home.gamer.com.tw/ntes1314" target="_blank">嫣紅</a>：<span class="comment-text">玩不到跟沒中文只是你的問題而已</span><span>07-27 14:29:35</span></p>
=====
<p><a href="//home.gamer.com.tw/lili1717" target="_blank">真他媽的有夠難勁爆</a>：<span class="comment-text">台灣還不能下載</span><span>07-27 14:31:09</span></p>
=====
<p class="GN-lbox6B"><input id="comment_inp

In [87]:
# Read the comment count from the "show all N comments" link, which is the
# first <p> of the comment HTML. The parser is named explicitly, as in the
# neighbouring cells, so the result does not depend on which optional parser
# is installed.
commentHtml = res.json()['data']['comment']
firstParagraphText = BeautifulSoup(commentHtml, 'html.parser').select_one('p').text
commentAmountStr = firstParagraphText.replace('顯示所有的', '').replace('則評語', '')
int(commentAmountStr)

6

In [42]:
url = "https://wall.gamer.com.tw/api/link_post.php"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "referer": "https://gnn.gamer.com.tw/detail.php?sn=235369",
    "cookie": "_gid=GA1.3.800346094.1658831722; __gads=ID=f03b6a46efed39d5:T=1658831737:S=ALNI_MZ0uW5i3TTNa5riIFAdxi5W1eSdhQ; buap_modr=p014; ckForumListOrder=post; ckAPP_VCODE=1328; ckBahamutCsrfToken=fa82599a4bbb0ff1; __gpi=UID=000008175adc3c2a:T=1658831737:RT=1658887731:S=ALNI_MbDpsVR6L5mfXM2uzY9StYVx2wcfQ; ckBahaAd=0------------------------; ckBH_lastBoard=[[%225191%22%2C%22%E6%96%B0%E5%A4%A9%E7%BF%BC%E4%B9%8B%E9%8D%8A%EF%BC%88TalesWeaver%EF%BC%89%22]%2C[%227650%22%2C%22%E6%96%B0%E6%A5%93%E4%B9%8B%E8%B0%B7%22]]; buap_puoo=p301%20p103; _ga=GA1.1.1951735684.1658831722; _ga_2Q21791Y9D=GS1.1.1658904159.4.1.1658906035.60"
}

res = requests.get(url, headers=headers, params={"url": "https://gnn.gamer.com.tw/detail.php?sn=235369"})
res

<Response [200]>

In [43]:
print(res.text)

{"data":{"like":4,"isLike":0}}


In [90]:
res.json()['data']['like']

3

In [99]:
url = "https://gnn.gamer.com.tw/ajax/gnn-html.php?sn=234945"
res = requests.get(url, headers=headers)
res

<Response [200]>

In [100]:
print(res.text)

{"data":{"comment":"<div class=\"GN-lbox6A\"><a href=\"\/\/home.gamer.com.tw\/kakaka7060\" target=\"_blank\"><img src=\"https:\/\/avatar2.bahamut.com.tw\/avataruserpic\/k\/a\/kakaka7060\/kakaka7060_s.png\" \/><\/a>\n<p><a href=\"\/\/home.gamer.com.tw\/kakaka7060\" target=\"_blank\">\u963fQ<\/a>\uff1a<span class=\"comment-text\">\u6211\u8001\u5a46<\/span><span>07-19 17:47:36<\/span><\/p>\n<button type=\"button\" onclick=\"accuse_comment(234945,'kakaka7060')\">\u6aa2\u8209<\/button><\/div><div id=\"reply_comment\"><\/div><p class=\"GN-lbox6B\"><input onkeypress=\"enterKey(event,234945)\" name=\"comment_input\" id=\"comment_input\" type=\"text\" placeholder=\"\u767c\u8868\u65b0\u805e\u8a55\u8a9e...\uff08\u965050\u5b57\uff09\"><button type=\"button\" onclick=\"comment_ck(234945)\" id=\"comment_btn\">\u767c\u8868<\/button><\/p>"}}


In [101]:
print(res.json()['data']['comment'])

<div class="GN-lbox6A"><a href="//home.gamer.com.tw/kakaka7060" target="_blank"><img src="https://avatar2.bahamut.com.tw/avataruserpic/k/a/kakaka7060/kakaka7060_s.png" /></a>
<p><a href="//home.gamer.com.tw/kakaka7060" target="_blank">阿Q</a>：<span class="comment-text">我老婆</span><span>07-19 17:47:36</span></p>
<button type="button" onclick="accuse_comment(234945,'kakaka7060')">檢舉</button></div><div id="reply_comment"></div><p class="GN-lbox6B"><input onkeypress="enterKey(event,234945)" name="comment_input" id="comment_input" type="text" placeholder="發表新聞評語...（限50字）"><button type="button" onclick="comment_ck(234945)" id="comment_btn">發表</button></p>


In [104]:
BeautifulSoup(res.json()['data']['comment'], 'html.parser').select('p').__len__()

2

In [7]:
url = "https://cse.google.com/cse.js?cx=partner-pub-9012069346306566:kd3hd85io9c"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

res = requests.get(url, headers=headers)
res

<Response [200]>

In [8]:
res.text

'(function(opts_){/*\n\n Copyright The Closure Library Authors.\n SPDX-License-Identifier: Apache-2.0\n*/\nvar g=this||self,k=function(a,b){function d(){}d.prototype=b.prototype;a.l=b.prototype;a.prototype=new d;a.prototype.constructor=a;a.j=function(e,c,f){for(var h=Array(arguments.length-2),q=2;q<arguments.length;q++)h[q-2]=arguments[q];return b.prototype[c].apply(e,h)}},l=function(a){return a};function m(a){if(Error.captureStackTrace)Error.captureStackTrace(this,m);else{var b=Error().stack;b&&(this.stack=b)}a&&(this.message=String(a))}k(m,Error);m.prototype.name="CustomError";var n=function(a,b){a=a.split("%s");for(var d="",e=a.length-1,c=0;c<e;c++)d+=a[c]+(c<b.length?b[c]:"%s");m.call(this,d+a[e])};k(n,m);n.prototype.name="AssertionError";var p=function(a,b,d){if(!a){var e="Assertion failed";if(b){e+=": "+b;var c=Array.prototype.slice.call(arguments,2)}throw new n(""+e,c||[]);}},r=function(a,b){throw new n("Failure"+(a?": "+a:""),Array.prototype.slice.call(arguments,1));};var t;var

In [31]:
m = re.search(r'"cse_token":[\s]* "[0-9a-zA-Z\-\_:]*"', res.text)
json.loads("{" + m.group(0) + "}")["cse_token"]

'AB1-RNVrCdTid_v-gQcZc0d6Liof:1658976420680'

In [32]:
m = re.search(r'"cselibVersion":[\s]* "[0-9a-zA-Z\-\_:]*"', res.text)
json.loads("{" + m.group(0) + "}")["cselibVersion"]

'3e1664f444e6eb06'