In [99]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup


## 明星主页

In [151]:
def scrape_starpage(cookie: str, timeout: float = 30):
    """Scrape the monthly sunflower leaderboard of every star page.

    One page is fetched per entry in ``names`` (``uid`` 1..8, in list order).
    From each page the function reads the running monthly total and the
    rank / fan name / sunflower count triples shown on the board.

    Parameters
    ----------
    cookie : str
        Raw ``Cookie`` header value of a logged-in session.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot hang
        the notebook forever.

    Returns
    -------
    total_number : list of str
        Monthly total per page, in the same order as ``names``.
    result : pandas.DataFrame
        One row per rank 1..100 with ``<name>_name`` and ``<name>_sunflower``
        columns for every page.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status.
    IndexError
        If the expected ``unlock`` or ``board`` element is missing from a page.
    """
    names = ['tnt', 'mjq', 'dcx', 'syx', 'lyw', 'zzy', 'yhx', 'hjl']
    headers = {'Cookie': cookie}

    result = pd.DataFrame({'rank': list(range(1, 101))})
    total_number = []

    # One session for all pages: reuses the connection and is closed on exit.
    with requests.Session() as session:
        for uid, name in enumerate(names, start=1):
            # url for every page
            url = 'https://www.tfent.cn/TYFansClub/pageRank.html?uid=' + str(uid)
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Total number of sunflowers this month so far. The first page
            # uses a bare "unlock" class, the others "unlock <name>".
            unlock_class = 'unlock' if uid == 1 else 'unlock ' + name
            total = soup.find_all('div', {'class': unlock_class})
            if not total:
                raise IndexError(
                    "page %s: no <div class=%r> found (expired cookie?)" % (name, unlock_class)
                )
            total_number.append(total[0].find('p').get_text().split("：")[1])

            # Top-100 contributions: the board yields a flat sequence of
            # rank, name, value, rank, name, value, ...
            boards = soup.find_all('div', {'class': 'board'})
            if not boards:
                raise IndexError("page %s: no <div class='board'> found" % name)
            cells = [
                tag.get_text()
                for tag in boards[0].find_all('div', {'class': ['rank-number', 'name', 'value']})
            ]

            columns = ["rank", name + "_name", name + "_sunflower"]
            # value[:-3] drops the last three characters of the count text
            # (presumably a unit suffix -- TODO confirm against the page).
            rows = [
                (int(rank_text), fan, value[:-3])
                for rank_text, fan, value in zip(cells[0::3], cells[1::3], cells[2::3])
            ]
            temp = pd.DataFrame(rows, columns=columns).astype({'rank': 'int64'})

            # Left join keeps all 100 ranks even when a board is shorter; an
            # inner join would truncate every other page's columns as well.
            result = pd.merge(result, temp, on="rank", how="left")
            print("Page", name, "is scraped.")
    return total_number, result



In [152]:
# The session cookie is a credential, so it must not be committed with the
# notebook. Export it before starting Jupyter, e.g.
#   export TFENT_COOKIE="PHPSESSID=...; UM_distinctid=...; ..."
cookie = os.environ.get("TFENT_COOKIE", "")
if not cookie:
    raise RuntimeError(
        "Set the TFENT_COOKIE environment variable to the Cookie header of a "
        "logged-in www.tfent.cn session."
    )
total_number, result = scrape_starpage(cookie)



Page tnt is scraped.
Page mjq is scraped.
Page dcx is scraped.
Page syx is scraped.
Page lyw is scraped.
Page zzy is scraped.
Page yhx is scraped.
Page hjl is scraped.


In [153]:
def save_starpage(folder, result):
    """Write a leaderboard snapshot to a timestamped CSV inside ``folder``.

    The file is named ``starPage_<YYYY_MM_DD_HH_MM>.csv`` from the local time
    of the call, so two calls within the same minute overwrite each other.

    Parameters
    ----------
    folder : str
        Target directory. It is created (including parents) when missing.
        An empty string writes into the current working directory.
    result : pandas.DataFrame
        Table to store; written without its index, UTF-8 encoded.
    """
    today = datetime.today().strftime('%Y_%m_%d_%H_%M')

    # DataFrame.to_csv does not create missing directories.
    if folder:
        os.makedirs(folder, exist_ok=True)

    filename = os.path.join(folder, "starPage_" + today + '.csv')
    result.to_csv(filename, index=False, encoding='utf-8')
    return

# Archive the freshly scraped leaderboard in this month's snapshot folder.
save_starpage(folder="二月小葵明星主页", result=result)


In [165]:
# Total number of sunflowers

def save_total(filename, total_number):
    """Append one timestamped row of per-page sunflower totals to a CSV log.

    The log is created with the columns ``date, time, tnt, mjq, dcx, syx,
    lyw, zzy, yhx, hjl`` when ``filename`` is missing or completely empty;
    otherwise the columns already present in the file are reused.

    Parameters
    ----------
    filename : str
        Path of the CSV log to extend (rewritten in full on every call).
    total_number : iterable
        One total per page column, in column order.

    Raises
    ------
    ValueError
        If the number of totals plus the two timestamp fields does not match
        the number of columns of the log.
    """
    try:
        total = pd.read_csv(filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        total = pd.DataFrame(columns = ['date', 'time', 'tnt', 'mjq', 'dcx', 'syx', 'lyw', 'zzy', 'yhx', 'hjl'])

    # Sample the clock once so the date and time fields always agree.
    now = datetime.today()
    row = [now.strftime('%m/%d/%y'), now.strftime('%H:%M'), *total_number]
    row = pd.DataFrame([row], columns = total.columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    # An empty log is replaced by the row directly, which avoids concatenating
    # an all-empty frame.
    total = row if total.empty else pd.concat([total, row], ignore_index=True)
    total.to_csv(filename, index=False)
    return
    
    
# Log the totals just scraped into this month's running CSV.
save_total(filename='totalSunflower_2021年2月.csv', total_number=total_number)

## 留言板

In [166]:
# May take days: one HTTP request is made per message-board page.
def scrape_comments(cookie: str, first_page: int = 1, last_page: int = 21144,
                    output_path: str = 'comments.csv', timeout: float = 30):
    """Scrape the fan-club message board and store every message in a CSV.

    Parameters
    ----------
    cookie : str
        Raw ``Cookie`` header value of a logged-in session.
    first_page, last_page : int, optional
        Inclusive page range to fetch. The default upper bound is the page
        count at the time the notebook was written and needs updating as the
        board grows.
    output_path : str, optional
        CSV file to write (with the row index as its first column).
    timeout : float, optional
        Per-request timeout in seconds.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status.
    IndexError
        If a page has no ``messageList`` element or a message lacks one of
        the expected fields.
    """
    headers = {
    'Cookie': cookie
    }
    columns = ['user', 'time', 'title', 'content', 'likes', 'replies']

    # Collect plain rows and build the frame once at the end: growing a
    # DataFrame per message is quadratic, and DataFrame.append no longer
    # exists in pandas 2.0.
    records = []

    # One session for the whole crawl: reuses the connection, closed on exit.
    with requests.Session() as session:
        for page in range(first_page, last_page + 1):
            url = 'https://www.tfent.cn/TYFansClub/messages?page=' + str(page)
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            message_lists = soup.find_all('ul', {'class': 'messageList'})
            if not message_lists:
                raise IndexError(
                    "page %d: no <ul class='messageList'> found "
                    "(expired cookie or past the last page?)" % page
                )

            for m in message_lists[0].find_all('div', {'class':['info']}):
                user = m.find_all('h3', {'class':['name lf']})[0].get_text()
                posted_at = m.find_all('span', {'class':['time lf']})[0].get_text()
                title = m.find_all('h3', {'class':['tit']})[0].get_text()
                content = m.find_all('p')[0].get_text()
                likes = int(m.find_all('a', {'class':['tp']})[0].get_text())
                replies = int(m.find_all('a', {'class':['cm']})[0].get_text())
                records.append([user, posted_at, title, content, likes, replies])

            if page % 50 == 0:
                print(page, 'pages scraped.')

    comments = pd.DataFrame(records, columns=columns)
    comments.to_csv(output_path)
    return

In [98]:
# No earlier cell defines `comments` (scrape_comments only writes
# comments.csv), so load the saved scrape before previewing its last rows.
comments = pd.read_csv('comments.csv')
comments.tail()

Unnamed: 0,user,time,title,content,likes,replies
105715,长风拂棋,2018/10/09 22:03,to祺祺，程程，tyt,小朋友们加油啊，姐姐一直支持你们哦，姐姐还把两个从不追星的闺蜜拉进来了，都是你们太优秀啦，群...,1,0
105716,狼崽的猪猪是...,2018/10/09 21:58,狼崽,鹅子，晚安，早点休息，坚持，加油，你会越来越好的，不要有太大的压力，加油，麻麻爱你,0,0
105717,一文酱紫,2018/10/09 16:00,To TYT,你们都是特别的存在，都给我好好加油！TYT冲啊！,1,0
105718,我不是葫芦娃,2018/10/09 12:52,To: 嘉祺,还有小宝现在在长身体不要驼背哦。,0,0
105719,我不是葫芦娃,2018/10/09 12:51,To: 嘉祺,每天都要开开心心的呀! ! !,0,0


references:
1. https://www.cnblogs.com/sanduzxcvbnm/p/10276583.html
2. https://netnewpower.net/?p=45