In [11]:
# -*- coding: utf-8 -*-
import os
import time

import pandas as pd
import requests
class WeChatArticlesInfo(object):
    """
    Fetch article metadata of a WeChat official account through the
    mp.weixin.qq.com endpoints configured in __init__.

    The caller-supplied cookie is sent as a request header and the token as a
    query parameter on every request.
    """
    # Column names (and order) of the DataFrame returned by get_articles_info.
    COLUMNS = ['aid', 'title', 'cover_url', 'abstract', 'url', 'time']
    INFO_KEY = "app_msg_list"
    COUNT_KEY = "app_msg_cnt"
    # Number of articles requested per HTTP call by get_articles_info.
    PAGE_SIZE = 5
    # Output column name -> key holding that value in an API article record.
    FIELD_MAP = {'aid': 'aid',
                 'title': 'title',
                 'cover_url': 'cover',
                 'abstract': 'digest',
                 'url': 'link',
                 'time': 'update_time'}

    def __init__(self, cookie:str, token:str, timeout:float=10.0):
        """
        Parameter:
            cookie: str, sent as the "Cookie" header of every request
            token: str, sent as the "token" query parameter of every request
            timeout: float, seconds to wait for each HTTP response
        """
        self.sess = requests.session()
        self.search_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz"
        self.appmsg_url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
        self.headers = {
            "Cookie": cookie,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
        }
        # Query parameters shared by all requests; never mutated after this point.
        self.params = {
            "lang": "zh_CN",
            "f": "json",
            "token": token
        }
        self.timeout = timeout
        # Cache: account name -> fakeid, filled on the first lookup of a name.
        self.fakeid = {}

    def _request(self, url:str, extra_params:dict):
        """
        Send one GET request and return the decoded JSON body.

        self.params is copied instead of updated in place so the parameters of
        one request can never leak into a later one.
        """
        params = dict(self.params)
        params.update(extra_params)
        response = self.sess.get(url,
                                 headers=self.headers,
                                 params=params,
                                 timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def get_basic_info(self, name:str, begin:int=0, count:int=5):
        """
        Search for official accounts by name. Mainly used to obtain the
        fakeid needed to list an account's articles.
        Parameter:
            name: str, account name, as exact as possible
            begin: int, start offset sent to the search endpoint
            count: int, number of results, 1-5

        Return:
            list, the "list" entry of the search response
        Raise:
            KeyError, when the response has no "list" entry
        """
        # The name must be exact: callers take the first search hit, so a
        # fuzzy name may resolve to the wrong account.
        payload = self._request(self.search_url, {
            "query": name,
            "count": str(count),
            "action": "search_biz",
            "ajax": "1",
            "begin": str(begin)
        })
        if "list" not in payload:
            raise KeyError("search response has no 'list' entry: {!r}".format(payload))
        return payload["list"]

    def __get_articles_info(self, name:str, begin:int, count:int=5):
        """
        Fetch one page of raw article data for an account; each record
        carries the article url (key: link) usable for later crawling.
        Parameters:
            name: str, account name, as exact as possible
            begin: int, start offset sent to the article endpoint
            count: int, number of articles, 1-5
        Returns
            json
            important keys:
            "app_msg_list": article records
            "app_msg_cnt": total number of articles
        Raise:
            IndexError, when no account matches `name`
        """
        fakeid = self.fakeid.get(name)
        if fakeid is None:
            matches = self.get_basic_info(name)
            if not matches:
                raise IndexError("no official account found for name {!r}".format(name))
            fakeid = matches[0]["fakeid"]
            self.fakeid[name] = fakeid
        return self._request(self.appmsg_url, {
            "fakeid": fakeid,
            "query": "",
            "begin": str(begin),
            "count": str(count),
            "type": "9",
            "action": "list_ex",
            # Previously inherited from the preceding search request through
            # the shared params dict; now sent explicitly.
            "ajax": "1"
        })

    def get_articles_info_step(self, name:str, begin:int=0, count:int=5):
        """
        Fetch one page of articles and keep only the fields in FIELD_MAP.
        Parameter:
            name: str, account name, as exact as possible
            begin: int, start offset sent to the article endpoint
            count: int, number of articles, 1-5

        Return:
            list of dict, one per article, keyed by the names in COLUMNS
        Raise:
            KeyError, when the response has no article list
        """
        payload = self.__get_articles_info(name=name, begin=begin, count=count)
        if WeChatArticlesInfo.INFO_KEY not in payload:
            raise KeyError("article response has no {!r} entry: {!r}".format(
                WeChatArticlesInfo.INFO_KEY, payload))
        field_map = WeChatArticlesInfo.FIELD_MAP
        return [{column: record[source] for column, source in field_map.items()}
                for record in payload[WeChatArticlesInfo.INFO_KEY]]

    def get_articles_info(self, name:str, begin:int=0, count:int=15, interval:float=0.0):
        """
        Fetch up to `count` articles, PAGE_SIZE per request, starting at `begin`.
        Parameter:
            name: str, account name, as exact as possible
            begin: int, offset of the first request; later requests advance
                   by PAGE_SIZE
            count: int, maximum number of articles to return
            interval: float, seconds to sleep between consecutive requests
                      (0 disables the pause)

        Return:
            pd.DataFrame with the columns listed in COLUMNS (also when empty)
        """
        page_size = WeChatArticlesInfo.PAGE_SIZE
        records = []
        for index, offset in enumerate(range(begin, begin + count, page_size)):
            if index and interval > 0:
                time.sleep(interval)
            page = self.get_articles_info_step(name, offset, page_size)
            if not page:
                # An empty page means there is nothing further to fetch.
                break
            records.extend(page)
        # Whole pages are fetched, so trim the overshoot of the last page.
        return pd.DataFrame(records[:count], columns=WeChatArticlesInfo.COLUMNS)

# Credentials are read from the environment so that live session secrets are
# never stored in (or committed with) the notebook. Set WECHAT_MP_COOKIE and
# WECHAT_MP_TOKEN before starting the kernel; both fall back to an empty
# string when unset, in which case the requests below will not be authorized.
cookie = os.environ.get("WECHAT_MP_COOKIE", "")
token = os.environ.get("WECHAT_MP_TOKEN", "")

In [12]:
# Build the client from the credentials defined in the previous cell.
wechatarticle = WeChatArticlesInfo(cookie, token)

In [13]:
# Fetch 20 articles of the account, starting from the first one.
result = wechatarticle.get_articles_info(name="清华大学", begin=0, count=20)

In [14]:
# Preview the first five fetched rows.
result.head(n=5)

Unnamed: 0,aid,title,cover_url,abstract,url,time
0,2659222162_1,早知道,https://mmbiz.qlogo.cn/mmbiz_jpg/HhoEMZZMsiaRk...,,http://mp.weixin.qq.com/s?__biz=MzA4OTIyMzgxMw...,1602460860
1,2659222161_1,自强奋进，加快建设世界一流大学创新体系 | 清华大学第十八次科研工作讨论会闭幕,https://mmbiz.qlogo.cn/mmbiz_jpg/HhoEMZZMsiaR1...,,http://mp.weixin.qq.com/s?__biz=MzA4OTIyMzgxMw...,1602422693
2,2659222118_1,全面深化研究生教育改革，全面提升高层次人才培养能力 | 清华大学举行研究生教育改革发展大会,https://mmbiz.qlogo.cn/mmbiz_jpg/HhoEMZZMsiaRk...,,http://mp.weixin.qq.com/s?__biz=MzA4OTIyMzgxMw...,1602293328
3,2659222111_1,清华秋日物语,https://mmbiz.qlogo.cn/mmbiz_jpg/HhoEMZZMsiaSP...,秋天来了，你准备好了吗？,http://mp.weixin.qq.com/s?__biz=MzA4OTIyMzgxMw...,1602236339
4,2659222063_1,清华毕业生都去哪儿了？,https://mmbiz.qlogo.cn/mmbiz_jpg/HhoEMZZMsiaTh...,清华大学毕业生就业情况小测试！,http://mp.weixin.qq.com/s?__biz=MzA4OTIyMzgxMw...,1602162765
