In [2]:
### Reference: https://github.com/gaussic/weibo_wordcloud

import re
import json
import requests
import io

# based on mobile weibo
url_template = "https://m.weibo.cn/api/container/getIndex?type=wb&queryVal={}&containerid=100103type=2%26q%3D{}&page={}"


def clean_text(text):
    """cleaning"""
    dr = re.compile(r'(<)[^>]+>', re.S)
    dd = dr.sub('', text)
    dr = re.compile(r'#[^#]+#', re.S)
    dd = dr.sub('', dd)
    dr = re.compile(r'@[^ ]+ ', re.S)
    dd = dr.sub('', dd)
    return dd.strip()


def fetch_data(query_val, page_id):
    """crawl weibos based on specific keyword and page id"""
    resp = requests.get(url_template.format(query_val, query_val, page_id))
    card_group = json.loads(resp.text)['data']['cards'][0]['card_group']
    print('url：', resp.url, ' --- num_weibos:', len(card_group))

    mblogs = []  # save them into dict
    for card in card_group:
        mblog = card['mblog']
        blog = {'mid': mblog['id'],  # weibo id
                'time': mblog['created_at'], # creation time
                'text': clean_text(mblog['text']),  # text
                'userid': str(mblog['user']['id']),  # user id
                'username': mblog['user']['screen_name'],  # username
                'reposts_count': mblog['reposts_count'],  # retweet
                'comments_count': mblog['comments_count'],  # comment
                'attitudes_count': mblog['attitudes_count']  # like
                }
        mblogs.append(blog)
    return mblogs


def remove_duplication(mblogs):
    """drop duplicates"""
    mid_set = {mblogs[0]['mid']}
    new_blogs = []
    for blog in mblogs[1:]:
        if blog['mid'] not in mid_set:
            new_blogs.append(blog)
            mid_set.add(blog['mid'])
    return new_blogs


def fetch_pages(query_val, page_num):
    """crawl weibos on several pages"""
    mblogs = []
    for page_id in range(1 + page_num + 1):
        try:
            mblogs.extend(fetch_data(query_val, page_id))
        except Exception as e:
            print(e)

    print("before drop duplicates：", len(mblogs))
    mblogs = remove_duplication(mblogs)
    print("after drop duplicates：", len(mblogs))

    # 保存到 result.json 文件中
    with io.open('result_{}.json'.format(query_val), 'w', encoding='utf-8') as fp:
        json.dump(mblogs, fp, ensure_ascii=False, indent=4)
        print("saved in result_{}.json".format(query_val))

In [18]:
mblog = fetch_data('贵州茅台', 2)

url： https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&containerid=100103type=2%26q%3D%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&page=2  --- num_weibos: 9


In [15]:
with io.open('weibo.txt','w',encoding='utf-8') as g:
    for i in range(3):
        output = fetch_data('贵州茅台', i)
        for j in output:
            g.write(str(j))
            g.write('\n')

url： https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&containerid=100103type=2%26q%3D%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&page=0  --- num_weibos: 10
url： https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&containerid=100103type=2%26q%3D%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&page=1  --- num_weibos: 10
url： https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&containerid=100103type=2%26q%3D%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&page=2  --- num_weibos: 9


In [29]:
ss = open('weibo.txt',encoding='utf-8').readlines()

In [36]:
import ast

In [37]:
ast.literal_eval(ss[0].strip())

{'attitudes_count': 0,
 'comments_count': 0,
 'mid': '4260435894927014',
 'reposts_count': 0,
 'text': '20180711/吕老板最辉煌的时代。2012年报。拿下500多万股的贵州茅台。有本事的人。永远有本事。回头再看看当初。现在吕老板掌管中粮集团了。/ \u200b',
 'time': '5分钟前',
 'userid': '1929492224',
 'username': '小黑资产管理'}

In [38]:
#!/share/apps/anaconda3/4.3.1/bin/python

### Reference: https://github.com/gaussic/weibo_wordcloud

import re
import json
import requests
import io

# based on mobile weibo
url_template = "https://m.weibo.cn/api/container/getIndex?type=wb&queryVal={}&containerid=100103type=2%26q%3D{}&page={}"


def clean_text(text):
    """cleaning"""
    dr = re.compile(r'(<)[^>]+>', re.S)
    dd = dr.sub('', text)
    dr = re.compile(r'#[^#]+#', re.S)
    dd = dr.sub('', dd)
    dr = re.compile(r'@[^ ]+ ', re.S)
    dd = dr.sub('', dd)
    return dd.strip()


def fetch_data(query_val, page_id):
    """crawl weibos based on specific keyword and page id"""
    resp = requests.get(url_template.format(query_val, query_val, page_id))
    card_group = json.loads(resp.text)['data']['cards'][0]['card_group']

    mblogs = []  # save them into dict
    for card in card_group:
        mblog = card['mblog']
        blog = {'mid': mblog['id'],  # weibo id
                'time': mblog['created_at'], # creation time
                'text': clean_text(mblog['text']),  # text
                'userid': str(mblog['user']['id']),  # user id
                'username': mblog['user']['screen_name'],  # username
                'reposts_count': mblog['reposts_count'],  # retweet
                'comments_count': mblog['comments_count'],  # comment
                'attitudes_count': mblog['attitudes_count']  # like
                }
        mblogs.append(blog)
    return mblogs


def fetch_pages(query_val, page_num):
    """crawl weibos on several pages"""
    
    for page_id in range(1 + page_num + 1):
        try:
            output = fetch_data(query_val, page_id)
            for j in output:
                print(str(j))
                
        except Exception as e:
            print(e)
            
fetch_pages('贵州茅台',30)

url： https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&containerid=100103type=2%26q%3D%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&page=0  --- num_weibos: 10
{'mid': '4260435894927014', 'time': '38分钟前', 'text': '20180711/吕老板最辉煌的时代。2012年报。拿下500多万股的贵州茅台。有本事的人。永远有本事。回头再看看当初。现在吕老板掌管中粮集团了。/ \u200b', 'userid': '1929492224', 'username': '小黑资产管理', 'reposts_count': 0, 'comments_count': 0, 'attitudes_count': 0}
{'mid': '4260399971306418', 'time': '3小时前', 'text': '深夜重大利好：贵州茅台 天华院 河钢股份  中船科技  $宝莱特 sz300246$ $卫信康 sh603676$ $天鹅股份 sh603029$ $健民集团 sh600976$  姚晓浪的秒拍视频 \u200b', 'userid': '2105239982', 'username': '姚晓浪', 'reposts_count': 0, 'comments_count': 0, 'attitudes_count': 0}
{'mid': '4260393940127069', 'time': '3小时前', 'text': '大家都去做房地产了，股市主板的股王是贵州茅台，卖酒的哎，创业板的股王是温氏股份，养猪的哎。创业板本来是支持新创企业的，像美国的纳斯达克中走出来多少厉害的公司，微软、思科、亚马逊等等。中国的“纳斯达克”股王是养猪的，笑死了，也可悲极了。', 'userid': '3895390620', 'username': '-那颗颗大白菜-', 'reposts_count': 0, 'comments_count': 4, 'attitudes_count': 0}
{'mid

KeyboardInterrupt: 