In [1]:
import requests
import time
import pandas as pd

# Scrape target and paging parameters.
# Numeric id of the question; passed to the URL helpers and calls below.
question_id = 315708915
# Step by which the retrieval loop advances its offset.
limit = 20
# NOTE(review): this module-level `offset` is not read by any visible code —
# the calls below pass `offset` explicitly as a keyword argument.
offset = 0


def assemble_url(question_id, offset, limit = 20):
    """Build the answers-API URL for one page of a question.

    Args:
        question_id: Id of the question; converted with ``str()``.
        offset: Value of the ``offset`` query parameter; converted with ``str()``.
        limit: Value of the ``limit`` query parameter (default 20).

    Returns:
        The URL as a string, with ``include=content``, ``platform=desktop``
        and ``sort_by=default`` fixed in the query string.
    """
    endpoint = f'https://www.zhihu.com/api/v4/questions/{question_id!s}/answers'
    query = f'include=content&limit={limit!s}&offset={offset!s}&platform=desktop&sort_by=default'
    return f'{endpoint}?{query}'

def convert_answer_url(question_id, original_answer_url):
    """Rewrite an API answer URL into a URL under the question's page.

    Everything after ``'v4/'`` in ``original_answer_url`` is appended to
    ``https://www.zhihu.com/question/<question_id>/``.

    Raises:
        ValueError: if ``'v4/'`` does not occur exactly once in the URL
            (the two-way unpacking below fails).
    """
    pieces = original_answer_url.split('v4/')
    # Exactly two pieces are expected: the API prefix and the answer path.
    _api_prefix, answer_path = pieces
    return f'https://www.zhihu.com/question/{question_id!s}/{answer_path}'

def convert_author_url(author_url_token):
    """Return the profile URL formed by appending the token to the people root.

    An empty token yields the bare ``https://www.zhihu.com/people/`` URL.
    """
    profile_root = 'https://www.zhihu.com/people/'
    return profile_root + author_url_token

def author_classifer(auther_gender):
    """Map a numeric gender code to its Chinese display label.

    Returns '男' for 1, '女' for 0, and '未知' for every other value.
    """
    if auther_gender == 1:
        return '男'
    return '女' if auther_gender == 0 else '未知'

# Mimic a browser: this User-Agent header is sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

# Retrieve one page up front to get basic information (the total count).
url = assemble_url(question_id, offset = 0)
# requests has no default timeout; without one a stalled connection would
# hang this cell indefinitely.
page = requests.get(url, headers=headers, timeout=10)
# Surface HTTP errors here instead of as a confusing KeyError further down.
page.raise_for_status()

# Parse the JSON body; `total_reviews` bounds the retrieval loop.
raw = page.json()
total_reviews = raw['paging']['totals']

# To inspect the fields available on each answer record, run in its own cell:
#     raw['data'][0].keys()
# It is kept as a comment because mid-cell the expression displays nothing
# and it raises IndexError when the question has no answers.

# Column-oriented accumulator: one list per output column, appended to in
# lock-step by the retrieval loop, then turned into a DataFrame.
result = {
    'id': [],
    'question_title': [],
    'question_created': [],
    'question_updated_time': [],
    'author_name': [],
    'author_url': [],
    'author_avatar_url': [],
    'author_gender': [],
    'url': [],
    'created_time': [],
    'updated_time': [],
    'content': []
}

# Retrieve data: walk the answer list, advancing the offset by `limit`
# after every request until `total_reviews` is reached.
i = 0
while i < total_reviews:

    # print progress (`i` is the current offset into the answer list)
    print('Current page: %d, total pages: %d, percent: %.2f%%' % (i, total_reviews, i * 100 / total_reviews))

    # Pass `limit` explicitly so the requested page size always matches the
    # step `i` advances by; relying on assemble_url's default would duplicate
    # or skip answers as soon as `limit` is changed.
    url = assemble_url(question_id, offset = i, limit = limit)
    # Timeout: requests would otherwise wait forever on a stalled connection.
    page = requests.get(url, headers=headers, timeout=10)
    # Stop with a clear HTTP error rather than a KeyError on an error payload.
    page.raise_for_status()
    raw = page.json()

    # Append one entry per answer to every column list of `result`.
    for data in raw['data']:
        question = data['question']
        author = data['author']

        # NOTE: time.ctime renders the timestamps in the local timezone of
        # the machine running the notebook.
        result['id'].append(data['id'])
        result['question_title'].append(question['title'])
        result['question_created'].append(time.ctime(question['created']))
        result['question_updated_time'].append(time.ctime(question['updated_time']))
        result['author_name'].append(author['name'])
        result['author_url'].append(convert_author_url(author['url_token']))
        result['author_avatar_url'].append(author['avatar_url'])
        result['author_gender'].append(author_classifer(author['gender']))
        result['url'].append(convert_answer_url(question_id, data['url']))
        result['created_time'].append(time.ctime(data['created_time']))
        result['updated_time'].append(time.ctime(data['updated_time']))
        result['content'].append(data['content'])

    i += limit
print('Done')

# Build the frame directly from the column dict.
source = pd.DataFrame(result)


Current page: 0, total pages: 649, percent: 0.00%
Current page: 20, total pages: 649, percent: 3.08%
Current page: 40, total pages: 649, percent: 6.16%
Current page: 60, total pages: 649, percent: 9.24%
Current page: 80, total pages: 649, percent: 12.33%
Current page: 100, total pages: 649, percent: 15.41%
Current page: 120, total pages: 649, percent: 18.49%
Current page: 140, total pages: 649, percent: 21.57%
Current page: 160, total pages: 649, percent: 24.65%
Current page: 180, total pages: 649, percent: 27.73%
Current page: 200, total pages: 649, percent: 30.82%
Current page: 220, total pages: 649, percent: 33.90%
Current page: 240, total pages: 649, percent: 36.98%
Current page: 260, total pages: 649, percent: 40.06%
Current page: 280, total pages: 649, percent: 43.14%
Current page: 300, total pages: 649, percent: 46.22%
Current page: 320, total pages: 649, percent: 49.31%
Current page: 340, total pages: 649, percent: 52.39%
Current page: 360, total pages: 649, percent: 55.47%
Cur

In [2]:
# Display the DataFrame built from the scraped answers.
source

Unnamed: 0,id,question_title,question_created,question_updated_time,author_name,author_url,author_avatar_url,author_gender,url,created_time,updated_time,content
0,988062424,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,十点半,https://www.zhihu.com/people/wei-feng-lin-89,https://pic1.zhimg.com/v2-1ad6b309cb5f3d4604c2...,男,https://www.zhihu.com/question/315708915/answe...,Tue Jan 28 06:02:19 2020,Sun Mar 22 06:00:01 2020,<h2>21考研看这一篇足够！双非上岸学长呕心沥血，纯干货！</h2><p><b>高能干货预...
1,1233093511,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,考研小丸子,https://www.zhihu.com/people/xie-zi-17-35-91,https://pic1.zhimg.com/v2-bc2614a00dff17a352b2...,女,https://www.zhihu.com/question/315708915/answe...,Tue May 19 12:11:36 2020,Tue May 19 12:11:36 2020,<p>2020考研上岸在望的小丸子来了，按规矩先报一下自己情况：20届某垫底985新传一战，...
2,990400661,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,知乎考研,https://www.zhihu.com/people/zhi-hu-kao-yan,https://pic1.zhimg.com/v2-0469ee1ce5fbd35921ff...,未知,https://www.zhihu.com/question/315708915/answe...,Thu Jan 30 04:05:30 2020,Thu Jan 30 04:05:30 2020,<p>在决定考研以及确定好目标院校和专业后，就需要对整个考研过程有一个大致的规划，对各科目的...
3,1093507320,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,起个名字好难,https://www.zhihu.com/people/qi-ge-ming-zi-hao...,https://pic3.zhimg.com/v2-353f324e31f96cfe6011...,男,https://www.zhihu.com/question/315708915/answe...,Fri Mar 20 22:53:35 2020,Sun Apr 12 12:01:39 2020,<p><b>双非上岸985学长，费心整理自己的复习规划，分阶段，具有可操作性。</b></p...
4,1051844326,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,顽鱼,https://www.zhihu.com/people/yu-yan-20-90,https://pic4.zhimg.com/v2-ada3a357313e154640ca...,女,https://www.zhihu.com/question/315708915/answe...,Mon Mar 2 17:22:50 2020,Fri Mar 13 08:43:51 2020,<p>我考的分数是410+，相比于一些大神来说，也不算很高，但我还算比较满意。我来给大家说说...
5,1071222723,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,许风倾秋,https://www.zhihu.com/people/xu-feng-qing-qiu,https://pic3.zhimg.com/v2-a1f24f3d2db1bf311f7f...,男,https://www.zhihu.com/question/315708915/answe...,Wed Mar 11 05:57:56 2020,Wed Jun 24 13:12:17 2020,<p>5月更新:已上岸，复试排名前x%，排名很好。祝各位加油，努力！</p><hr/><p>...
6,952665959,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,考研老冰棍,https://www.zhihu.com/people/kao-yan-lao-bing-gun,https://pic4.zhimg.com/v2-e8d5ce616b34973b1763...,男,https://www.zhihu.com/question/315708915/answe...,Fri Dec 27 09:51:02 2019,Tue May 26 02:44:56 2020,<p>本文从该用的复习资料和复习时间的安排来讲讲21考研，全文分为英语、数学、政治三个板块。...
7,990313578,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,知乎用户,https://www.zhihu.com/people/,https://pic1.zhimg.com/da8e974dc_l.jpg?source=...,男,https://www.zhihu.com/question/315708915/answe...,Thu Jan 30 02:48:56 2020,Wed Jul 1 05:02:33 2020,<h2>考研准备，看这一篇就够了！</h2><p><b>个人介绍：19考研党，</b>已被清...
8,1092642569,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,盐选成长计划,https://www.zhihu.com/people/liu-kan-shan-78-51,https://pic2.zhimg.com/v2-01cdd02b9fd7a5746970...,未知,https://www.zhihu.com/question/315708915/answe...,Fri Mar 20 11:10:43 2020,Wed Mar 25 07:27:16 2020,<p>在开始复习之前，我们首先要知道：</p>\n<p><strong>做好全局规划对于准备...
9,1276792986,21届考研应该怎么安排复习？,Wed Mar 13 02:09:37 2019,Fri May 8 11:10:49 2020,林政,https://www.zhihu.com/people/bu-yuan-tou-lu-ya...,https://pic1.zhimg.com/v2-06ec4ec36ead51d21f04...,男,https://www.zhihu.com/question/315708915/answe...,Thu Jun 11 05:19:12 2020,Thu Jun 11 05:24:12 2020,<p>各位同学们好，本人于20考研初试取得390分的成绩，报考学校为复旦大学，初试成绩排名为...


# Analysis

In [3]:
# Show the question title taken from the first scraped row.
print(source['question_title'].iloc[0])

21届考研应该怎么安排复习？


In [4]:
# Number of answers per author-gender label (counted on the `id` column).
answer_ids_by_gender = source.groupby('author_gender')['id']
answer_ids_by_gender.count()

author_gender
女     209
未知    242
男     197
Name: id, dtype: int64

### Word cloud

In [None]:
import html2text, jieba, warnings
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# assemble data: concatenate the HTML content of every answer
print('Content Assembling')
all_word = ''.join(source['content'])
h = html2text.HTML2Text()

# remove html element (ignore_links is switched on before converting)
print('HTML Element Removing')
h.ignore_links = True
text_without_html = h.handle(all_word)

# remove punctuation: every non-letter character becomes a space instead of
# being deleted, so the words on either side of it are not fused together
# before segmentation
print('Removing Punctuation')
text_without_punctuation = ''.join(
    ch if ch.isalpha() else ' ' for ch in text_without_html)

# cut word
print('Cutting Words')
seg_list = list(jieba.cut(text_without_punctuation, cut_all=False))

# remove stopwords
print('Remove Stopwords')
# The context manager closes the file even if reading fails.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm the encoding of cn_stopwords.txt and pass it explicitly.
with open('cn_stopwords.txt', 'r') as file:
    stopwords_raw = file.read()
stopwords = stopwords_raw.split('\n')
# Set membership keeps the filter below linear in the number of tokens.
stopword_lookup = set(stopwords)

# Drop the whitespace-only tokens produced by the separators inserted above.
text_word_list = [word for word in seg_list if word.strip()]
text_stopword = [word for word in text_word_list if word not in stopword_lookup]
text = ' '.join(text_stopword)

# plot word cloud
print('Plotting Word Cloud')
# NOTE(review): the font file is presumably supplied so CJK glyphs render —
# confirm FZHTJW.TTF is available next to the notebook.
wordcloud = WordCloud(width = 1200, height = 1200,
                font_path='FZHTJW.TTF',
                background_color ='white',
                min_font_size = 10).generate(text)

# plot the WordCloud image
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

Content Assembling
HTML Element Removing
Removing Punctuation
Cutting Words
Remove Stopwords
Plotting Word Cloud
