In [1]:
import numpy as np
import pandas as pd
import requests
import time
import random
from bs4 import BeautifulSoup

### Define helper functions

In [2]:
def fetch(url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def fetch_topic(url):
    """Scrape the topic listing page at ``url``.

    Args:
        url: Address of the page that lists the topics.

    Returns:
        tuple: Three lists ``(titles, title_links, popularity)``. The first
        two hold, for every ``div.title`` element, its text and the ``href``
        of its first link; the third holds the text of every ``div.meta``
        element with newlines removed.
    """
    page = fetch(url)

    title_divs = page.find_all('div', class_='title')
    titles = [div.text for div in title_divs]
    title_links = [div.a.get('href') for div in title_divs]

    popularity = [
        meta_div.text.replace('\n', '')
        for meta_div in page.find_all('div', class_='meta')
    ]

    return titles, title_links, popularity

In [4]:
def fetch_article_info(soup):
    """Extract the per-article fields from one parsed listing page.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).

    Returns:
        tuple: Four lists ``(article_title, article_link, article_perf,
        online_dt)``:
          * text of every ``h2.post-title`` with newlines removed,
          * ``href`` of the first link inside each of those headings,
          * text of every ``post-meta-items`` element with newlines removed
            and surrounding whitespace stripped,
          * text of the ``time`` child of every ``div.stream-list-meta``.
    """
    headings = soup.find_all('h2', class_='post-title')
    article_title = [heading.text.replace('\n', '') for heading in headings]
    article_link = [heading.a.get('href') for heading in headings]

    article_perf = [
        item.text.replace('\n', '').strip()
        for item in soup.find_all(class_='post-meta-items')
    ]

    online_dt = [
        block.time.text
        for block in soup.find_all('div', class_='stream-list-meta')
    ]

    return article_title, article_link, article_perf, online_dt

In [5]:
def fetch_topic_n_article_meta(url):
    """Crawl one topic page, following its pagination, and collect its articles.

    Args:
        url: Address of the topic's first page. Further pages are requested
            at ``url + '/page/<n>'``.

    Returns:
        tuple: ``(good, bad, article_title, article_link, article_perf,
        online_dt)``. ``good`` and ``bad`` are the ``span`` texts of the first
        two ``button.button`` elements on the first page; the remaining four
        are the lists produced by ``fetch_article_info``, concatenated over
        every page that was fetched.

    Raises:
        IndexError: If the first page has fewer than two ``button.button``
            elements.
    """
    soup = fetch(url)
    reviews = soup.find_all('button', class_='button')
    good = reviews[0].span.text
    bad = reviews[1].span.text

    # The second-to-last 'page-numbers' link is read as the number of the
    # last page. A topic without such a link, or with a non-numeric one, is
    # treated as a single page. Only these two errors are caught, so that
    # unrelated failures are no longer silently swallowed.
    page_links = soup.find_all('a', class_='page-numbers')
    try:
        last_page = int(page_links[-2].text)
    except (IndexError, ValueError):
        last_page = 1

    article_title, article_link, article_perf, online_dt = fetch_article_info(soup)

    for page in range(2, last_page + 1):
        next_url = url + '/page/' + str(page)
        time.sleep(random.randint(1, 3))  # prevent over-crawl
        try:
            page_soup = fetch(next_url)
        except requests.RequestException as exc:
            # Keep the pages collected so far instead of discarding them.
            print('failed to fetch', next_url, '-', exc)
            break
        titles, links, perfs, dates = fetch_article_info(page_soup)
        article_title += titles
        article_link += links
        article_perf += perfs
        online_dt += dates

    return good, bad, article_title, article_link, article_perf, online_dt

## Time to crawl
### 1st step: Crawl topics

In [6]:
# Topic index page of woshipm.com (the site's name translates to
# "Everyone Is a Product Manager").
url = 'http://www.woshipm.com/topics'
topic, topic_link, popularity = fetch_topic(url)

# One row per topic, columns in the order: topic, topic_link, popularity.
topics = pd.DataFrame(
    data={
        'topic': topic,
        'topic_link': topic_link,
        'popularity': popularity,
    }
)
topics

Unnamed: 0,topic,topic_link,popularity
0,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,共19篇文章36020人已学习进入专题
1,APP设计全方位解析,http://www.woshipm.com/topic/app-2,共27篇文章37098人已学习进入专题
2,APP运营推广实操指南,http://www.woshipm.com/topic/app,共25篇文章69459人已学习进入专题
3,ASO优化从入门到精通,http://www.woshipm.com/topic/aso,共21篇文章20822人已学习进入专题
4,CRM产品该如何设计和运营？,http://www.woshipm.com/topic/crm,共16篇文章27456人已学习进入专题
5,H5有了这些新玩法，还怕没灵感？,http://www.woshipm.com/topic/h5,共27篇文章22783人已学习进入专题
6,K12在线教育产品该如何设计和运营？,http://www.woshipm.com/topic/k12,共20篇文章25157人已学习进入专题
7,SaaS产品该如何设计和运营？,http://www.woshipm.com/topic/saas,共18篇文章27567人已学习进入专题
8,SEO/SEM怎么做？,http://www.woshipm.com/topic/seo,共12篇文章19117人已学习进入专题
9,TO B产品应如何设计和运营？,http://www.woshipm.com/topic/tob,共23篇文章55368人已学习进入专题


### 2nd step: Crawl articles under all topics

In [7]:
# Crawl every topic and build one frame of article rows per topic. The frames
# are gathered in a list and concatenated once at the end: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop copies all
# previous rows on every iteration.
article_columns = ['topic_link', 'good', 'bad', 'article_title', 'article_link', 'article_perf', 'online_dt']
topic_frames = []
for link in topic_link:
    good, bad, article_title, article_link, article_perf, online_dt = fetch_topic_n_article_meta(link)
    topic_frame = pd.DataFrame({'topic_link':link, 'good':good, 'bad':bad, 'article_title':article_title, 'article_link':article_link, 'article_perf':article_perf, 'online_dt':online_dt})
    # A topic without articles contributes no rows, so it is left out.
    if not topic_frame.empty:
        topic_frames.append(topic_frame)
    print(link) #ensure they are still working
    time.sleep(random.randint(5,10)) #prevent over-crawl

# pd.concat raises on an empty list, so fall back to an empty frame that
# still has the expected columns.
if topic_frames:
    article_meta = pd.concat(topic_frames, ignore_index=True)
else:
    article_meta = pd.DataFrame(columns=article_columns)

http://www.woshipm.com/topic/ai
http://www.woshipm.com/topic/app-2
http://www.woshipm.com/topic/app
http://www.woshipm.com/topic/aso
http://www.woshipm.com/topic/crm
http://www.woshipm.com/topic/h5
http://www.woshipm.com/topic/k12
http://www.woshipm.com/topic/saas
http://www.woshipm.com/topic/seo
http://www.woshipm.com/topic/tob
http://www.woshipm.com/topic/structure
http://www.woshipm.com/topic/project
http://www.woshipm.com/topic/push
http://www.woshipm.com/topic/marketing
http://www.woshipm.com/topic/profit
http://www.woshipm.com/topic/finance
http://www.woshipm.com/topic/cpfx
http://www.woshipm.com/topic/dynamic-effect
http://www.woshipm.com/topic/%e4%ba%a7%e5%93%81%e6%80%9d%e7%bb%b4%e9%82%a3%e4%ba%9b%e4%ba%8b%e5%84%bf
http://www.woshipm.com/topic/search
http://www.woshipm.com/topic/newbie
http://www.woshipm.com/topic/flowchart
http://www.woshipm.com/topic/test
http://www.woshipm.com/topic/%e4%ba%a7%e5%93%81%e7%9a%84%e5%95%86%e4%b8%9a%e6%a8%a1%e5%bc%8f
http://www.woshipm.com/topic/

In [8]:
# Display the crawled article metadata to sanity-check the result.
article_meta

Unnamed: 0,topic_link,good,bad,article_title,article_link,article_perf,online_dt
0,http://www.woshipm.com/topic/ai,437,19,AI产品经理入门手册（下）,http://www.woshipm.com/pmd/1510018.html,4万 255 108,2018/10/15
1,http://www.woshipm.com/topic/ai,437,19,AI产品经理入门手册（上）,http://www.woshipm.com/pmd/1501194.html,10.6万 453 201,2018/10/13
2,http://www.woshipm.com/topic/ai,437,19,AI产品经理，如何面对数据挖掘？,http://www.woshipm.com/pmd/1349778.html,2.5万 86 21,2018/09/03
3,http://www.woshipm.com/topic/ai,437,19,实战分享｜作为AI小白，如何争取AI产品经理offer,http://www.woshipm.com/pmd/1078932.html,3万 285 62,2018/07/02
4,http://www.woshipm.com/topic/ai,437,19,AI产品经理需了解的技术知识：语音识别技术（1）,http://www.woshipm.com/ai/1055549.html,1.1万 105 13,2018/06/19
5,http://www.woshipm.com/topic/ai,437,19,成为AI产品经理之前，可以先读下这篇文章,http://www.woshipm.com/ai/1043591.html,4.2万 291 100,2018/05/30
6,http://www.woshipm.com/topic/ai,437,19,入行AI产品经理，持续增强的学习方法,http://www.woshipm.com/pmd/1027908.html,2万 87 24,2018/05/16
7,http://www.woshipm.com/topic/ai,437,19,系统总结：AI产品经理知识体系,http://www.woshipm.com/pmd/922612.html,1.3万 201 23,2018/01/26
8,http://www.woshipm.com/topic/ai,437,19,AI产品经理的必修课：系统化思维,http://www.woshipm.com/pmd/900264.html,1.9万 164 34,2018/01/09
9,http://www.woshipm.com/topic/ai,437,19,人脸识别：AI产品经理需要了解的CV通识,http://www.woshipm.com/pd/900422.html,2.5万 163 13,2018/01/08


### Save raw data as CSV files

In [9]:
# Persist the raw crawl results (without the index column) so the ETL stage
# below can be re-run without crawling again.
article_meta.to_csv(path_or_buf='article_meta.csv', index=False)
topics.to_csv(path_or_buf='topics.csv', index=False)

---

## ETL: clean and join the crawled data

In [10]:
# Reload the raw crawl results from the CSV files written above.
article_meta = pd.read_csv(filepath_or_buffer='article_meta.csv')
topics = pd.read_csv(filepath_or_buffer='topics.csv')

In [11]:
# Split the 'popularity' text once on the article-count marker: the part
# before it yields the article count, the part after it the topic click count.
# The remaining non-numeric label text is stripped before casting to int.
popularity_parts = topics['popularity'].str.split('篇文章', expand=True)
topics = topics.assign(
    articleNum=popularity_parts[0].str.replace('共', '').astype('int'),
    topicClicks=popularity_parts[1].str.replace('人已学习进入专题', '').astype('int'),
).drop('popularity', axis=1)

In [12]:
def _clicks_to_float(raw):
    """Convert a click-count string to a float.

    A value containing '万' is a multiple of ten thousand (e.g. '4.2万' is
    42000.0); any other value is parsed as a plain number.
    """
    if '万' in raw:
        # round() removes binary floating-point noise left by the scaling.
        return float(round(float(raw.replace('万', '')) * 10000))
    return float(raw)


# 'article_perf' holds three whitespace-separated fields. Split it once and
# reuse the pieces instead of repeating the same split for every column.
perf_fields = article_meta['article_perf'].str.split(expand=True)
article_meta['articleClicks'] = perf_fields[0].apply(_clicks_to_float)
article_meta['articleCollect'] = perf_fields[1].astype('int')
article_meta['articleLike'] = perf_fields[2].astype('int')
article_meta['online_dt'] = pd.to_datetime(article_meta['online_dt'])
article_meta = article_meta.drop('article_perf', axis = 1)

In [13]:
# Join every article row to the attributes of its topic. The key is named
# explicitly: without `on`, pd.merge joins on every column name the two frames
# share, so a column added to both later would silently change the join.
abt = pd.merge(topics, article_meta, on='topic_link', how='inner')
abt

Unnamed: 0,topic,topic_link,articleNum,topicClicks,good,bad,article_title,article_link,online_dt,articleClicks,articleCollect,articleLike
0,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,AI产品经理入门手册（下）,http://www.woshipm.com/pmd/1510018.html,2018-10-15,40000.0,255,108
1,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,AI产品经理入门手册（上）,http://www.woshipm.com/pmd/1501194.html,2018-10-13,106000.0,453,201
2,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,AI产品经理，如何面对数据挖掘？,http://www.woshipm.com/pmd/1349778.html,2018-09-03,25000.0,86,21
3,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,实战分享｜作为AI小白，如何争取AI产品经理offer,http://www.woshipm.com/pmd/1078932.html,2018-07-02,30000.0,285,62
4,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,AI产品经理需了解的技术知识：语音识别技术（1）,http://www.woshipm.com/ai/1055549.html,2018-06-19,11000.0,105,13
5,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,成为AI产品经理之前，可以先读下这篇文章,http://www.woshipm.com/ai/1043591.html,2018-05-30,42000.0,291,100
6,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,入行AI产品经理，持续增强的学习方法,http://www.woshipm.com/pmd/1027908.html,2018-05-16,20000.0,87,24
7,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,系统总结：AI产品经理知识体系,http://www.woshipm.com/pmd/922612.html,2018-01-26,13000.0,201,23
8,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,AI产品经理的必修课：系统化思维,http://www.woshipm.com/pmd/900264.html,2018-01-09,19000.0,164,34
9,AI 产品经理入门手册,http://www.woshipm.com/topic/ai,19,36020,437,19,人脸识别：AI产品经理需要了解的CV通识,http://www.woshipm.com/pd/900422.html,2018-01-08,25000.0,163,13


In [14]:
# Write the joined table to disk without the index column.
abt.to_csv(path_or_buf='abt.csv', index=False)