In [1]:
from jupyterthemes import jtplot
jtplot.style()
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
# Page that is downloaded first; the category list is read from its
# 'widget widget_categories' element.
url = 'https://blockonomi.com/category/news/'

# 1. Find the Links of the Different News Categories on Blockonomi

In [4]:
# Download the news landing page and parse it into `soup1`.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as an AttributeError when an element lookup on the page returns None.
page1 = requests.get(url, timeout=30)
page1.raise_for_status()
soup1 = BeautifulSoup(page1.content, 'html.parser')

In [5]:
# Locate the category widget once and reuse its anchors for both lists.
_category_anchors = soup1.find(class_='widget widget_categories').find_all('a')
# website link
list_category_link = [anchor.get('href') for anchor in _category_anchors]
# categories name
list_category_name = [anchor.get_text() for anchor in _category_anchors]

In [6]:
# Category name -> category URL; if a name occurs twice the later link wins.
CategoryDic = {name: link for name, link in zip(list_category_name, list_category_link)}

In [7]:
# Display the category name -> URL mapping.
CategoryDic

{'Buying': 'https://blockonomi.com/category/buying/',
 'Exchanges': 'https://blockonomi.com/category/exchanges/',
 'Fundamentals': 'https://blockonomi.com/category/fundamentals/',
 'Gaming': 'https://blockonomi.com/category/gaming/',
 'Guides': 'https://blockonomi.com/category/guides/',
 'History': 'https://blockonomi.com/category/history/',
 'ICO': 'https://blockonomi.com/category/ico/',
 'Mining': 'https://blockonomi.com/category/mining/',
 'News': 'https://blockonomi.com/category/news/',
 'Press Release': 'https://blockonomi.com/category/press-release/',
 'Trading': 'https://blockonomi.com/category/trading/',
 'Wallets': 'https://blockonomi.com/category/wallets/'}

# 2. Find the Maximum Page Number of Each Category

In [8]:
# Category name -> number of listing pages in that category.
NumMaxPage = {}

In [9]:
# For every category, download its first listing page and read the highest
# page number from the pagination element.
for category, category_url in CategoryDic.items():
    # The timeout keeps one stalled request from blocking the whole loop.
    page2 = requests.get(category_url, timeout=30)
    soup2 = BeautifulSoup(page2.content, 'html.parser')
    try:
        # The second-to-last whitespace-separated token of the pagination
        # text is taken as the last page number.
        NumMaxPage[category] = int(soup2.find(class_='page-navigation').get_text().split()[-2])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: the page has no pagination element.
        # IndexError / ValueError: the pagination text is not in the expected
        # "... <last page> <token>" form.
        # In all these cases the category is treated as a single page.
        NumMaxPage[category] = 1

In [10]:
# Display the page count found for each category.
NumMaxPage

{'Buying': 2,
 'Exchanges': 5,
 'Fundamentals': 14,
 'Gaming': 2,
 'Guides': 15,
 'History': 3,
 'ICO': 7,
 'Mining': 5,
 'News': 7,
 'Press Release': 23,
 'Trading': 1,
 'Wallets': 2}

# 3. Find All the Article Links on One Page

In [11]:
def FindArticleLink_blockonomi(myurl, timeout=30):
    """
    Return all the article links found on one listing page.

    myurl: URL of the listing page to download
    timeout: seconds to wait for the server before giving up (default 30)

    linklist: list of the 'href' values of every element with the
              'grid-thumb-image' class, in document order; elements
              without an 'href' are skipped
    """
    page = requests.get(myurl, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    linklist = []
    for item in soup.find_all(class_='grid-thumb-image'):
        href = item.get('href')
        # An element without an href would otherwise put None into the list,
        # and a later request for that "link" would fail.
        if href:
            linklist.append(href)
    return linklist

# 4. Store All Links

In [13]:
AllLink = {}  # store all links: '<category> page <n>' -> list of article URLs

for key, value in NumMaxPage.items():
    category_url = CategoryDic.get(key)
    # Listing pages are numbered from 1 up to and including the maximum page.
    for i in range(1, value + 1):
        page_url = category_url + 'page/' + str(i)
        AllLink[key + ' page ' + str(i)] = FindArticleLink_blockonomi(page_url)

In [14]:
# Display every collected link, grouped under its '<category> page <n>' key.
AllLink

{'Buying page 1': ['https://blockonomi.com/dollar-cost-averaging/',
  'https://blockonomi.com/coinbase-alternatives/',
  'https://blockonomi.com/buy-bitcoin-india/',
  'https://blockonomi.com/investing-cryptocurrency/',
  'https://blockonomi.com/buy-bitcoin-anonymously/',
  'https://blockonomi.com/bitcoin-atms/',
  'https://blockonomi.com/ico-guide/',
  'https://blockonomi.com/buy-bitcoin-with-paypal/',
  'https://blockonomi.com/buy-bitcoin/',
  'https://blockonomi.com/buy-ethereum/'],
 'Buying page 2': ['https://blockonomi.com/cryptocurrency-exchanges/'],
 'Exchanges page 1': ['https://blockonomi.com/huobi-review/',
  'https://blockonomi.com/bibox-review/',
  'https://blockonomi.com/hodly-review/',
  'https://blockonomi.com/okex-review/',
  'https://blockonomi.com/hotbit-review/',
  'https://blockonomi.com/tidex-review/',
  'https://blockonomi.com/virwox-review/',
  'https://blockonomi.com/idex-review/',
  'https://blockonomi.com/binance-review/',
  'https://blockonomi.com/exchanges-t

# 5. Write All Links to a CSV File

In [15]:
# Flatten the per-page link lists into one list, keeping the page order.
# A single comprehension avoids copying the whole accumulated list once per
# page, which repeated `list + list` concatenation does.
ALLlink_list = [link for links in AllLink.values() for link in links]

In [15]:
# Total number of article links collected across all categories and pages.
len(ALLlink_list)

793

In [16]:
# One-column table holding every collected article link.
df = pd.DataFrame(data=ALLlink_list, columns=['Blockonomi'])

# Save the link table as CSV in the current working directory.
df.to_csv(path_or_buf='Blockonomi_Link.csv')

# 6. Write All Info to a CSV File

## Function to scrape info from Blockonomi

In [121]:
def Scrap_blockonomi(myurl, timeout=30):
    """
    Scrape the metadata of one article page.

    myurl: URL of the article page
    timeout: seconds to wait for the server before giving up (default 30)

    return: tuple (Title, Author, Date, TagName, Views, Comments, Shares,
            Total_shares), all taken from the page text:
        Title        - text of the 'entry-title' element
        Author       - first whitespace-separated word of the 'hero-date' element
        Date         - part of the 'hero-date-span' text before the '/'
        TagName      - 'meta-tag' text split on ':', or NaN when there is no
                       such element
        Views        - text of the 'post-views-count' element
        Comments     - first word after the '/' in the 'hero-date-span' text
        Shares       - set of the 'essb_network_name' texts
        Total_shares - 'essb_t_nb' text without its last six characters
            When the page has no 'essb_t_nb' element, Total_shares is '0' and
            Shares is a fixed set of zero-count entries.
            The author's email is not scraped.

    raises AttributeError / IndexError when one of the required elements
    (title, date span, author, views) is missing or not in the expected form.
    """
    page = requests.get(myurl, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    Info = soup.find(class_='hero-date-span').get_text().split('/')
    Title = soup.find(class_='entry-title').get_text()
    Comments = Info[1].split()[0]
    Date = Info[0]
    Author = soup.find(class_='hero-date').get_text().split()[0]
    Views = soup.find(class_='post-views-count').get_text()

    tag_node = soup.find(class_='meta-tag')
    if tag_node is None:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0 and looking it up raises AttributeError.
        TagName = np.nan
    else:
        TagName = tag_node.get_text().split(':')

    total_node = soup.find(class_='essb_t_nb')
    if total_node is None:
        # No share counter on the page: report zero for every network.
        Total_shares = '0'
        Shares = {'Subscribe', 'Buffer0', 'Facebook0', 'LinkedIn0', 'Twitter0', 'Google+0'}
    else:
        Shares = {item.get_text() for item in soup.find_all(class_='essb_network_name')}
        # Drop the six-character suffix that follows the number.
        Total_shares = total_node.get_text()[:-6]
    return Title, Author, Date, TagName, Views, Comments, Shares, Total_shares

def DateFormat(mydate):
    """
    Convert a date written like 'May 10, 2018' into 'YYYY-MM-DD' text.

    mydate: string in '<month name> <day>, <year>' form
    return: the same date as a 'YYYY-MM-DD' string
    """
    parsed = datetime.strptime(mydate, '%B %d, %Y')
    # format() on a datetime delegates to strftime with the given pattern.
    return format(parsed, "%Y-%m-%d")


def Share_Info(myset):
    """
    Split share labels such as 'Facebook7' into a {network: count-text} dict.

    myset: iterable of label strings (network name immediately followed by
           its count); the 'Subscribe' entry is ignored. The argument is
           not modified.
    return: dict with a key for each of 'Buffer', 'Facebook', 'LinkedIn',
            'Twitter', 'Google+' that occurs in the labels; the value is
            the label text after the first len(network) characters
            (kept as a string)
    """
    # Checked in this order; the first network name contained in a label wins.
    networks = ('Buffer', 'Facebook', 'LinkedIn', 'Twitter', 'Google+')
    mydict = dict()
    for label in myset:
        # Skipped explicitly instead of discarding it from the caller's set,
        # so the input is left untouched and any iterable is accepted.
        if label == 'Subscribe':
            continue
        for network in networks:
            if network in label:
                mydict[network] = label[len(network):]
                break
    return mydict

## Write Info to CSV

In [116]:
# Small slice of the collected links (presumably kept for trying the scraper
# on a few pages).
testlink = ALLlink_list[395:404]
# The cell displays the total number of collected links.
len(ALLlink_list)

802

In [126]:
# Columns of the output table, in the order each row is filled in.
column_name = [
    'URL',
    'Article Name',
    'Author Name',
    'Email of Author',
    'Post Date',
    'Tag',
    'Total Likes',
    'Number of Views',
    'Number of Comments',
    'Total Shares',
    'Facebook',
    'Twitter',
    'Reddit',
    'Google plus',
    'Linkedin',
    'Telegram',
    'Pinterest',
    'Buffer',
    'Digg',
]
# Pre-allocate one empty row per collected link.
df = pd.DataFrame(index=range(len(ALLlink_list)), columns=column_name)

In [128]:
# Scrape every collected article and write its values into row i of `df`.
for i, article_url in enumerate(ALLlink_list):
    mydata = Scrap_blockonomi(article_url)
    article_name = mydata[0]
    author_name = mydata[1]
    # The last character of the raw date text is dropped before parsing
    # (assumed to be the whitespace that precedes the '/' separator).
    post_date = DateFormat(mydata[2][:-1])
    try:
        # mydata[3] is the tag text split on ':'; element 1 is the part
        # after the first colon.
        tag_name = mydata[3][1]
    except (TypeError, IndexError):
        # TypeError: the article has no tag element (NaN placeholder).
        # IndexError: the tag text contains no ':'.
        # np.nan rather than np.NAN: the alias was removed in NumPy 2.0.
        tag_name = np.nan
    number_views = int(mydata[4].replace(',', ''))
    number_comment = int(mydata[5])
    share_info_dict = Share_Info(mydata[6])
    total_shares = mydata[7]

    # Columns with no scraped source (email, likes, Reddit, Telegram,
    # Pinterest, Digg) are left as NaN.
    df.iloc[i] = [article_url, article_name, author_name, np.nan, post_date,
                  tag_name, np.nan, number_views, number_comment, total_shares,
                  share_info_dict.get('Facebook'), share_info_dict.get('Twitter'),
                  np.nan, share_info_dict.get('Google+'), share_info_dict.get('LinkedIn'),
                  np.nan, np.nan, share_info_dict.get('Buffer'), np.nan]
    print(i, 'Done')

0 Done
1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
9 Done
10 Done
11 Done
12 Done
13 Done
14 Done
15 Done
16 Done
17 Done
18 Done
19 Done
20 Done
21 Done
22 Done
23 Done
24 Done
25 Done
26 Done
27 Done
28 Done
29 Done
30 Done
31 Done
32 Done
33 Done
34 Done
35 Done
36 Done
37 Done
38 Done
39 Done
40 Done
41 Done
42 Done
43 Done
44 Done
45 Done
46 Done
47 Done
48 Done
49 Done
50 Done
51 Done
52 Done
53 Done
54 Done
55 Done
56 Done
57 Done
58 Done
59 Done
60 Done
61 Done
62 Done
63 Done
64 Done
65 Done
66 Done
67 Done
68 Done
69 Done
70 Done
71 Done
72 Done
73 Done
74 Done
75 Done
76 Done
77 Done
78 Done
79 Done
80 Done
81 Done
82 Done
83 Done
84 Done
85 Done
86 Done
87 Done
88 Done
89 Done
90 Done
91 Done
92 Done
93 Done
94 Done
95 Done
96 Done
97 Done
98 Done
99 Done
100 Done
101 Done
102 Done
103 Done
104 Done
105 Done
106 Done
107 Done
108 Done
109 Done
110 Done
111 Done
112 Done
113 Done
114 Done
115 Done
116 Done
117 Done
118 Done
119 Done
120 Done
121 Done
122 Done
123

In [129]:
# Display the table of scraped article information.
df

Unnamed: 0,URL,Article Name,Author Name,Email of Author,Post Date,Tag,Total Likes,Number of Views,Number of Comments,Total Shares,Facebook,Twitter,Reddit,Google plus,Linkedin,Telegram,Pinterest,Buffer,Digg
0,https://blockonomi.com/dollar-cost-averaging/,Cryptocurrency Investing Using the Dollar Cost...,Oliver,,2018-05-10,"DCA, Dollar Cost Averaging, Investing in Crypt...",,1935,0,24,7,4,,12,0,,,1,
1,https://blockonomi.com/coinbase-alternatives/,Alternatives to Coinbase: Buy & Sell Cryptocur...,Louis,,2018-03-05,"Alternative to Coinbase, Buy Cryptocurrency In...",,6693,0,12,1,4,,6,0,,,1,
2,https://blockonomi.com/buy-bitcoin-india/,How to Buy Bitcoin in India: Guide to the Best...,Max,,2018-02-27,"bitcoin, BTC, Buy Bitcoin in India, Buy Bitcoi...",,2074,0,9,7,1,,0,0,,,1,
3,https://blockonomi.com/investing-cryptocurrency/,Investing In Cryptocurrency: Complete Beginner...,William,,2017-12-11,"Crypto investing, Crypto strategy, cryptocurre...",,10305,1,35,21,7,,6,1,,,0,
4,https://blockonomi.com/buy-bitcoin-anonymously/,How to Buy Bitcoin Anonymously Without I.D,William,,2017-12-06,"buy bitcoin, buy bitcoin anonymously, buy bito...",,30367,7,6,2,2,,2,0,,,0,
5,https://blockonomi.com/bitcoin-atms/,Guide to Bitcoin ATMs: A Brief Primer on Buyin...,William,,2017-11-24,"ATM, bitcoin, Bitcoin ATM, Bitcoin ATM Near Me",,3606,0,17,2,6,,5,4,,,0,
6,https://blockonomi.com/ico-guide/,Beginner’s Guide to ICOs: 7 Tips for Picking W...,William,,2017-11-29,"ico, ICO investing",,9409,1,41,11,4,,8,18,,,0,
7,https://blockonomi.com/buy-bitcoin-with-paypal/,How to Buy Bitcoin With PayPal,Oliver,,2018-01-03,"bitcoin, How to Buy Bitcoin With With PayPal, ...",,58479,6,26,7,7,,9,3,,,0,
8,https://blockonomi.com/buy-bitcoin/,How to Buy Bitcoin Instantly using a Credit or...,Oliver,,2018-01-03,"buy bitcoin credit card, buy bitcoin debit car...",,19696,1,15,8,5,,2,0,,,0,
9,https://blockonomi.com/buy-ethereum/,How to Buy Ethereum Instantly using a Credit o...,Oliver,,2018-01-03,"buy ethereum, buy ethereum with credit card, H...",,76586,10,14,2,4,,8,0,,,0,


In [131]:
# Save the scraped article table as CSV in the current working directory.
df.to_csv('Blockonomi_Info.csv')