## [Python] Web Scraping - ThreatPost.com OOized
This code scrapes the meta-data of articles and their contents from threatpost.com, as an example for testing. <br />
<b>Date:</b> 2017/11/20<br />
<b>Loc:</b> III, Taipei, TAIWAN(R.O.C.)<br />
<b>Coder:</b> <i>Chan, TooDou, Chun-Hsiang</i>

## Import packages

In [1]:
# https://threatpost.com/category/cloud-security/
# import packages
import numpy as np
import pandas as pd
import os
import datetime
import sys
from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen

## Define Class and Functions

In [2]:
class Website():
    """Scraper for a single article-listing page (e.g. a category index).

    Typical use: construct with the page URL, call
    ``get_index_htmlContents()`` to download and parse it, then call
    ``get_info_ofEach_page()`` and/or ``get_num_page()`` to read data out of
    the parsed page.

    Attributes:
        index_site: URL of the listing page.
        index_htmlContent: parsed page (BeautifulSoup object); ``''`` until
            ``get_index_htmlContents()`` has been called.
        num_page: highest page number found in the pagination links.
        num_article: number of ``<article>`` elements found on the page.
        page_df: DataFrame with one row per article; ``''`` until
            ``get_info_ofEach_page()`` has been called.
        page_info_*: the individual meta-data columns as plain lists.
    """

    def __init__(self, index_site):
        self.index_site = index_site
        self.index_htmlContent = ''
        self.num_page = 0
        self.num_article = 0
        self.page_df = ''
        self.page_info_categories = []
        self.page_info_title = []
        self.page_info_date = []
        self.page_info_author = []
        self.page_info_url = []

    def get_index_htmlContents(self):
        """Download ``index_site`` and parse it with BeautifulSoup.

        Returns:
            The parsed page, which is also stored in ``self.index_htmlContent``.

        Raises:
            urllib.error.URLError / HTTPError: if the request fails.
        """
        # send a browser-like User-Agent with the request
        req = Request(self.index_site, headers={'User-Agent': 'Mozilla/5.0'})
        # the context manager closes the HTTP response once the body is read
        with urlopen(req) as response:
            webpage = response.read()
        # html parser via beautifulsoup
        self.index_htmlContent = BeautifulSoup(webpage, "html.parser")
        return self.index_htmlContent

    def get_num_page(self):
        """Return the number of listing pages advertised by the pagination.

        The largest numeric label among the ``.page-numbers`` elements is
        used, so non-numeric entries (previous/next links, ellipses) are
        ignored wherever they appear. Thousands separators are tolerated.
        A page without any pagination counts as a single page.

        Returns:
            int: the page count (>= 1), also stored in ``self.num_page``.
        """
        soupIndex = self.index_htmlContent
        page_numbers = []
        for node in soupIndex.select('.page-numbers'):
            label = node.text.strip().replace(',', '')
            if label.isdigit():
                page_numbers.append(int(label))
        self.num_page = max(page_numbers) if page_numbers else 1
        return self.num_page

    def get_info_ofEach_page(self, testPrint):
        """Extract the meta-data of every article listed on the parsed page.

        Args:
            testPrint: pass ``1`` to print the first rows of the resulting
                table; any other value prints nothing.

        Returns:
            tuple: ``(dataframe, categories, titles, dates, authors, urls)``.
            The same values are stored on the instance (``page_df`` and the
            ``page_info_*`` lists) and ``num_article`` is updated.

        Raises:
            IndexError: if the page has no ``#latest-posts`` container, or an
                article lacks one of the expected elements.
        """
        soupIndex = self.index_htmlContent
        # choose the news part
        full_latest_post = soupIndex.select('#latest-posts')[0]
        # look the articles up once instead of once per loop iteration
        articles = full_latest_post.select('article')
        self.num_article = len(articles)
        # lists for storing the article meta-data
        info_categories = []
        info_title = []
        info_date = []
        info_author = []
        info_url = []
        for article_contents in articles:
            post_info = article_contents.select('.post-info')
            # The fixed slices drop a leading label/whitespace and one trailing
            # character from each element's text.
            # NOTE(review): the offsets (14, 1, 4) are tied to the site's
            # markup at the time of writing - confirm if the layout changes.
            info_categories.append(article_contents.select('.categories')[0].text[14:-1])
            info_title.append(article_contents.select('.entry-title')[0].text[1:-1])
            info_date.append(post_info[0].text[1:-1])
            info_author.append(post_info[1].text[4:-1])
            # the first link inside the article is taken as the article url
            info_url.append(article_contents.a['href'])

        # save to single dataframe (one row per article)
        threatPost_com = pd.DataFrame(
            {
                'categories': info_categories,
                'title': info_title,
                'date': info_date,
                'author': info_author,
                'url': info_url,
            },
            columns=['categories', 'title', 'date', 'author', 'url'],
        )
        # A bare `.head()` inside a method displays nothing, so print it
        # explicitly when a preview is requested.
        if testPrint == 1:
            print(threatPost_com.head())
        # save to class attributes
        self.page_df = threatPost_com
        self.page_info_categories = info_categories
        self.page_info_title = info_title
        self.page_info_date = info_date
        self.page_info_author = info_author
        self.page_info_url = info_url

        return threatPost_com, info_categories, info_title, info_date, info_author, info_url

## Obtain all article info from the index.html

In [3]:
# Landing page of the category to scrape; the cells below use it as the
# starting point for every request.
target_url = 'https://threatpost.com/category/cloud-security/'

In [4]:
# build a scraper for the first (landing) page of the category
threatComWeb = Website(target_url)
# download + parse the page, then extract the per-article meta-data
# (testPrint=0: skip the preview branch)
Index_html_Contents = threatComWeb.get_index_htmlContents()
Index_MetaData = threatComWeb.get_info_ofEach_page(0)
# meta-data table of the articles listed on the first page
df = threatComWeb.page_df

## Count the number of listing pages in the category

In [5]:
# how many listing pages this category has, read from the pagination
# elements of the page parsed above
num_page = threatComWeb.get_num_page()
print(num_page)

20


## Scrape the meta-info of all articles

In [6]:
# Scrape every listing page of the category and combine the results in `af`.
# The per-page tables are collected in a list and concatenated once at the
# end, instead of re-copying the growing table on every iteration. The
# first-page table `df` from the earlier cell is left untouched.
page_frames = []
for i in range(1, num_page + 1):
    # page 1 is the category landing page; later pages live under 'page/<n>/'
    if i == 1:
        target_url_others = target_url
    else:
        target_url_others = target_url + 'page/' + str(i) + '/'
    # set the class
    threatComWeb_others = Website(target_url_others)
    # download + parse the page, then extract the article meta-data
    Index_html_Contents_others = threatComWeb_others.get_index_htmlContents()
    Index_MetaData_others = threatComWeb_others.get_info_ofEach_page(0)
    page_frames.append(threatComWeb_others.page_df)

if page_frames:
    # ignore_index=True renumbers the rows 0..N-1 across all pages
    af = pd.concat(page_frames, ignore_index=True)
else:
    # no pages were scraped: keep `af` defined with the expected columns
    af = pd.DataFrame(columns=['categories', 'title', 'date', 'author', 'url'])

In [7]:
# preview the first rows of the combined meta-data table
af.head()

Unnamed: 0,categories,title,date,author,url
0,"Cloud Security, Hacks, Privacy",Amazon Promises Fix to Stop Key Service Hack,"November 17, 2017 , 1:59 pm",Tom Spring,https://threatpost.com/amazon-promises-fix-for...
1,"Cloud Security, Privacy, Web Security",Data Pours from Cloud—And ‘The Enemy is Us’,"November 6, 2017 , 8:00 am",Tom Spring,https://threatpost.com/data-pours-from-cloud-a...
2,"Cloud Security, Hacks, Privacy, Vulnerabili...",WordPress Delivers Second Patch For SQL Inject...,"November 1, 2017 , 2:35 pm",Tom Spring,https://threatpost.com/wordpress-delivers-seco...
3,"Cloud Security, Featured, Privacy, Vulnerab...",Slack Plugs ‘Severe’ SAML User Authentication ...,"October 27, 2017 , 8:00 am",Tom Spring,https://threatpost.com/slack-plugs-severe-saml...
4,"Cloud Security, Cryptography, Vulnerabilitie...",Cisco Warns 69 Products Impacted by KRACK,"October 20, 2017 , 7:00 am",Tom Spring,https://threatpost.com/cisco-warns-69-products...


## Export meta-info article data

In [8]:
# Persist the combined table; both paths are relative to the current working directory.
# CSV: no `index` argument is passed, so pandas' default applies and the row
# index is written as the first, unnamed column.
af.to_csv('threatpost_com_metaInfo.csv')
# Pickle: stores the DataFrame object itself for reloading with pandas.
# Only unpickle files from a trusted source.
af.to_pickle('threatpost_com_metaInfo.pkl')