# Build a Corpus
So far we have only been playing with data that is already available, but in the real world data is rarely ready-made for feature extraction.

This notebook demonstrates how to gather data on your own, based on your requirements.

We use [beautifulsoup4](https://pypi.org/project/beautifulsoup4/) and [newspaper3k](https://pypi.org/project/newspaper3k/0.2.2/) for this project. You can install them using the commands below:


```
pip install beautifulsoup4

pip install newspaper3k
```

Scraping is not encouraged, as it creates an unnecessary burden on servers and you might be blocked from visiting the website again.




In [0]:
# importing required libraries
!pip install beautifulsoup4
!pip install newspaper3k
import atexit
import os
import random
import urllib
import urllib.parse
import uuid
from time import sleep, time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException


In [0]:
# URL template of the page to scrape; the "%s" placeholder is filled in with
# the URL-quoted search keyword when the request is made.

POCKET_BASE_URL = "https://getpocket.com/explore/%s"

In [0]:
# Decide which fields you need from each page and inspect the page structure
# (e.g. headings) before scraping, so you know where those fields live.
# One row is stored per scraped article; the article body itself goes to a
# separate text file whose name is kept in the "file_name" column.

column_names = ['title', 'excerpt', 'url', 'file_name', "keyword", "category"]
df = pd.DataFrame(columns=column_names)

In [0]:
# Make sure to store the data in CSV file
@atexit.register
def save_dataframe():
    """Write the module-level ``df`` to a timestamped CSV file.

    Registered with ``atexit`` so the rows collected so far are saved when
    the interpreter exits. The file name embeds the current ``time()`` value
    and the DataFrame index is not written.
    """
    timestamp = time()
    df.to_csv("dataframe_{0}.csv".format(timestamp), index=False)
# Randomize the crawl order to reduce the risk of getting banned
# Materialize the (category name, keywords) pairs as a list so that they can
# be shuffled in place.
categories = [*CATEGORIES.items()]
random.shuffle(categories)

In [0]:
# Crawl Pocket's explore page for every keyword of every category, download
# each listed article, store its text under ./data/files/ and append one row
# per successfully extracted article to `df`.

# The text files are written below this directory; create it up front so the
# first write does not fail on a fresh checkout.
os.makedirs('./data/files', exist_ok=True)

for category_name, keywords in categories:
    print("Exploring Category=\"{0}\"".format(category_name))
    for kw in keywords:
        # Get trending content from Pocket's explore endpoint. A timeout keeps
        # one stalled connection from hanging the whole crawl.
        try:
            result = requests.get(
                POCKET_BASE_URL % urllib.parse.quote_plus(kw), timeout=30)
        except requests.RequestException as e:
            print("Request for keyword \"{0}\" failed: \"{1}\"".format(kw, str(e)))
            continue

        # Extract the media items
        # NOTE(review): the "html5lib" parser needs the html5lib package;
        # confirm it is installed in the notebook environment.
        soup = BeautifulSoup(result.content, "html5lib")
        media_items = soup.find_all(attrs={'class': 'media_item'})

        for item_html in media_items:
            # Skip entries that lack a title link or an excerpt instead of
            # crashing on a slightly different page layout.
            title_html = item_html.find(attrs={'class': 'title'})
            excerpt_html = item_html.find(attrs={'class': 'excerpt'})
            link = title_html.a if title_html is not None else None
            url = link.get('data-saveurl') if link is not None else None
            if excerpt_html is None or not url:
                print("Skipping media item with missing title, url or excerpt")
                continue

            title = title_html.text
            print("Indexing article: \"{0}\" from \"{1}\"".format(title, url))
            excerpt = excerpt_html.text

            # Download and parse the full article text
            try:
                article = Article(url)
                article.download()
                article.parse()
                content = article.text
            except ArticleException as e:
                print("Encoutered exception when parsing \"{0}\": \"{1}\"".format(url, str(e)))
                continue

            if not content:
                print("Couldn't extract content from \"{0}\"".format(url))
                continue

            # Save the text file. The encoding is explicit so that non-ASCII
            # article text does not depend on the platform default.
            file_name = "{0}.txt".format(str(uuid.uuid4()))
            with open('./data/files/{0}'.format(file_name), 'w', encoding='utf-8') as text_file:
                text_file.write(content)

            # Append the row in our dataframe
            df.loc[len(df)] = [title, excerpt, url, file_name, kw, category_name]

            # Need to sleep in order to not get blocked
            sleep(random.randint(5, 15))