### **Installations**

In [None]:
import os
# Work from the project folder so that relative paths used later
# (such as ./storage) resolve inside it.
SCRAPE_DIR = "/content/drive/My Drive/Colab Notebooks/Capstone/Crawling medium/scrape_all"
os.chdir(SCRAPE_DIR)

In [None]:
!pip install fake-useragent

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# from get_proxies import get_proxies
# from get_name import get_name
import requests
import os.path
from os import path
import calendar

### **Functions to scrape**
- ```get_name```: converts a URL into a punctuation-free string that can be used as a storage file name.
- ```scraper```: scrapes a Medium publication given its URL and a year; it crawls the archive page of every day in that year and collects up to 10 posts per day.

In [None]:
def get_name(test_str):
    '''
    Strip punctuation and spaces from a string so it can be used in a file name.

    test_str: the text to clean (e.g. a publisher URL)
    returns: test_str with every character listed in `punc` removed
    '''

    # Characters to drop. A raw string keeps the backslash literal without
    # relying on an invalid escape sequence, which newer Python versions warn about.
    punc = r'''!()-[]{};:'"\, <>./?@#$%^&*_~ '''

    # Delete all of them in a single pass instead of one replace() per character.
    return test_str.translate(str.maketrans('', '', punc))

In [None]:
def _parse_story_preview(story):
    '''
    Extract the archive-page fields of one story preview tag.

    story: the BeautifulSoup tag of a single post preview
    returns: a dict with title, subtitle, claps, responses, author_url,
             story_url and reading_time, or None when the preview lacks the
             author link, the reading time or the link to the story itself
    '''
    author_box = story.find('div', class_='postMetaInline u-floatLeft u-sm-maxWidthFullWidth')
    if author_box is None:
        return None

    author_link = author_box.find('a')
    reading_tag = author_box.find('span', class_='readingTime')
    story_link = story.find('a', class_='button button--smaller button--chromeless u-baseColor--buttonNormal')
    if author_link is None or reading_tag is None or story_link is None:
        return None

    author_url = author_link.get('href')
    story_url = story_link.get('href')
    # The title attribute is split on whitespace and only its first token is kept.
    reading_parts = (reading_tag.get('title') or '').split()
    if author_url is None or story_url is None or not reading_parts:
        return None

    title_tag = story.find('h3')
    subtitle_tag = story.find('h4')

    claps_button = story.find('button', class_='button button--chromeless u-baseColor--buttonNormal'
                                               ' js-multirecommendCountButton u-disablePointerEvents')
    responses_link = story.find('a', class_='button button--chromeless u-baseColor--buttonNormal')

    # Keep only the leading token of the responses text (e.g. the count).
    responses_parts = (responses_link.text if responses_link is not None else '0 responses').split()

    return {
        'title': title_tag.text if title_tag is not None else '-',
        'subtitle': subtitle_tag.text if subtitle_tag is not None else '-',
        'claps': claps_button.text if claps_button is not None else 0,
        'responses': responses_parts[0] if responses_parts else '0',
        'author_url': author_url,
        'story_url': story_url,
        'reading_time': reading_parts[0],
    }


def _scrape_story_body(story_url, timeout):
    '''
    Download one story page and collect the text found inside its <section> tags.

    returns: (section_titles, story_paragraphs) where section_titles holds the
             text of every <h1> and story_paragraphs the text of every <p>
    raises: requests.RequestException when the page cannot be fetched
    '''
    story_page = requests.get(story_url, timeout=timeout)
    story_soup = BeautifulSoup(story_page.text, 'html.parser')

    section_titles = []
    story_paragraphs = []
    for section in story_soup.find_all('section'):
        story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
        section_titles.extend(sub.text for sub in section.find_all('h1'))

    return section_titles, story_paragraphs


def scraper(url = 'https://towardsdatascience.com', year = 2020, timeout = 30):
    '''
    scrape all year content (up to 10 posts/day) and store it in a .csv file

    url: the url for the publishers on Medium
    year: the year whose daily archive pages ({url}/archive/YYYY/MM/DD) are crawled
    timeout: seconds to wait for each HTTP request before giving up on it
    returns: the path of the .csv file; if that file already exists nothing
             is scraped and the existing path is returned

    Pages that cannot be downloaded and previews missing required fields are
    skipped (with a printed message for download failures) so that one bad
    page does not discard everything collected so far.
    '''

    # file storing
    name = get_name(url)
    name = f'./storage/{name}{year}.csv'
    print(f"File exists: {name}: " + str(path.exists(name)))

    if path.exists(name):
        return name

    # Create the output folder before crawling so the run cannot fail at the
    # final to_csv() call just because ./storage is missing.
    os.makedirs(path.dirname(name), exist_ok=True)

    # crawling
    ua = UserAgent()

    stories_data = []

    # loop through the months
    for month in range(1, 13):
        num_days = calendar.monthrange(year, month)[1]

        # looping through the dates
        for day in range(1, num_days + 1):
            # Zero-padded day and month, as used in the archive URL and the date column.
            date = f'{day:02d}/{month:02d}/{year}'
            archive_url = f'{url}/archive/{year}/{month:02d}/{day:02d}'
            print(archive_url)

            headers = {'User-Agent': ua.random}

            # crawling with fake user agent
            try:
                page = requests.get(archive_url, headers = headers, timeout = timeout)
            except requests.RequestException as err:
                print(f'Skipping {archive_url}: {err}')
                continue
            soup = BeautifulSoup(page.text, 'html.parser')

            # Slicing caps the day at 10 previews and is a no-op for shorter lists.
            stories = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')[:10]

            # extract key information for each post
            for story in stories:
                preview = _parse_story_preview(story)
                if preview is None:
                    continue

                try:
                    section_titles, story_paragraphs = _scrape_story_body(preview['story_url'], timeout)
                except requests.RequestException as err:
                    print(f"Skipping {preview['story_url']}: {err}")
                    continue

                # Field order must match the `columns` list below.
                stories_data.append([
                    date,
                    preview['title'],
                    preview['subtitle'],
                    preview['claps'],
                    preview['responses'],
                    preview['author_url'],
                    preview['story_url'],
                    preview['reading_time'],
                    len(section_titles),
                    section_titles,
                    len(story_paragraphs),
                    story_paragraphs,
                ])

            print(f'{len(stories_data)} stories scraped so far.')

    # storing all in a .csv file
    columns = ['date', 'title', 'subtitle', 'claps', 'responses', 'author_url', 'story_url',
               'reading_time (mins)', 'number_sections', 'section_titles', 'number_paragraphs', 'paragraphs']

    df = pd.DataFrame(stories_data, columns=columns)

    df.to_csv(name, index=False)
    return name

In [None]:
# Medium publication to crawl; passed to scraper() as its base URL.
url = 'https://towardsdatascience.com'
# Years to scrape; the loop in the next cell calls scraper() once per year.
years = [2019, 2020]

In [None]:
# Run the scraper once per configured year and print a banner line plus
# an empty line after each run to separate the logs.
banner = "##################################################"
for year in years:
    print(url, year)
    scraper(url, year)
    print(banner)
    print("")

https://towardsdatascience.com 2019
File exists: ./storage/httpstowardsdatasciencecom2019.csv: False
https://towardsdatascience.com/archive/2019/01/01
10 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/02
20 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/03
30 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/04
40 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/05
50 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/06
60 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/07
70 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/08
80 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/09
90 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/10
100 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/11
110 stories scraped so far.
https://towardsdatascience.com/archive/2019/01/12
