In [3]:
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup

import gc
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

### 1.0 Get a list of webpages to scrape

In [3]:
# Blog labels whose listing pages seed the crawl.
sections = ["lifestyle", "beauty", "book", "resepi"]

# Browser-like User-Agent sent with every HTTP request in this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.76 Safari/537.36'
    )
}

In [66]:
def crawl(url, max_retries=5, timeout=30):
    """Collect post links from one page into the module-level ``hrefs`` list.

    Downloads ``url``, looks for the ``div`` whose class is ``main section``
    and, for every ``h3`` with class ``post-title entry-title`` inside it,
    appends the ``href`` of its first anchor to ``hrefs``. The ``hrefs`` list
    and the ``headers`` dict must already exist at module level.

    Parameters
    ----------
    url : str
        Address of the page to download.
    max_retries : int, optional
        Maximum number of download attempts before giving up.
    timeout : float, optional
        Seconds to wait for the server on each attempt.

    Returns
    -------
    None
        Results are reported only through the ``hrefs`` side effect. The
        function returns quietly when the page cannot be downloaded or
        contains no ``main section`` block.
    """
    # A malformed or non-HTTP address fails identically on every attempt,
    # so retrying it would only loop without progress.
    permanent_errors = (
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidSchema,
        requests.exceptions.InvalidURL,
    )

    r = None
    for attempt in range(max_retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            break
        except permanent_errors as e:
            print(e)
            return
        except requests.exceptions.RequestException as e:
            print(e)
            # Pause before the next attempt, but not after the last one.
            if attempt + 1 < max_retries:
                time.sleep(1.0)

    if r is None:
        # Every attempt failed; skip this page instead of blocking the crawl.
        return

    soup = BeautifulSoup(r.text, "lxml")

    block = soup.find('div', attrs={"class": "main section"})

    if block is None:
        return

    for link in block.find_all('h3', attrs={"class": "post-title entry-title"}):
        anchor = link.find('a')
        if anchor is None:
            # Heading without a link: nothing to collect.
            continue
        href = anchor.get('href')
        if href:
            # Skip anchors with a missing or empty href so None never lands in hrefs.
            hrefs.append(href)

This website's pagination links **aren't numbered and don't follow any predictable order**; e.g., https://www.leaazleeya.com/search/label/lifestyle?updated-max=2023-06-24T01:04:00%2B08:00&max-results=20&start=6&by-date=false is what we get when we click the `OLDER POSTS` button. That's troublesome. Instead, I'll first scrape the first page of every section (e.g., lifestyle, beauty, etc.) for the `OLDER POSTS` link at the bottom of the page (if it exists), then follow that link to the next page and repeat until there is no `OLDER POSTS` link left.

In [55]:
# Build `pages`: for every section, the label listing page followed by each
# page reached through the chain of "older posts" pager links.
pages = []
i = 0  # counts only the pager links followed; the seed label pages are not included
for t in sections:
    print(t)
    url = f'https://www.leaazleeya.com/search/label/{t}'
    pages.append(url)
    while True:
        # A timeout keeps one stalled response from hanging the whole cell.
        r = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(r.content, "lxml")
        older = soup.find('a', attrs={'class': 'blog-pager-older-link'})
        a = older.get('href') if older is not None else None
        # Stop at the last page, and also if the pager points back to a page
        # already recorded (which would otherwise loop forever).
        if not a or a in pages:
            break
        url = a
        pages.append(url)
        i += 1

print(f'Links collected: {i}')

lifestyle
beauty
book
resepi
Links collected: 52


In [73]:
max_worker = 10  # not used in this cell

# Module-level accumulator that crawl() appends to.
hrefs = []
# Anchors already handed to crawl(); the same links repeat on every listing
# page, so remembering them avoids downloading the same URL again and again.
crawled = set()
for t in pages:
    r = requests.get(t, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, "lxml")
    a = soup.find_all('a')
    a = [a_.get('href') for a_ in a if a_.get('href')]

    # NOTE(review): this follows every anchor found on the listing page and
    # collects post links from the pages they lead to, rather than parsing the
    # listing page `t` itself — confirm this one-hop crawl is intended.
    for href in a:
        if href in crawled:
            continue
        crawled.add(href)
        # Relative, fragment, javascript: and mailto: links cannot be fetched
        # over HTTP, so there is nothing to crawl behind them.
        if not href.startswith(('http://', 'https://')):
            continue
        crawl(href)

# De-duplicate, drop empty entries and sort so the saved file is reproducible.
hrefs2 = sorted({h for h in hrefs if h})
print(f'Num. of unique links: {len(hrefs2)}')
# Write mode (not append): re-running the cell must replace the file, because
# two JSON arrays appended back to back cannot be parsed by json.load.
with open('leaazleeya-link.json', 'w') as f:
    json.dump(hrefs2, f)

Num. of unique links: 544


### 2.0 Get webpage content (headers, paragraphs, links, etc.)

In [94]:
# Reload the saved link list from disk into `url`, the work list for the
# content-scraping step.
url = []
with open('leaazleeya-link.json') as fopen:
    href = json.loads(fopen.read())
url += href

In [95]:
def process_url(x, max_retries=5, timeout=30):
    """Download one page and extract its headline and body text.

    The headline is the text of the first ``h3`` with class
    ``post-title entry-title``; the content is the text of the first ``div``
    with class ``post-body entry-content``. Requests are sent with the
    module-level ``headers`` dict.

    Parameters
    ----------
    x : str
        Address of the page to download.
    max_retries : int, optional
        Maximum number of download attempts before giving up.
    timeout : float, optional
        Seconds to wait for the server on each attempt.

    Returns
    -------
    dict or None
        ``{'url', 'headline', 'content'}`` on success; ``None`` when the page
        cannot be downloaded or lacks the expected title/body elements.
    """
    # A malformed or non-HTTP address fails identically on every attempt,
    # so retrying it would only loop without progress.
    permanent_errors = (
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidSchema,
        requests.exceptions.InvalidURL,
    )

    r = None
    for attempt in range(max_retries):
        try:
            r = requests.get(x, headers=headers, timeout=timeout)
            break
        except permanent_errors as e:
            print(f'error in link:{x}')
            print(e)
            return None
        except requests.exceptions.RequestException as e:
            print(e)
            # Pause before the next attempt, but not after the last one.
            if attempt + 1 < max_retries:
                time.sleep(5.0)

    if r is None:
        print(f'error in link:{x}')
        print(f'giving up after {max_retries} failed attempts')
        return None

    soup = BeautifulSoup(r.text, "lxml")

    title_tag = soup.find('h3', class_="post-title entry-title")
    body_tag = soup.find('div', class_="post-body entry-content")
    if title_tag is None or body_tag is None:
        # Pages without both elements are reported and skipped.
        print(f'error in link:{x}')
        print('missing post title or post body')
        return None

    return {'url': x, 'headline': title_tag.text, 'content': body_tag.text}

In [None]:
max_worker = 20

# Scrape every link in `url` in batches of `max_worker`, appending one JSON
# object per line to the output file. The thread pool and the output file are
# created once for the whole run instead of once per batch / once per record.
with ThreadPoolExecutor(max_workers=max_worker) as executor:
    with open('leaazleeya-complete-batch.jsonl', 'a') as final:
        for i in tqdm(range(0, len(url), max_worker)):
            gc.collect()
            futures = {executor.submit(process_url, t): t for t in url[i: i + max_worker]}

            for future in as_completed(futures):
                try:
                    result = future.result()
                except Exception as e:
                    # One failing page must not abort the rest of the run.
                    print('error in link:' + str(futures[future]))
                    print(e)
                    continue
                if result:
                    json.dump(result, final)
                    final.write('\n')

            # Push the finished batch to disk so an interrupted run keeps it.
            final.flush()

In [100]:
# Sanity check: read the scraped records back (one JSON object per line)
# and print the resulting frame.
test = pd.read_json(path_or_buf='leaazleeya-complete-batch.jsonl', lines=True)
print(test)

                                                  url  \
0   https://www.leaazleeya.com/2016/06/dulu-suka-m...   
1   https://www.leaazleeya.com/2016/11/novel-siti-...   
2   https://www.leaazleeya.com/2018/05/ikan-talapi...   
3   https://www.leaazleeya.com/2023/05/cepat-penat...   
4   https://www.leaazleeya.com/2017/09/review-nove...   
5   https://www.leaazleeya.com/2015/12/aku-ingin-b...   
6   https://www.leaazleeya.com/2016/09/tazkirah-ju...   
7   https://www.leaazleeya.com/2018/09/yakin-cerah...   
8   https://www.leaazleeya.com/2016/08/dapat-hadia...   
9   https://www.leaazleeya.com/2018/05/bb-jelly-lo...   
10  https://www.leaazleeya.com/2023/04/tena-memili...   
11  https://www.leaazleeya.com/2014/05/lagu-time-k...   
12  https://www.leaazleeya.com/2013/11/indah-perja...   
13  https://www.leaazleeya.com/2021/06/tip-untuk-m...   
14  https://www.leaazleeya.com/2016/03/solat-lah.html   
15  https://www.leaazleeya.com/2015/02/buatmu-seor...   
16  https://www.leaazleeya.com/

In [101]:
# Show the scraped body text of the row labelled 0.
test.loc[0, 'content']

'\n\n\n\nAssalamualaikum wbt..\n\n\n\n\n\n\n\nKorang suka make up tak??? ehh aku tanye kaum hawa aje yek.. ekeke... hurmm ni nak cite ni.. Dolu-dolu masa zaman perang aku dulu, aku memang suka sangat make up ni.. Pantang keluar aje mesti kene make up cantik-cantik walaupun dah tau tak berapa nak cantik.. haha.. Fefiling cantik laaa bila dah make up tu.. hoho.. gile ahhh..!!\n\n\n\nDolu-dolu make up mau tebal 8 henci.. Foundation mesti mau yang thorrrbaeekkk punye.. Pantang ada yang buat iklan make up yang gut-gut mesti nak beli punye.. Foundation yang paling best aku guna naturactor... hoh kesan dia fuhhh mabeles... thorrrbaiikk... yang lain hareemmmm tak best..!!\n\n\n\n\n\nFoundation Naturactor\n\n\n\nNi la foundationnya.. Tapi dulu banyak yang tak ori.. Susah benar nak jumpe yang ori.. Yang tak ori memang hampeh la hasilnya.. Aku usaha carik yang ori sampailah jumpa.. Alhamdulillah akhirnya jumpa.. haha.. Gile ahhh sekali agik..!! Jom layan gamboo lama-lama bersawang aku..\n\n\n\n\n