In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
# HTTP headers for the scraper: the requests defaults with the User-Agent
# replaced by a desktop Firefox identifier.
heads = requests.utils.default_headers()
heads['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'

In [3]:
# helper functions to retrieve details about each post
def getUser(header):
    """Return the text of the ``<a class="user">`` tag in a post header.

    Returns an empty string when the header has no such tag.
    """
    anchor = header.find("a", class_="user")
    return anchor.get_text() if anchor else ''

def getTimestamp(header):
    """Build a ``'<date> <time>'`` string from the last child of a post header.

    The time is the first child of the timestamp tag's first child. The date
    depends on how many children the timestamp tag has:

    * 3 children: the first child of the last one;
    * 6 children: the first children of items 2 and 4, joined by a space
      (presumably month/day and year -- confirm against the page markup);
    * anything else: an empty date, so the result starts with a space.
    """
    stamp = header.contents[-1]

    # NOTE(review): this truthiness check only guards the time lookup;
    # ``stamp.contents`` is still read unconditionally just below.
    time = stamp.contents[0].contents[0] if stamp else ''

    n_parts = len(stamp.contents)
    if n_parts == 3:
        date = stamp.contents[-1].contents[0]
    elif n_parts == 6:
        date = stamp.contents[2].contents[0] + ' ' + stamp.contents[4].contents[0]
    else:
        date = ''

    return '{} {}'.format(date, time)

def getLikesShares(body):
    """Return ``[likes, shares]`` as text taken from a post body.

    The values are the text of the first and second ``<b>`` tags inside the
    body's ``<p class="s">`` element. Both are empty strings when that
    paragraph is absent.
    """
    counters = body.find("p", class_='s')
    if not counters:
        return ['', '']
    bold_tags = counters.find_all('b')
    return [bold_tags[0].get_text(), bold_tags[1].get_text()]

def getQuote(body):
    """Collect the link targets of the posts quoted in a post body.

    Every ``<blockquote>`` inside the body's ``div.narrow`` is inspected; for
    each one that contains an anchor, the ``href`` of its first anchor is
    added to the returned list.
    """
    narrow = body.find('div', class_='narrow')
    first_links = (quote.find('a') for quote in narrow.find_all('blockquote'))
    return [link.get('href') for link in first_links if link]

def getText(body):
    """Return the text of a post body with quoted (``<blockquote>``) content removed.

    The ``div.narrow`` element is copied before its blockquotes are stripped,
    so ``body`` itself is left unmodified and can still be searched for quotes
    after this call.

    Returns an empty string when the body has no ``div.narrow`` element.
    """
    import copy  # local import keeps this helper self-contained

    content = body.find('div', class_='narrow')
    if content is None:
        return ''

    # extract() removes the node from the tree it belongs to, so strip the
    # blockquotes from a detached copy rather than from the caller's tree.
    stripped = copy.copy(content)
    while stripped.blockquote:
        stripped.blockquote.extract()
    return stripped.get_text()

def getPostID(header):
    """Return the ``name`` attribute of the first anchor in a post header.

    Returns an empty string when that attribute is missing or empty.
    """
    first_anchor = header.find_all('a')[0]
    return first_anchor.get('name') or ''

def is_post_equal(post1, post2):
    """Return whether the two post identifiers compare equal."""
    are_same = (post1 == post2)
    return are_same

def get_thread(link):
    """Return the ``name`` attribute of ``link``, or '' when it is missing or 'top'.

    NOTE(review): a later cell of this notebook defines another function named
    ``get_thread``; once that cell has run, this definition is replaced.
    """
    name = link.get('name')
    return '' if name in ('top', None) else name

In [4]:
# retrieve details of each post
def parse_post(header, body):
    """Assemble one post record from its header cell and body cell.

    Parameters
    ----------
    header : tag holding the post's timestamp, user link and post anchor.
    body : tag holding the post's content and its likes/shares line.

    Returns
    -------
    dict
        Keys, in order: ``posted``, ``user``, ``post_id``, ``text``,
        ``has_quote``, ``quotes``, ``shares``, ``likes``, ``retrieved``
        (the current local time formatted as ``HH:MM:SS dd-mm-YYYY``).
    """
    # Collect the quotes before the text: getText() may remove <blockquote>
    # elements from `body`, after which no quotes would be found.
    quotes = getQuote(body)
    likes, shares = getLikesShares(body)

    return {
        'posted': getTimestamp(header),
        'user': getUser(header),
        'post_id': getPostID(header),
        'text': getText(body),
        'has_quote': bool(quotes),
        'quotes': quotes,
        'shares': shares,
        'likes': likes,
        'retrieved': datetime.now().strftime("%H:%M:%S %d-%m-%Y"),
    }

In [5]:
# retrieve posts from thread
def parse_thread(thread):
    """Download every page of a thread and return its posts.

    Pages are requested one after another starting from page 0. Crawling
    stops when a page contains no posts, or when the first post of a page has
    the same ID as the first post of the previous page (presumably the site
    serves the last page again once the page number runs past the end).

    Parameters
    ----------
    thread : thread identifier placed in the URL
        ``https://www.nairaland.com/<thread>/<page>``.

    Returns
    -------
    list of dict
        One record per post as built by ``parse_post``, each extended with
        ``page_no`` and ``thread``. A one-line summary is printed before
        returning.
    """
    page = 0  # start from the first page
    previous_index_post = ''
    data = []

    while True:
        start_url = 'https://www.nairaland.com/{}/{}'.format(thread, page)
        # The headers must be passed by keyword: the second positional
        # argument of requests.get is `params`, which would send them as
        # query-string parameters instead of HTTP headers.
        r1 = requests.get(start_url, headers=heads)
        thread_html = BeautifulSoup(r1.text, 'lxml')

        headers = thread_html.find_all('td', class_='bold l pu')
        bodys = thread_html.find_all('td', class_='l w pd')

        # A page without any post header has nothing to parse (and no first
        # post to compare against), so stop here.
        if not headers:
            break

        # first post on this page, compared with the previous page's first post
        index_post = getPostID(headers[0])
        if page > 0 and is_post_equal(index_post, previous_index_post):
            break

        # Headers and bodies are paired by position; pairing stops at the
        # shorter of the two lists.
        for header, body in zip(headers, bodys):
            post = parse_post(header, body)
            post.update({'page_no': page, 'thread': thread})
            data.append(post)

        previous_index_post = index_post
        page += 1

    print('Thread: {} Page: {} No of Posts {}'.format(thread, page, len(data)))

    return data

In [6]:
def get_thread(t):
    """Build a thread record from one cell of a forum listing.

    Returns a dict with the keys ``thread_id``, ``title``, ``length``,
    ``views`` and ``author``, each filled by the matching ``getThread*``
    helper.
    """
    return {
        'thread_id': getThreadId(t),
        'title': getThreadTitle(t),
        'length': getThreadLength(t),
        'views': getThreadView(t),
        'author': getThreadAuthor(t),
    }

def getThreadId(thread):
    """Return the ``name`` attribute of the first anchor in a listing cell.

    Returns an empty string when the attribute is missing or equals 'top'.
    """
    anchor_name = thread.find('a').get('name')
    if anchor_name in ('top', None):
        return ''
    return anchor_name

def getThreadTitle(thread):
    """Return the text of the first ``<b>`` tag in a listing cell, or '' if there is none."""
    bold = thread.find('b')
    return bold.get_text() if bold else ''

def getThreadLength(thread):
    """Return the text of the second ``<b>`` inside the cell's ``<span class="s">``.

    Returns an empty string when the span is absent.
    """
    stats = thread.find('span', class_='s')
    if not stats:
        return ''
    second_bold = stats.find_all('b')[1]
    return second_bold.get_text() if second_bold else ''

def getThreadView(thread):
    """Return the text of the third ``<b>`` inside the cell's ``<span class="s">``.

    Returns an empty string when the span is absent.
    """
    stats = thread.find('span', class_='s')
    if not stats:
        return ''
    third_bold = stats.find_all('b')[2]
    return third_bold.get_text() if third_bold else ''

def getThreadAuthor(thread):
    """Return the text of the first anchor inside the cell's ``<span class="s">``.

    Returns an empty string when either the span or the anchor is absent.
    """
    stats = thread.find('span', class_='s')
    if not stats:
        return ''
    author_link = stats.find('a')
    return author_link.get_text() if author_link else ''

In [7]:
def _combine_frames(frames):
    """Concatenate per-thread frames; return an empty DataFrame when there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


def parse_forum(forum, first_page=499, last_page=500):
    """Scrape a forum's thread listing and the posts of every listed thread.

    Side effects: the thread listing is written to
    ``../data/raw/<forum>_threads.csv`` and the posts collected so far are
    checkpointed to ``../data/raw/<forum>_<n>.csv`` after every 500 threads.

    Parameters
    ----------
    forum : str
        Forum name as it appears in the URL, e.g. ``'health'``.
    first_page, last_page : int, optional
        Inclusive range of listing pages to crawl. The defaults (499 and 500)
        are the pages that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per scraped post with an added ``forum`` column; an empty
        frame when no thread was parsed.
    """
    start_url = 'https://www.nairaland.com/{}/posts/'.format(forum)

    threads = []
    for i in tqdm(range(first_page, last_page + 1)):
        next_page = start_url + '{}'.format(i)
        # The headers must be passed by keyword: the second positional
        # argument of requests.get is `params`, not `headers`.
        r2 = requests.get(next_page, headers=heads)
        forum_html = BeautifulSoup(r2.text, 'html5lib')
        thread_tags = forum_html.find_all('td', attrs={'id': True})
        threads.extend(get_thread(tag) for tag in thread_tags)

    df_threads = pd.DataFrame(threads)
    df_threads.to_csv('../data/raw/{}_threads.csv'.format(forum))

    # Collect one frame per thread and concatenate on demand; growing a
    # DataFrame inside the loop is quadratic, and DataFrame.append no longer
    # exists in pandas 2.x.
    frames = []
    for count, thread in enumerate(threads, start=1):
        thread_id = thread['thread_id']
        # An empty ID means the listing cell had no usable anchor name, so
        # there is no thread URL to request.
        if thread_id:
            df_res = pd.DataFrame(parse_thread(thread_id))
            df_res['forum'] = forum
            frames.append(df_res)

        # checkpoint the posts collected so far after every 500 threads
        if count % 500 == 0:
            _combine_frames(frames).to_csv('../data/raw/{}_{}.csv'.format(forum, count))

    return _combine_frames(frames)

In [8]:
# Scrape the 'health' forum; `df` receives the DataFrame returned by parse_forum.
forum = 'health'
df = parse_forum(forum)

100%|██████████| 2/2 [00:01<00:00,  1.92it/s]
Thread: 4862141 Page: 1 No of Posts 3
Thread: 4851341 Page: 1 No of Posts 1
Thread: 4841171 Page: 1 No of Posts 3
Thread: 4834979 Page: 1 No of Posts 3
Thread: 4862009 Page: 1 No of Posts 3
Thread: 4851268 Page: 1 No of Posts 2
Thread: 4840565 Page: 1 No of Posts 3
Thread: 4834827 Page: 1 No of Posts 3
Thread: 4858057 Page: 1 No of Posts 2
Thread: 4839611 Page: 1 No of Posts 3
Thread: 4834790 Page: 1 No of Posts 3
Thread: 4857581 Page: 1 No of Posts 2
Thread: 4849943 Page: 1 No of Posts 3
Thread: 4839528 Page: 1 No of Posts 3
Thread: 4834265 Page: 1 No of Posts 3
Thread: 4857105 Page: 1 No of Posts 3
Thread: 4849237 Page: 1 No of Posts 2
Thread: 4839482 Page: 1 No of Posts 3
Thread: 4834093 Page: 1 No of Posts 2
Thread: 4855830 Page: 1 No of Posts 2
Thread: 4848756 Page: 1 No of Posts 2
Thread: 4838868 Page: 1 No of Posts 3
Thread: 4833647 Page: 1 No of Posts 2
Thread: 4855578 Page: 1 No of Posts 3
Thread: 4848735 Page: 1 No of Posts 3
Thre

In [9]:
# Write the scraped posts to ../data/raw/<forum>_sample.csv.
df.to_csv('../data/raw/{}_sample.csv'.format(forum))