In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
from tqdm import tqdm 

In [2]:
# Build the request headers: start from requests' defaults, then present a
# desktop Firefox User-Agent.
heads = requests.utils.default_headers()
heads['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'


In [7]:
# helper functions to retrieve details from a thread post
def getUser(header):
    """Return the poster's username from a post header.

    Looks for an ``<a class="user">`` element inside ``header`` and returns
    its text; returns '' when no such element is found.
    """
    anchor = header.find("a", class_="user")
    return anchor.get_text() if anchor else ''

def getTimestamp(header):
    """Attempt to get the timestamp of a post as a '<date> <time>' string.

    The last child of ``header`` is treated as the date/time container:

    * its first child holds the time;
    * with exactly 3 children, the last child holds the date;
    * with exactly 6 children, children 2 and 4 are joined with a space
      to form the date;
    * with any other child count the date is left empty, so the result is
      ' <time>' (leading space kept for compatibility with the original
      output format).

    Returns '' when the header has no children or the last child has no
    children of its own (for example a plain text node), instead of raising.
    """
    header_children = getattr(header, 'contents', None)
    if not header_children:
        return ''

    # A text node has no ``contents`` attribute; treat that as "no timestamp".
    pieces = getattr(header_children[-1], 'contents', None)
    if not pieces:
        return ''

    def _first_child(node):
        # First child of ``node``, or '' when the node has no children.
        inner = getattr(node, 'contents', None)
        return inner[0] if inner else ''

    time = _first_child(pieces[0])
    date = ''
    if len(pieces) == 3:
        date = _first_child(pieces[-1])
    elif len(pieces) == 6:
        date = '{} {}'.format(_first_child(pieces[2]), _first_child(pieces[4]))
    return '{} {}'.format(date, time)

def getLikesShares(body):
    """Attempt to get the number of likes and shares of a post.

    Looks for ``<p class="s">`` inside ``body`` and reads the text of its
    first two ``<b>`` elements as likes and shares respectively.

    Returns a two-item list ``[likes, shares]``. A value is '' when the
    paragraph or the corresponding ``<b>`` element is missing, so a post
    with only one counter no longer raises IndexError.
    """
    likes = ''
    shares = ''
    stats = body.find("p", class_='s')
    if stats is not None:
        counters = stats.find_all('b')  # looked up once, reused for both values
        if len(counters) > 0:
            likes = counters[0].get_text()
        if len(counters) > 1:
            shares = counters[1].get_text()
    return [likes, shares]

def getQuote(body):
    """Attempt to get the quoted-post references from a post.

    For every ``<blockquote>`` inside the ``<div class="narrow">`` of
    ``body``, the ``href`` of the blockquote's first ``<a>`` is collected.
    Blockquotes without an anchor are skipped.

    Returns a list of href values (possibly empty). Returns [] when the
    content container is missing instead of raising AttributeError.
    """
    content = body.find('div', class_='narrow')
    if content is None:
        return []

    quotes = []
    for blockquote in content.find_all('blockquote'):
        link = blockquote.find('a')
        if link is not None:
            quotes.append(link.get('href'))
    return quotes

def getText(body):
    """Attempt to get the text of a post, excluding quoted material.

    Takes the ``<div class="narrow">`` of ``body``, removes every
    ``<blockquote>`` from a *copy* of it and returns the remaining text.
    Working on a copy leaves ``body`` untouched, so blockquotes are still
    available to code that inspects ``body`` afterwards.

    Returns '' when the content container is missing instead of raising
    AttributeError.
    """
    import copy  # local import: only needed to avoid mutating the caller's tree

    content = body.find('div', class_='narrow')
    if content is None:
        return ''

    scratch = copy.copy(content)
    while scratch.blockquote:
        scratch.blockquote.extract()
    return scratch.get_text()

def getPostID(header):
    """Attempt to get the id of a post.

    The id is the ``name`` attribute of the first ``<a>`` in ``header``.

    Returns '' when the header has no anchor (previously an IndexError) or
    when the first anchor has no ``name`` attribute.
    """
    anchor = header.find('a')
    if anchor is None:
        return ''
    return anchor.get('name') or ''

def getGender(header):
    """Return the gender marker text from a post header.

    Checks for ``<span class="m">`` first and ``<span class="f">`` second,
    returning the text of the first one found; returns '' when neither is
    present.
    """
    for css_class in ("m", "f"):
        marker = header.find("span", class_=css_class)
        if marker:
            return marker.get_text()
    return ''

def parse_post(header, body):
    """Retrieve the details of a single post.

    Parameters
    ----------
    header : element holding the post's header (user, timestamp, id, gender).
    body : element holding the post's content (text, quotes, likes/shares).

    Returns
    -------
    dict with keys 'posted', 'user', 'gender', 'post_id', 'text',
    'has_quote', 'quotes', 'shares', 'likes' and 'retrieved' (local time
    of scraping, formatted "%H:%M:%S %d-%m-%Y").
    """
    # Read quotes BEFORE getText(): extracting the text may remove blockquote
    # nodes from `body`, after which no quotes could be found.
    quotes = getQuote(body)
    # Look the counters up once instead of once per value.
    likes, shares = getLikesShares(body)

    post = {}
    post['posted'] = getTimestamp(header)
    post['user'] = getUser(header)
    post['gender'] = getGender(header)
    post['post_id'] = getPostID(header)
    post['text'] = getText(body)
    post['has_quote'] = bool(quotes)
    post['quotes'] = quotes
    post['shares'] = shares
    post['likes'] = likes
    post['retrieved'] = datetime.now().strftime("%H:%M:%S %d-%m-%Y")
    return post

def is_post_equal(post1, post2):
    """Report whether two post identifiers compare equal."""
    same = (post1 == post2)
    return same

In [8]:
def parse_thread(thread):
    """Retrieve all posts from a thread, page by page.

    Pages are requested from https://www.nairaland.com/<thread>/<page>
    starting at page 0. Paging stops when the first post id of the current
    page equals the first post id of the previous page (taken to mean the
    site served the same page again).

    Parameters
    ----------
    thread : thread identifier inserted into the URL.

    Returns
    -------
    list of post dicts as produced by parse_post(), each extended with
    'page_no' and 'thread'.

    Side effects: performs HTTP requests and prints a one-line summary.
    """
    page = 0  # start from the first page
    index_post = ''
    previous_index_post = ''

    data = []

    while True:
        start_url = 'https://www.nairaland.com/{}/{}'.format(thread, page)
        # `headers=` is required: the second positional argument of
        # requests.get() is `params`, which would put the header values in the
        # query string and leave the default User-Agent in place.
        # NOTE(review): no timeout is set, so a stalled connection blocks forever.
        r1 = requests.get(start_url, headers=heads)
        thread_html = BeautifulSoup(r1.text, 'lxml')

        headers = thread_html.find_all('td', class_='bold l pu')
        bodys = thread_html.find_all('td', class_='l w pd')

        # Remember the first post on this page. Any non-empty page counts, so a
        # final page that holds a single post is not mistaken for a repeat.
        if headers:
            index_post = getPostID(headers[0])

        # Same first post as the previous page -> no new page was served.
        if page > 0 and is_post_equal(index_post, previous_index_post):
            break

        # zip() pairs headers with bodies and tolerates unequal counts.
        for header, body in zip(headers, bodys):
            post = parse_post(header, body)
            post.update({'page_no': page, 'thread': thread})
            data.append(post)

        previous_index_post = index_post
        page += 1
    print('Thread: {}, No of Page(s): {}, No of Post(s) {}'.format(thread, page, len(data)))

    return data

In [2]:
# Thread ids used for spot checks of the scraper.
list_of_threads = [
    5879703,
    5812390,
    6052669,
    5795526,
    5812795,
    5774874,
    6020419,
    5760985,
    5908926,
    5764410,
    5744920,
    5866113,
    5772162,
    5762527,
    5761835,
]

In [3]:
# Display how many thread ids are in list_of_threads.
len(list_of_threads)

15

In [9]:
# Run the scraper on a single thread id; the cell displays the returned list of post dicts.
parse_thread(5762527)

Thread: 5762527, No of Page(s): 15, No of Post(s) 446


[{'posted': 'Mar 29 12:19pm',
  'user': 'SinisterX',
  'gender': 'm',
  'post_id': '87885451',
  'text': 'Coronavirus: A Facebook User’s Experience Of Getting Tested In AbujaATTENTION: PRESIDENCY & FED. MINISTRY OF HEALTH To tell you the truth, NCDC can\'t do anything for you because they are not ready.All this PR they are doing they are assuming people will not try that\'s why.On Wednesday (25/03/2020) I and a colleague of mine decided we want to be sure of our status before going into #selfisolation.So we went through the following steps1. We called the toll free line and after all the normal recorded stories, when we chose to speak to an agent they end the call.2. We went to University of Abuja Teaching Hospital, Gwagwalada, they told us they don\'t do testing there, that the hospital is only an isolation center. They refer us back to NCDC office in town, told us that\'s where testing lab is.3. We drove to NCDC office at Jabi. There are two NCDC offices there in close proximity. The

In [5]:
# Load the list of threads collected from the health forum.
threads_csv_path = '../data/raw/health_threads.csv'
df_topics = pd.read_csv(threads_csv_path)

In [6]:
# Skip the rows handled by the previous collection run and continue from there.
# NOTE(review): re-running this cell slices the already-sliced frame again.
resume_offset = 69500
df_topics = df_topics[resume_offset:]

In [7]:
# Retrieve the posts of every thread in df_topics and write them to CSV.
#
# - Per-thread frames are collected in a list and combined with pd.concat;
#   DataFrame.append was removed in pandas 2.0 and re-copied all rows on
#   every iteration.
# - The counter is no longer called `iter`, which shadowed the builtin.
# - tqdm is given a total so it can show progress and an ETA.
threads_done = 69500  # start from the previous collection
forum = 'health'
post_frames = []
df_posts = pd.DataFrame()
for index, row in tqdm(df_topics.iterrows(), total=len(df_topics)):
    res = parse_thread(row['thread_id'])
    df_res = pd.DataFrame(res)
    df_res['forum'] = forum
    post_frames.append(df_res)

    threads_done += 1
    # checkpoint after every 500 threads
    if threads_done % 500 == 0:
        df_posts = pd.concat(post_frames)
        # keep the combined frame as the single pending piece so the next
        # checkpoint does not re-concatenate every small frame
        post_frames = [df_posts]
        df_posts.to_csv('../data/raw/{}_{}.csv'.format(forum, threads_done))

# pd.concat raises on an empty list, so only combine when something was collected
if post_frames:
    df_posts = pd.concat(post_frames)
df_posts.to_csv('../data/raw/{}.csv'.format(forum))

4:58,  2.46it/s]Thread: 138668, No of Page(s): 1, No of Post(s) 1
32953it [4:04:58,  2.42it/s]Thread: 134576, No of Page(s): 1, No of Post(s) 1
32954it [4:04:59,  2.55it/s]Thread: 130172, No of Page(s): 1, No of Post(s) 1
32955it [4:04:59,  2.55it/s]Thread: 150393, No of Page(s): 1, No of Post(s) 1
32956it [4:04:59,  2.78it/s]Thread: 144526, No of Page(s): 1, No of Post(s) 1
32957it [4:05:00,  2.69it/s]Thread: 138469, No of Page(s): 1, No of Post(s) 1
32958it [4:05:00,  2.64it/s]Thread: 134411, No of Page(s): 1, No of Post(s) 1
32959it [4:05:00,  2.69it/s]Thread: 144492, No of Page(s): 1, No of Post(s) 1
32960it [4:05:01,  2.69it/s]Thread: 138334, No of Page(s): 1, No of Post(s) 1
32961it [4:05:01,  2.75it/s]Thread: 134208, No of Page(s): 1, No of Post(s) 1
32962it [4:05:01,  2.74it/s]Thread: 144359, No of Page(s): 1, No of Post(s) 1
32963it [4:05:02,  2.67it/s]Thread: 138097, No of Page(s): 1, No of Post(s) 1
32964it [4:05:02,  2.72it/s]Thread: 133911, No of Page(s): 1, No of Post(s) 

In [None]:
# Write the collected posts for this forum to its final CSV file.
final_csv_path = '../data/raw/{}.csv'.format(forum)
df_posts.to_csv(final_csv_path)
