In [ ]:
# Using BeautifulSoup to parse and extract data

In [5]:
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Extract Title and Comments

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def extract_content(num_pages, base_url, delay=5, timeout=30):
    """Scrape thread authors, titles, opening posts and replies from a forum category.

    Listing pages ``{base_url}page-1`` .. ``{base_url}page-{num_pages}`` are
    downloaded in turn. For every thread listed, the author name, the author's
    profile link and the thread title are read from the listing, then the
    thread's first page is downloaded to collect the post bodies. Quoted text
    (``<blockquote>``) is stripped before the bodies are read. Only the first
    page of each thread is requested, so replies on later pages are not included.

    Args:
        num_pages: Number of listing pages to walk, starting at page 1.
        base_url: Category URL to which ``page-<n>`` is appended.
        delay: Seconds to sleep before each listing-page request.
        timeout: Seconds to wait for the server on each HTTP request, so a
            stalled connection raises instead of hanging the whole scrape.

    Returns:
        pandas.DataFrame with one row per thread and the columns "Username",
        "Profile Link", "Post Title", "Content" (list holding the first post
        body, empty when the thread page has none) and "Comments" (list of the
        remaining post bodies).

    Raises:
        ValueError: If a listing page yields a different number of author
            blocks than title blocks, which would misalign the rows.
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    site_root = "https://www.diabetes.co.uk"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    rows = []

    for page in tqdm(range(1, num_pages + 1)):
        url = f"{base_url}page-{page}"
        time.sleep(delay)
        listing_response = requests.get(url, headers=headers, timeout=timeout)
        listing_soup = BeautifulSoup(listing_response.content, "html.parser")

        author_blocks = listing_soup.find_all("ul", class_="structItem-parts")
        title_blocks = listing_soup.find_all("div", class_="structItem-title")

        # Authors and titles are matched by position, so fail on the offending
        # page rather than building a misaligned (or unbuildable) table later.
        if len(author_blocks) != len(title_blocks):
            raise ValueError(
                f"{url}: found {len(author_blocks)} author blocks but "
                f"{len(title_blocks)} title blocks; rows cannot be aligned"
            )

        for author_block, title_block in zip(author_blocks, title_blocks):
            author_anchor = author_block.find("a")

            # The title text and the thread URL must come from the same anchor.
            # Taking the href from the first anchor in the block could pick a
            # different link than the one whose text is used as the title.
            # NOTE(review): confirm against the live markup that the anchor
            # matched by class_="" is always the thread link.
            thread_anchor = title_block.find("a", class_="")

            thread_response = requests.get(
                site_root + thread_anchor["href"], headers=headers, timeout=timeout
            )
            thread_soup = BeautifulSoup(thread_response.content, "html.parser")

            # Drop quoted text so each body contains only its author's own words.
            for blockquote in thread_soup.find_all("blockquote"):
                blockquote.extract()

            post_texts = [
                post.text.strip()
                for post in thread_soup.find_all("div", class_="bbWrapper")
            ]

            rows.append({
                "Username": author_anchor.text.strip(),
                "Profile Link": site_root + author_anchor["href"] + "#about",
                "Post Title": thread_anchor.text.strip(),
                "Content": post_texts[:1],    # first body on the thread page
                "Comments": post_texts[1:],   # every body after the first
            })

    # Assemble the table; writing it to disk is left to the caller.
    return pd.DataFrame(
        rows,
        columns=["Username", "Profile Link", "Post Title", "Content", "Comments"],
    )

In [11]:
# Young People/Adults: scrape listing pages 1-15 and store the result as a spreadsheet
base_url = "https://www.diabetes.co.uk/forum/category/young-people-adults.75/"
num_pages = 15
young_adult = extract_content(num_pages=num_pages, base_url=base_url)
young_adult.to_excel(excel_writer="young_adult_1.xlsx", index=False)

100%|███████████████████████████████████████████| 15/15 [03:47<00:00, 15.19s/it]


In [15]:
# Children & Teens: scrape listing pages 1-29 and store the result as a spreadsheet
base_url = "https://www.diabetes.co.uk/forum/category/children-teens.46/"
num_pages = 29
children_teen = extract_content(num_pages=num_pages, base_url=base_url)
children_teen.to_excel(excel_writer='children_teen_1.xlsx', index=False)

100%|███████████████████████████████████████████| 29/29 [07:30<00:00, 15.52s/it]


In [18]:
# Type 1 Diabetes: scrape listing pages 1-752 and store the result as a spreadsheet
base_url = "https://www.diabetes.co.uk/forum/category/type-1-diabetes.19/"
num_pages = 752
type1diabetes = extract_content(num_pages=num_pages, base_url=base_url)
type1diabetes.to_excel(excel_writer='type1diabetes_1.xlsx', index=False)

100%|███████████████████████████████████████| 752/752 [3:17:44<00:00, 15.78s/it]


In [21]:
# Parents: scrape listing pages 1-52 and store the result as a spreadsheet
base_url = "https://www.diabetes.co.uk/forum/category/parents.16/"
num_pages = 52
parents = extract_content(num_pages=num_pages, base_url=base_url)
parents.to_excel(excel_writer='parents_1.xlsx', index=False)

100%|███████████████████████████████████████████| 52/52 [12:52<00:00, 14.85s/it]


In [23]:
# Label each frame with the forum category it was scraped from, then stack them
for frame, category in (
    (type1diabetes, 'type1diabetes'),
    (young_adult, 'young_adult'),
    (children_teen, 'children_teen'),
):
    frame['platform'] = category
df = pd.concat(objs=[type1diabetes, young_adult, children_teen])

## Identify Type 1 users in young_adult and children_teen

In [None]:
# Pool the two age-specific categories; their users may have any diabetes type
T1_T2 = pd.concat(objs=(young_adult, children_teen))
# Each profile only needs to be visited once, so de-duplicate the links
profile_links_diabetes = list(set(T1_T2['Profile Link']))

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from tqdm import tqdm

# Extract the "Type of diabetes" profile field for a collection of profile URLs
def scrape_diabetes_info(profile_links_diabetes, wait_seconds=0.1):
    """Look up the "Type of diabetes" field on each profile page.

    Every URL is opened in its own headless Chrome session so that the page's
    JavaScript runs; the rendered HTML is then parsed with BeautifulSoup.

    Args:
        profile_links_diabetes: Iterable of profile page URLs.
        wait_seconds: Seconds to pause after loading a page before its source
            is read, giving JavaScript time to run. Increase it if fields
            come back missing.

    Returns:
        dict mapping each URL to the stripped text of the ``<dd>`` that follows
        the ``<dt>`` labelled "Type of diabetes", or to the string
        "No type of diabetes Information" when that field is not found.
    """
    # Configure Chrome to run without a GUI
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    diabetes_info_dict = {}

    for url in tqdm(profile_links_diabetes):
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            time.sleep(wait_seconds)
            # Source after JavaScript execution, not the raw server response
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when loading the page fails,
            # so a long run does not leave Chrome processes behind.
            driver.quit()

        soup = BeautifulSoup(page_source, "html.parser")

        # `string=` is the current name of the argument formerly called `text=`
        diabetes_element = soup.find("dt", string="Type of diabetes")
        value_element = (
            diabetes_element.find_next("dd") if diabetes_element is not None else None
        )

        if value_element is not None:
            diabetes_info_dict[url] = value_element.text.strip()
        else:
            # Field label (or its value) is absent from this profile
            diabetes_info_dict[url] = "No type of diabetes Information"

    return diabetes_info_dict

In [None]:
# Scrape the diabetes type for every pooled profile and keep a spreadsheet copy
diabetes_type = scrape_diabetes_info(profile_links_diabetes)
df_diabetes_type = pd.DataFrame(
    data=list(diabetes_type.items()),
    columns=['Profile Link', 'diabetes_type_info'],
)
df_diabetes_type.to_excel(excel_writer='diabetes_type_info.xlsx', index=False)

In [100]:
# Attach the scraped diabetes type to every pooled post (left join keeps all posts)
T1_T2 = pd.merge(T1_T2, df_diabetes_type, how='left', on='Profile Link')
# Keep only the posts whose author's profile states "Type 1"
df_t1 = T1_T2.loc[T1_T2['diabetes_type_info'].eq('Type 1')]

In [104]:
# Combine the confirmed Type 1 posts with everything from the Type 1 category
df = pd.concat(objs=(df_t1, type1diabetes))

## Extract Age Information and identify young people

BeautifulSoup parses the HTML exactly as the server returned it; it does not execute JavaScript, so any content that the page adds or changes through JavaScript is missing. To scrape a page that relies on JavaScript to build its content, use Selenium (or another tool that can execute JavaScript) so that the fully rendered, up-to-date content is captured.

After collecting all of the profile links, extract the age information for each user.

In [26]:
# Unique profile pages to visit for the age lookup
profile_links = list(set(df['Profile Link']))

In [29]:
# Look up the "Birthday" profile field for every link in `profile_links`.
# Each URL is opened in its own headless Chrome session so the page's
# JavaScript runs before the HTML is parsed.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)

# Maps profile URL -> birthday text, or "No Age Information" when absent
birthday_info_dict = {}

for url in tqdm(profile_links):
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Give JavaScript a moment to run (increase if fields come back missing)
        time.sleep(0.1)
        # Source after JavaScript execution, not the raw server response
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so a long run does not leave Chrome processes behind.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    # `string=` is the current name of the argument formerly called `text=`
    birthday_element = soup.find("dt", string="Birthday")
    birthday_value = (
        birthday_element.find_next("dd") if birthday_element is not None else None
    )

    if birthday_value is not None:
        # The <dd> following the "Birthday" label holds the birthday text
        birthday_info = birthday_value.text.strip()
        birthday_info_dict[url] = birthday_info
    else:
        # Field label (or its value) is absent from this profile
        birthday_info_dict[url] = "No Age Information"



100%|████████████████████████████████████| 6350/6350 [19:32:23<00:00, 11.08s/it]


In [39]:
# Turn the URL -> birthday text mapping into a two-column table and save a copy
df_birth = pd.DataFrame(
    data=list(birthday_info_dict.items()),
    columns=['Profile Link', 'Age_info'],
)
df_birth.to_excel(excel_writer='birthday_info.xlsx', index=False)

In [110]:
# Attach the scraped birthday text to every post (left join keeps all posts)
merged_df = pd.merge(df, df_birth, on='Profile Link', how='left')

In [112]:
# Parse the numeric age out of a scraped birthday string
def extract_age(text):
    """Return the age found in ``text`` as an int, or NaN when there is none.

    The age is the run of digits that follows "Age:" (optional whitespace in
    between). NaN is returned when ``text`` is not a string (for example a
    missing value produced by a left merge), when it contains the sentinel
    "No Age Information", or when no "Age: <digits>" pattern is present.

    Args:
        text: Birthday text scraped from a profile, or a missing value.

    Returns:
        int age, or ``np.nan``.
    """
    # Missing cells arrive as float NaN / None; `in` on those raises TypeError
    if not isinstance(text, str):
        return np.nan
    if "No Age Information" in text:
        return np.nan

    age_match = re.search(r'Age:\s*(\d+)', text)
    if age_match is None:
        return np.nan
    return int(age_match.group(1))

In [113]:
# Derive a numeric 'Age' column from the scraped text, then drop the helper columns.
# The rename only has an effect if an 'Age' column is already present.
merged_df = merged_df.rename(columns={'Age': 'Age_info'})
merged_df['Age'] = merged_df['Age_info'].map(extract_age)
merged_df = merged_df.drop(columns=['diabetes_type_info', 'Age_info'])

In [117]:
# Full corpus, including rows whose age could not be determined
merged_df.to_excel(excel_writer='diabetes.co.uk.xlsx')
# Sub-corpus restricted to rows with a parsed age
df_age = merged_df.loc[merged_df['Age'].notna()]
df_age.to_excel(excel_writer='corpus_with_age_info.xlsx')

In [ ]:
# Young people with T1: age strictly between 12 and 27
young_df = df_age.loc[df_age['Age'].gt(12) & df_age['Age'].lt(27)]