# Data Extraction

The objective of this Python notebook is to extract scam reports from the 'Scam Alert' website using web-scraping techniques.

## Import the necessary libraries

In [4]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import math
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
# Address of the stories listing page, which is the starting point of the scrape
url = 'https://www.scamalert.sg/stories'

In [20]:
# Let webdriver_manager download a chromedriver binary (or reuse the one in its cache),
# then start a Chrome session controlled through that binary
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Driver [C:\Users\User\.wdm\drivers\chromedriver\win32\83.0.4103.39\chromedriver.exe] found in cache


 


In [21]:
# Navigate the Chrome session to the stories listing page (first page of results)
driver.get(url)

In [22]:
# Accumulator for the story links collected while paging through the listing
links = list()

In [23]:
# Absolute xpath of an <a> inside an <h4> on the listing page.
# The pagination loop waits for this element to be present after clicking to the next page.
# NOTE(review): presumably the title link of a story card - confirm against the live page.
next_xpath = "/html/body/form/main/div[2]/div/div[4]/div[2]/div/div/h4/a"

In [24]:
# Parse the rendered source of the listing page so it can be searched by tag and class.
# The parser is named explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
# 'html.parser' is the same parser used for the individual story pages.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [25]:
# Locate the element on the listing page that reports the total number of results
result_counter = soup.find('div', class_="col-lg-5 text-muted mb-2 pt-2")
if result_counter is None:
    # Fail with a clear message instead of an opaque AttributeError on None
    raise RuntimeError(
        "Result counter not found - has the page finished loading, or has its layout changed?"
    )

# The total is the 4th whitespace-separated token of the counter text.
# Thousands separators are stripped so that a value such as "4,800" still parses.
num_result = int(result_counter.text.split()[3].replace(',', ''))

In [26]:
# The page count is derived assuming 6 stories per listing page;
# rounding up accounts for a final page that is only partially filled
STORIES_PER_PAGE = 6
num_pages = math.ceil(num_result / STORIES_PER_PAGE)
print(f"Total number of pages: {num_pages}")

Total number of pages: 800


## Extract URL from each page

In [27]:
# Upper bound on the number of listing pages to walk through.
# Set MAX_PAGES to None to scrape every page reported by the site.
MAX_PAGES = 100
pages_to_scrape = num_pages if MAX_PAGES is None else min(MAX_PAGES, num_pages)

# Start from an empty list so that re-running this cell does not append duplicates
links = []


def first_story_href(page_source):
    """Return the href of the first story card found in the given HTML, or None if there is none."""
    card = BeautifulSoup(page_source, 'html.parser').find('h4', class_='card-title')
    if card is None or card.a is None:
        return None
    return card.a.get('href')


# Walk through the listing pages one by one.
# This assumes the browser is currently showing the first listing page.
for i in range(pages_to_scrape):

    # Display current page number
    print("Page number:", i+1)

    # Parse the page that is currently rendered in the browser
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Every story card title is an h4 with class 'card-title' wrapping the link to the story
    for card in soup.find_all('h4', class_='card-title'):
        # Skip cards without a usable anchor instead of failing on them
        if card.a is not None and card.a.get('href'):
            links.append(card.a['href'])

    # Move on to the next page, unless this was the last page we intend to scrape
    if i != (pages_to_scrape - 1):
        # Remember what the current page shows so we can tell when the content has changed
        previous_first = first_story_href(driver.page_source)

        # The pagination links are labelled with their page number
        button = driver.find_element_by_link_text(str(i+2))
        driver.execute_script("arguments[0].click();", button)

        wait = WebDriverWait(driver, 20)
        # Wait for the expected element to be present ...
        wait.until(lambda x: x.find_element_by_xpath(next_xpath))
        # ... and for the first story to differ from the previous page. The element above
        # already exists before the click, so on its own it does not prove the new page loaded.
        wait.until(lambda x: first_story_href(x.page_source) not in (None, previous_first))

Page number: 1
Page number: 2
Page number: 3
Page number: 4
Page number: 5
Page number: 6
Page number: 7
Page number: 8
Page number: 9
Page number: 10
Page number: 11
Page number: 12
Page number: 13
Page number: 14
Page number: 15
Page number: 16
Page number: 17
Page number: 18
Page number: 19
Page number: 20
Page number: 21
Page number: 22
Page number: 23
Page number: 24
Page number: 25
Page number: 26
Page number: 27
Page number: 28
Page number: 29
Page number: 30
Page number: 31
Page number: 32
Page number: 33
Page number: 34
Page number: 35
Page number: 36
Page number: 37
Page number: 38
Page number: 39
Page number: 40
Page number: 41
Page number: 42
Page number: 43
Page number: 44
Page number: 45
Page number: 46
Page number: 47
Page number: 48
Page number: 49
Page number: 50
Page number: 51
Page number: 52
Page number: 53
Page number: 54
Page number: 55
Page number: 56
Page number: 57
Page number: 58
Page number: 59
Page number: 60
Page number: 61
Page number: 62
Page number: 63
P

In [28]:
# Wrap the collected links in a single-column ('url') dataframe and print a text preview
all_url_links = pd.DataFrame(data={'url': links})
print(all_url_links)

                                          url
0    /stories-details/Story-17Jul2020150318PM
1    /stories-details/Story-17Jul2020133721PM
2    /stories-details/Story-17Jul2020094730AM
3    /stories-details/Story-16Jul2020203342PM
4    /stories-details/Story-16Jul2020031601AM
..                                        ...
595  /stories-details/Story-18Feb2020121823PM
596  /stories-details/Story-18Feb2020120844PM
597  /stories-details/Story-18Feb2020113217AM
598  /stories-details/Story-18Feb2020103222AM
599  /stories-details/Story-18Feb2020100945AM

[600 rows x 1 columns]


In [33]:
# Persist the links to disk so the scraping step below can start from the file.
# The row index is written as well (pandas' default); the cell that reloads this
# file drops the resulting "Unnamed: 0" column.
all_url_links.to_csv("Data/scam_url_links.csv", index=True)

## Scraping from each URL

In [51]:
# Reload the saved links. The CSV was written together with its row index,
# which comes back as an "Unnamed: 0" column that is not needed here.
saved_links = pd.read_csv("Data/scam_url_links.csv")
url_links = saved_links.drop(columns=["Unnamed: 0"])

In [57]:
# The stored links are site-relative paths; prefix each one with the site root
# to obtain an absolute URL that can be requested directly
header = "https://www.scamalert.sg"
full_url = [header + path for path in url_links['url']]

In [58]:
def tag_text(tag):
    """Return the text of a parsed tag, or '-' when the tag was not found (None)."""
    return tag.text if tag is not None else '-'


def nth_or_none(tags, n):
    """Return tags[n], or None when the list has no element at position n."""
    return tags[n] if len(tags) > n else None


# One list per output column; every valid page appends exactly one entry to each,
# so the four lists always stay the same length
scam_type = []
incident_description = []
scammer_details = []
submission_details = []

# The loop variables are deliberately not called `url` / `soup`, so the listing-page
# `url` and `soup` defined in earlier cells are not overwritten
for i, story_url in enumerate(full_url):
    print("S/N:", i+1)
    print("url:", story_url)

    # A timeout keeps one unresponsive request from hanging the whole run;
    # a failed request is reported and skipped rather than aborting the scrape
    try:
        page = requests.get(story_url, timeout=30)
    except requests.RequestException as err:
        print("Skipped, request failed:", err)
        continue
    story_soup = BeautifulSoup(page.text, 'html.parser')

    # Only extract the details if it is a valid webpage
    # A webpage is assumed to be valid if it contains the 'sf_colsIn jumbotron bg-dark' class in 'div' container
    if story_soup.find('div', class_="sf_colsIn jumbotron bg-dark") is None:
        continue

    # Look each element up once; missing elements become '-' instead of raising
    lead_paragraphs = story_soup.find_all('p', class_='lead')
    primary_links = story_soup.find_all('a', class_='text-primary')

    # First 'lead' paragraph: submission details
    submission_details.append(tag_text(nth_or_none(lead_paragraphs, 0)))
    # First 'text-primary' link: scam type
    scam_type.append(tag_text(nth_or_none(primary_links, 0)))
    # 'lead' div: incident description
    incident_description.append(tag_text(story_soup.find('div', class_="lead")))
    # Second 'lead' paragraph: scammer details
    scammer_details.append(tag_text(nth_or_none(lead_paragraphs, 1)))

S/N: 1
url: https://www.scamalert.sg/stories-details/Story-17Jul2020150318PM
S/N: 2
url: https://www.scamalert.sg/stories-details/Story-17Jul2020133721PM
S/N: 3
url: https://www.scamalert.sg/stories-details/Story-17Jul2020094730AM
S/N: 4
url: https://www.scamalert.sg/stories-details/Story-16Jul2020203342PM
S/N: 5
url: https://www.scamalert.sg/stories-details/Story-16Jul2020031601AM
S/N: 6
url: https://www.scamalert.sg/stories-details/Story-15Jul2020222224PM
S/N: 7
url: https://www.scamalert.sg/stories-details/Story-15Jul2020162116PM
S/N: 8
url: https://www.scamalert.sg/stories-details/Story-15Jul2020150200PM
S/N: 9
url: https://www.scamalert.sg/stories-details/Story-15Jul2020140600PM
S/N: 10
url: https://www.scamalert.sg/stories-details/Story-15Jul2020132017PM
S/N: 11
url: https://www.scamalert.sg/stories-details/Story-15Jul2020102421AM
S/N: 12
url: https://www.scamalert.sg/stories-details/Story-14Jul2020153904PM
S/N: 13
url: https://www.scamalert.sg/stories-details/Story-14Jul20201248

In [62]:
# Assemble the scraped fields column by column; the order of the keys is the column order
scam_columns = {
    'submission_details': submission_details,
    'scam_type': scam_type,
    'incident_description': incident_description,
    'scammer_details': scammer_details,
}
scam_df = pd.DataFrame(scam_columns)

# Display the assembled dataframe as the cell output
scam_df

Unnamed: 0,submission_details,scam_type,incident_description,scammer_details
0,Anonymous | 17 Jul 2020,Impersonation Scam,\r\n They call me by whatsapp ...,\r\n Name: I only know ...
1,Anonymous | 17 Jul 2020,Phishing Scam,\r\n it happened this morning 0...,\r\n Name: Ministry of ...
2,Anonymous | 17 Jul 2020,Phishing Scam,\r\n I rceived a call from a la...,\r\n Name: SIngtel\r\n ...
3,Anonymous | 16 Jul 2020,Impersonation Scam,\r\n details: I received a call...,\r\n Name: Singapore hi...
4,Anonymous | 16 Jul 2020,Phishing Scam,\n1) An impersonated junior technical staff ca...,"\r\n Name: SINGTEL, RON..."
...,...,...,...,...
123,Anonymous | 08 Jun 2020,Impersonation Scam,\nA caller by the name mike with a strong indi...,\r\n Name: SinGTEL\r\n ...
124,Anonymous | 07 Jun 2020,Internet Love Scam,\nLadies/gentlman \n\nplease be careful when y...,\r\n Name: CHINA PROFIL...
125,Anonymous | 06 Jun 2020,Impersonation Scam,\r\n Text message was sent to m...,\r\n Name: cid police o...
126,Anonymous | 06 Jun 2020,Phishing Scam,\r\n I received a call from 911...,\r\n Name: Singtel\r\n ...


## Save dataframe as a CSV file

In [70]:
# Write the raw scraped dataset to disk; the row index is included (pandas' default)
scam_df.to_csv("Data/scam_raw_dataset.csv", index=True)