In [10]:
# Dependencies (install once): %pip install -q beautifulsoup4 pandas requests

In [167]:
import pandas as pd
pd.set_option('display.max_rows', 500)
import json

import requests
from bs4 import BeautifulSoup

In [5]:
# Wikipedia article whose image galleries are scraped below
test_url = "https://en.wikipedia.org/wiki/Road_signs_in_Malaysia"

In [16]:
# Download the article. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page be parsed as if it were the article.
response = requests.get(test_url, timeout=30)
response.raise_for_status()

print(response)

<Response [200]>


In [42]:
# Parse the downloaded HTML using the "html.parser" backend
soup = BeautifulSoup(response.text, "html.parser")

#### 1.0 Scrape all images that are attached to a gallery

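One approach is to read the `src` attribute of every `<img>` tag, e.g. `[img.get("src") for img in soup.find_all("img")]`. However, those images turned out to be very small, so we instead collect the links attached to the gallery images and use them later to retrieve versions with larger dimensions.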

In [58]:
# Number of <ul class="gallery mw-gallery-traditional"> elements found on the page
len(soup.findAll("ul", class_="gallery mw-gallery-traditional"))

30

In [75]:
# Collect one record per image link found in the article's galleries. Each
# record holds the absolute URL of the link target and the link's title.
image_records = []
for gallery in soup.find_all("ul", class_="gallery mw-gallery-traditional"):
    for anchor in gallery.find_all("a", class_="mw-file-description"):
        href = anchor.get("href")
        if not href:
            # Without a target there is nothing to resolve later; skip the anchor
            # rather than crash on `str + None`.
            continue

        image_records.append({
            "image_href": "https://en.wikipedia.org" + href,
            # `Tag.get` returns None (never raises) when the attribute is absent,
            # so a missing title is stored as JSON null.
            "image_title": anchor.get("title"),
        })

# Open the file once, in write mode: re-running this cell then replaces the
# previous scrape instead of appending a duplicate copy of every record.
with open('wikipedia-scrape-image-low-res.jsonl', 'w') as final:
    for record in image_records:
        json.dump(record, final)
        final.write('\n')

#### 2.0 Enrich dataset with Road Sign type and get High-res images

In [117]:
def get_high_res_href(url_img: str) -> str:
    """
    Resolve an image page URL (as scraped from the main Wikipedia article)
    to the URL of the higher resolution image.

    The page at `url_img` is downloaded and the link found inside its
    `<div class="fullImageLink">` element is returned as an absolute URL.

    ### Arguments
    - `url_img`: URL of the image page, e.g.
      `https://en.wikipedia.org/wiki/File:Jkr-ft1.svg`

    ### Returns
    Absolute URL of the higher resolution image.

    ### Raises
    - `requests.HTTPError`: the page answered with an HTTP error status
    - `ValueError`: the page contains no full-image link
    """
    # Function-scope import so this cell does not depend on an edit to the import cell
    from urllib.parse import urljoin

    # Bound the wait and refuse to parse error pages
    page = requests.get(url_img, timeout=30)
    page.raise_for_status()

    file_page = BeautifulSoup(page.text, "html.parser")

    # Walk div -> a -> href defensively so a page with an unexpected layout
    # yields a clear error naming the offending URL.
    container = file_page.find("div", class_="fullImageLink")
    link = container.find("a") if container is not None else None
    href = link.get("href") if link is not None else None
    if not href:
        raise ValueError(f"No full-resolution image link found on {url_img}")

    # The link is protocol-relative ("//upload.wikimedia.org/..."); resolving it
    # against the page URL supplies the scheme and also copes with links that
    # are already absolute.
    return urljoin(url_img, href)

In [118]:
# Smoke test: resolve one known image page to its higher resolution image URL
test_1 = get_high_res_href("https://en.wikipedia.org/wiki/File:Jkr-ft1.svg")
test_1

//upload.wikimedia.org/wikipedia/commons/5/54/Jkr-ft1.svg


'https://upload.wikimedia.org/wikipedia/commons/5/54/Jkr-ft1.svg'

In [119]:
# Load the scraped records (one JSON object per line) into a DataFrame
wiki_df = pd.read_json('wikipedia-scrape-image-low-res.jsonl', lines=True)
wiki_df.head(3)

Unnamed: 0,image_href,image_title
0,https://en.wikipedia.org/wiki/File:Jkr-ft1.svg,Federal roads route code shield
1,https://en.wikipedia.org/wiki/File:Jkr-k602.svg,State roads route code shield
2,https://en.wikipedia.org/wiki/File:E12-LLM.png,Expressway (toll road) route code shield


In [145]:
# Apply the get_high_res_href function to create a new column 'high_res_href'.
# NOTE: get_high_res_href performs an HTTP request, so this issues one request per row.
wiki_df['high_res_href'] = wiki_df['image_href'].apply(get_high_res_href)
wiki_df.head(3)

Unnamed: 0,image_href,image_title,high_res_href
0,https://en.wikipedia.org/wiki/File:Jkr-ft1.svg,Federal roads route code shield,https://upload.wikimedia.org/wikipedia/commons...
1,https://en.wikipedia.org/wiki/File:Jkr-k602.svg,State roads route code shield,https://upload.wikimedia.org/wikipedia/commons...
2,https://en.wikipedia.org/wiki/File:E12-LLM.png,Expressway (toll road) route code shield,https://upload.wikimedia.org/wikipedia/commons...


In [172]:
# Paragraph shared by every "Road markings" section.
_ROAD_MARKINGS_INTRO = (
    "Road markings in Malaysia primarily use thermoplastic and are white. "
    "Yellow markings are usually for road shoulders, construction or temporary markings and parking."
)

# Sentence shared by the three "Border sign" sections.
_BORDER_SIGN_DESCRIPTION = (
    "Border signs in Malaysia are green for international and state and blue for district."
)

# One entry per section: (exclusive upper bound of the dataframe index,
# road sign type, road sign description). Entries are ordered by bound, so the
# first entry whose bound exceeds the index identifies the section.
# Descriptions are written as plain strings (no triple-quoted blocks) so that
# source-code indentation and stray blank lines never end up in the dataset.
_ROAD_SIGN_SECTIONS = (
    (5, "State",
     "State roads use letters that correspond to each state. (refer here for state letter codes/plate numbers)"),
    (48, "Warning sign",
     "Malaysian warning signs are diamond-shaped or rectangular and are yellow and black or red and white in colour."),
    (70, "Prohibition sign",
     "Malaysia prohibition signs are round with red outline and black pictogram."),
    (82, "Mandatory sign",
     "Mandatory instruction signs are round with blue backgrounds and white pictogram. These are also used in signifying specific vehicle type lanes."),
    (89, "Speed Limit sign",
     "These signs show speed limit on roads."),
    (127, "Construction/Temporary sign",
     "The construction signs in Malaysia are diamond-shaped placed on rectangular sign and are orange and black in colour."),
    (155, "Information sign",
     "Malaysian information signs are blue."),
    (161, "Directional and distance sign: Motorcycle lane",
     "Malaysian motorcycle lane signs are blue."),
    (226, "Directional and distance sign: Expressway sign",
     "Expressway signs have a green background. If the sign is not located on an expressway but is leading to one, it will have a blue background with green box in it."),
    (234, "Directional and distance sign: Expressway signs (Old format)",
     "Outdated sign designs that are no longer in use."),
    (263, "Directional and distance sign: Non-tolled Federal, State and Municipal Roads",
     "\n".join([
         "Malaysian road signs are blue and used for federal, state and municipal roads.",
         "- Blue with white letters signs for major destinations",
         "- Maroon with white letters signs for recreational places/tourist spot",
         "- Blue with yellow letters signs for street names",
         "- White with green letters signs for specific places/buildings",
         "- Green with yellow letters signs for government buildings/institution",
         "- White with blue letters signs for residential area",
     ])),
    (268, "Directional and distance sign: Non-tolled Federal, State and Municipal Roads (Old format)",
     ""),
    (272, "Asian Highway route sign",
     "As part of the Asian Highway Network."),
    (273, "Border sign: International border sign", _BORDER_SIGN_DESCRIPTION),
    (275, "Border sign: State border sign", _BORDER_SIGN_DESCRIPTION),
    (279, "Border sign: District border sign", _BORDER_SIGN_DESCRIPTION),
    (293, "Institution and building sign",
     "\n".join([
         "These are other important signs in Malaysia such as government institutions and tourist destinations.",
         "",
         "- White with black letters for towns and other settlements.",
         "- Green with orange letters for government institutions.",
         "- White with green letters and Maroon with white letters for tourist destinations.",
     ])),
    (300, "Tourist destination sign",
     "Malaysian tourist destination signs are in maroon with white and black icons."),
    (303, "Weighing bridge sign",
     "There is also a signs for weighing bridge."),
    (308, "Road name sign",
     "Road name sign in Malaysia have many different colours and styles according the local authority to design with them."),
    (313, "Road markings: Centre lines",
     _ROAD_MARKINGS_INTRO + "\n\n" + "Centre lines divide the road into either direction."),
    (317, "Road markings: Edge lines",
     _ROAD_MARKINGS_INTRO + "\n\n" + "Edge lines are located at the edges of a road, whether there is a median or pavement or not."),
    (320, "Road markings: Lane dividers",
     _ROAD_MARKINGS_INTRO + "\n\n" + "Lane dividers divide road into lanes according to its designated width."),
    (330, "Road markings: Directional markings",
     _ROAD_MARKINGS_INTRO + "\n\n" + "Directional marking consists of arrows and lettering on the road."),
    (341, "Road markings: Other types", _ROAD_MARKINGS_INTRO),
    (342, "Bridge-related sign",
     "These signs are usually found on bridges."),
    (345, "River sign",
     "These signs are usually found on bridges."),
    (348, "Highway concessionaires border limit",
     "Many expressways/highways has a border limit."),
    (351, "Traffic Light codes",
     "Many traffic lights have codes in them."),
)


def map_road_sign_type(index: int) -> tuple:
    """
    Helper function that manually maps a dataframe index to the section of the
    Wikipedia article it belongs to. The index ranges are hard-coded; future
    coders can help make this more efficient.

    ### Arguments
    - `index`: Pandas dataframe index

    ### Returns
    A **tuple** with the following items:
    - Road Sign Type
    - Road Sign Description (free of source-code indentation and of
      leading/trailing whitespace; may be an empty string)

    Indexes at or beyond the last hard-coded bound map to `("Other", "")`.
    """
    for upper_bound, sign_type, description in _ROAD_SIGN_SECTIONS:
        if index < upper_bound:
            return sign_type, description

    return "Other", ""

In [174]:
# Keep `wiki_df` as scraped; the enrichment happens on a separate copy
wiki_df3 = wiki_df.copy()

# Look up the (type, description) pair for every index value, then split the
# pairs into the 'Road Sign Type' and 'Road Sign Description' columns
sign_types, sign_descriptions = zip(
    *(map_road_sign_type(row_index) for row_index in wiki_df3.index)
)
wiki_df3['Road Sign Type'] = sign_types
wiki_df3['Road Sign Description'] = sign_descriptions

In [175]:
# Preview the first rows, including the two newly added columns
wiki_df3.head(3)

Unnamed: 0,image_href,image_title,high_res_href,Road Sign Type,Road Sign Description
0,https://en.wikipedia.org/wiki/File:Jkr-ft1.svg,Federal roads route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...
1,https://en.wikipedia.org/wiki/File:Jkr-k602.svg,State roads route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...
2,https://en.wikipedia.org/wiki/File:E12-LLM.png,Expressway (toll road) route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...


In [190]:
# Print the frame's dimensions followed by a blank line, then the per-column summary
print(f"Wiki_df3 Shape: {wiki_df3.shape}", end="\n\n")

wiki_df3.info()

Wiki_df3 Shape: (365, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   image_href             365 non-null    object
 1   image_title            359 non-null    object
 2   high_res_href          365 non-null    object
 3   Road Sign Type         365 non-null    object
 4   Road Sign Description  365 non-null    object
dtypes: object(5)
memory usage: 14.4+ KB


### 3.0 Export `wiki_df3` to `.jsonl` format

In [181]:
# Work on a copy so that preparing the export leaves wiki_df3 untouched
wiki_df4 = wiki_df3.copy()

In [183]:
# rename columns
# Renaming by name (not by position) cannot mislabel data if the column order
# ever changes, and names that are already renamed or absent are simply
# ignored, so the cell can safely be re-run.
wiki_df4 = wiki_df4.rename(columns={
    'image_href': 'low_res_image_url',
    'image_title': 'image_description',
    'high_res_href': 'high_res_image_url',
    'Road Sign Type': 'road_sign_type',
    'Road Sign Description': 'road_sign_description',
})

wiki_df4.head(3)

Unnamed: 0,low_res_image_url,image_description,high_res_image_url,road_sign_type,road_sign_description
0,https://en.wikipedia.org/wiki/File:Jkr-ft1.svg,Federal roads route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...
1,https://en.wikipedia.org/wiki/File:Jkr-k602.svg,State roads route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...
2,https://en.wikipedia.org/wiki/File:E12-LLM.png,Expressway (toll road) route code shield,https://upload.wikimedia.org/wikipedia/commons...,State,State roads use letters that correspond to eac...


In [185]:
# drop `low_res_image_url` column
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps the
# cell re-runnable once the column is already gone.
wiki_df4 = wiki_df4.drop(columns='low_res_image_url', errors='ignore')

In [187]:
# Convert the DataFrame to a list of dictionaries (one per row, keyed by column name)
data_list = wiki_df4.to_dict(orient='records')

In [188]:
jsonl_file_path = 'wikipedia-malaysian-roadsigns-final.jsonl'

# Serialize every record with json.dumps and emit it as its own
# newline-terminated line of the .jsonl file
with open(jsonl_file_path, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in data_list)