# Web Scraping

### To events.csv

In [1]:
# !pip3 install beautifulsoup4

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
# https://realpython.com/beautiful-soup-web-scraper-python/

In [4]:
URL = "https://en.wikipedia.org/wiki/Timeline_of_Japanese_history"
page = requests.get(URL)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
h2_components = soup.find_all("h2")

In [7]:
h2_components[0]

<h2><span class="mw-headline" id="Paleolithic">Paleolithic</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Timeline_of_Japanese_history&amp;action=edit&amp;section=1" title="Edit section: Paleolithic">edit</a><span class="mw-editsection-bracket">]</span></span></h2>

In [8]:
titles = []
for h2 in h2_components:
    title_element = h2.find("span", class_="mw-headline").text
    titles.append(title_element)

AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
titles

['Paleolithic',
 '3rd century BC',
 '1st century',
 '2nd century',
 '3rd century',
 '4th century',
 '5th century',
 '6th century',
 '7th century',
 '8th century',
 '9th century',
 '10th century',
 '11th century',
 '12th century',
 '13th century',
 '14th century',
 '15th century',
 '16th century',
 '17th century',
 '18th century',
 '19th century',
 '20th century',
 '21st century',
 'See also',
 'References and notes',
 'Further reading',
 'External links']

In [10]:
titles.remove('See also')
titles.remove('References and notes')
titles.remove('Further reading')
titles.remove('External links')

In [11]:
len(titles)

23

In [12]:
# Events

In [13]:
tables = soup.find_all("table", class_="wikitable")

In [14]:
tables[1]

<table class="wikitable" width="100%">
<tbody><tr>
<th style="width:6%">Year</th>
<th style="width:10%">Date</th>
<th>Event
</th></tr>
<tr>
<td><a href="/wiki/300_BC" title="300 BC">300 BC</a></td>
<td></td>
<td>Mass migration from the Asian continent to the Japanese archipelago ushered in the <a href="/wiki/Yayoi_period" title="Yayoi period">Yayoi period</a>. Japan transitioned from a hunter-gatherer to a settled agricultural society. There was a mixture between the Yayoi immigrants and the indigenous population, and between new cultural influences and existing practices.
</td></tr></tbody></table>

In [15]:
data = []
count = 0
old_year_value = -1
def add_row_data(table):
    row_data = {}
    for tr in table.tbody.find_all("tr"):
        td = tr.find_all("td")
        data_row_length = len(td)  
        # there is two cases: data rows is 3 or 2 (where the column is expanded)
        if data_row_length == 3:
            # td[0].getText() is the value from the year column
            # td[2].text is the value from the event column
            # we can skip the date column
            row_data[td[0].getText().replace("\\n", "").strip()] = []
            row_data[td[0].getText().replace("\\n", "").strip()].append(td[2].text.replace("\\n", "").strip())
            old_year_value = td[0].text
        elif data_row_length == 2: 
            # old_year_value will have the value of the latest key from row_data
            # so we can do this directly
            row_data[old_year_value].append(td[1].text.replace("\\n", "").strip())
    return row_data

In [24]:
# titles: remember that now titles is a list of title such as 3rd Century BC
data_dict = {
    "year": [],
    "description": [],
    "period": [],
#     "current_country": [],
#     "country": []
    "dynasty_id": []
}

# COUNTRY = "Japan"
# country_name = "Japan"
idx_title = 0
for table in tables:
    table_data = add_row_data(table)
    for key in table_data:
        for content in table_data[key]:
            data_dict["year"].append(key)
            data_dict["description"].append(content)
            data_dict["period"].append(titles[idx_title])
            data_dict["dynasty_id"].append("0")
#             data_dict["current_country"].append(COUNTRY)
#             data_dict["country"].append(country_name)
    idx_title += 1

In [25]:
add_row_data(tables[6])

{'404': ['Goguryeo–Wa conflicts between Wa, Baekje, and Gaya against Goguryeo and Silla'],
 '413': ['King of Wa sends 1st recorded tribute to the Jin.'],
 '430': ['Yamato polity become a regional power after subjugating several states in West Japan. Details are subject to Mimana controversy.'],
 '461': ['Chronology of the Japanese historical records become consistent. All dates before this entry are reconstructed with foreign or archaeological data.',
  'Baekje sends an embassy to Japan, as confirmed by both Japanese and Korean records.']}

In [26]:
df = pd.DataFrame(data_dict)
df

Unnamed: 0,year,description,period,dynasty_id
0,14000 BC,First settlers arrived in the Japanese archipe...,Paleolithic,0
1,300 BC,Mass migration from the Asian continent to the...,3rd century BC,0
2,57,The King of Na gold seal is issued by Emperor ...,1st century,0
3,180,"The Civil war of Wa ends, bringing Shaman quee...",2nd century,0
4,201,"The Nagata Shrine, Hirota Shrine and Ikuta Shr...",3rd century,0
...,...,...,...,...
305,2018,The tourist boom in Japan reach unprecedented ...,21st century,0
306,2019,Emperor Akihito abdicated being the first Japa...,21st century,0
307,2020,The COVID-19 pandemic in Japan begins and the ...,21st century,0
308,2021,2020 Summer Olympics are held in Japan.,21st century,0


In [27]:
df[(df["year"] == "461") | (df["year"] == "1192") | (df["year"] == "1854") | (df["year"] == "1863")]

Unnamed: 0,year,description,period,dynasty_id
16,461,Chronology of the Japanese historical records ...,5th century,0
17,461,"Baekje sends an embassy to Japan, as confirmed...",5th century,0
102,1192,Kamakura became the de facto capital of Japan ...,12th century,0
103,1192,Minamoto no Yoritomo seized power from the cen...,12th century,0
203,1854,Second Visit. Matthew C. Perry returns to Japa...,19th century,0
204,1854,Matthew C. Perry signs the Convention of Kanag...,19th century,0
205,1854,The Ansei great earthquakes series starts with...,19th century,0
213,1863,Order to expel barbarians,19th century,0
214,1863,Battle of Shimonoseki Straits,19th century,0
215,1863,Bombardment of Kagoshima,19th century,0


In [28]:
df.tail()

Unnamed: 0,year,description,period,dynasty_id
305,2018,The tourist boom in Japan reach unprecedented ...,21st century,0
306,2019,Emperor Akihito abdicated being the first Japa...,21st century,0
307,2020,The COVID-19 pandemic in Japan begins and the ...,21st century,0
308,2021,2020 Summer Olympics are held in Japan.,21st century,0
309,2022,Former Prime Minister Shinzo Abe is assassinat...,21st century,0


In [29]:
df.isnull().sum()

year           0
description    0
period         0
dynasty_id     0
dtype: int64

In [30]:
df.to_csv("japan.csv")

In [32]:
df.to_csv("events.csv")