# Web Scraping

In [23]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Extracting one dynasty

In [24]:
# Download the Han-dynasty timeline page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed later as if it were the timeline.
URL = "https://en.wikipedia.org/wiki/Timeline_of_the_Han_dynasty"
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [11]:
# Parse the downloaded HTML using the "html.parser" backend.
soup = BeautifulSoup(page.content, "html.parser")

In [7]:
# Collect every <table> element that carries the "wikitable" CSS class.
tables = soup.find_all("table", class_="wikitable")

In [8]:
# Peek at one parsed table to see the row layout (Year / Date / Event columns).
tables[1]

<table class="wikitable" style="width:100%;">
<tbody><tr>
<th style="width:6%">Year</th>
<th style="width:10%">Date</th>
<th>Event
</th></tr>
<tr>
<td>197 BC</td>
<td></td>
<td>The <a href="/wiki/Xiongnu" title="Xiongnu">Xiongnu</a> invade <a href="/wiki/Dai_Commandery" title="Dai Commandery">Dai Commandery</a> with the help of <a href="/wiki/Chen_Xi_(rebel)" title="Chen Xi (rebel)">Chen Xi</a> and <a class="mw-redirect" href="/wiki/Xin,_King_of_Han" title="Xin, King of Han">Han Xin</a><sup class="reference" id="cite_ref-FOOTNOTEChang2007143_4-0"><a href="#cite_note-FOOTNOTEChang2007143-4">[4]</a></sup>
</td></tr>
<tr>
<td rowspan="2" style="vertical-align:top;">196 BC</td>
<td></td>
<td><a href="/wiki/Emperor_Gaozu_of_Han" title="Emperor Gaozu of Han">Emperor Gaozu of Han</a> replaces nine of the ten <a href="/wiki/Kings_of_the_Han_dynasty" title="Kings of the Han dynasty">Kings of the Han dynasty</a> with his brothers and sons<sup class="reference" id="cite_ref-FOOTNOTETwitchett20081

In [25]:
# Module-level leftovers. add_row_data() keeps all of its state in locals, so
# none of these values are read by it; they stay defined only in case another
# cell refers to them.
data = []
count = 0
previous_year_value = -1
old_year_value = -1


def _cell_text(cell):
    """Return a cell's visible text with every whitespace run collapsed to one space."""
    return " ".join(cell.get_text().split())


def add_row_data(table):
    """Collect the events of one timeline table, grouped by year.

    Rows are read in document order and come in two shapes:

    * three ``<td>`` cells -- year, date, event. The date cell is ignored.
      A blank year cell means "same year as the row above".
    * two ``<td>`` cells -- date, event. The year cell is missing because the
      year cell of an earlier row spans several rows (``rowspan``), so the
      event belongs to the most recent year seen.

    Rows with any other number of ``<td>`` cells (for example the header row,
    which only has ``<th>`` cells) are skipped, as are event rows that appear
    before any year is known.

    Parameters
    ----------
    table : a parsed ``<table>`` element exposing ``tbody``, ``find_all`` and,
        on its cells, ``get_text`` (a BeautifulSoup ``Tag``).

    Returns
    -------
    dict
        Maps each year string to the list of event descriptions found for it,
        in the order the rows appear.

    Notes
    -----
    ``<sup>`` elements (footnote markers such as ``[4]``) are removed from the
    event cells of the parsed tree itself, so ``table`` is modified.
    """
    events_by_year = {}
    current_year = None  # year the next event row belongs to

    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = table.tbody if table.tbody is not None else table

    for tr in body.find_all("tr"):
        cells = tr.find_all("td")

        if len(cells) == 3:
            year = _cell_text(cells[0])
            if year:
                current_year = year
            event_cell = cells[2]
        elif len(cells) == 2:
            event_cell = cells[1]
        else:
            continue

        if current_year is None:
            # No year has been seen yet, so there is no key to file this under.
            continue

        # Drop every footnote marker, not just the first one.
        for sup in event_cell.find_all("sup"):
            sup.extract()

        # setdefault keeps events already recorded for this year.
        events_by_year.setdefault(current_year, []).append(_cell_text(event_cell))

    return events_by_year

In [26]:
def convert_year_period(year):
    addition = {
        "1": "st century",
        "2": "nd century",
        "3": "rd century",
        "4": "th century"
    }
    process_year = year.split(" ")
    # len(process_year) == 2 => Always BC
    # len(process_year) == 1 => It is always AD
    century = str(int(process_year[0]) // 100 + 1)
    get_addition = addition.get(century[-1]) if century[-1] == "1" or century[-1] == "2" or century[-1] == "3" else addition["4"]
    
    if len(process_year) == 2:
        return str(century) + get_addition + " " + "BC"
    elif len(process_year) == 1:
        return str(century) + get_addition

In [27]:
# Spot-check the parser on a single table.
# NOTE(review): the saved output below lists Qing-dynasty events, so `tables`
# presumably held the Qing page when this cell last ran -- the result depends
# on which page was fetched most recently. Re-run top to bottom to confirm.
add_row_data(tables[6])

{'1636': ['Hong Taiji proclaims the Qing dynasty',
  'Qing invasion of Joseon: Hong Taiji invades Joseon'],
 '1637': ['Qing invasion of Joseon: Joseon is defeated and becomes a Qing tributary'],
 '1638': ['Qing dynasty conquers Shandong'],
 '1639': ['Qing dynasty attacks the Daur and Solon people']}

In [12]:
# Flatten every table of the current page into column lists, one entry per
# event. "period" is the century label computed from the year string.
data_dict = {
    "year": [],
    "description": [],
    "period": [],
    "current_country": [],
    "country": []
}

COUNTRY = "China"
country_name = "Han"
# idx_title is incremented once per table but is never read in this cell.
idx_title = 0
for table in tables:
    # table_data maps a year string to the list of events for that year.
    table_data = add_row_data(table)
    for key in table_data:
        for content in table_data[key]:
            data_dict["year"].append(key)
            data_dict["description"].append(content)
            data_dict["period"].append(convert_year_period(key))
            data_dict["current_country"].append(COUNTRY)
            data_dict["country"].append(country_name)
    idx_title += 1

In [13]:
# Turn the column lists into a DataFrame and display it.
df = pd.DataFrame(data_dict)
df

Unnamed: 0,year,description,period,current_country,country
0,202 BC,Liu Bang becomes emperor of the Han dynasty (p...,3rd century BC,China,Han
1,202 BC,Emperor Gaozu of Han moves the capital from Lu...,3rd century BC,China,Han
2,201 BC,Battle of Baideng: Emperor Gaozu of Han's army...,3rd century BC,China,Han
3,201 BC,"Xin, King of Han defects to the Xiongnu",3rd century BC,China,Han
4,197 BC,The Xiongnu invade Dai Commandery with the hel...,2nd century BC,China,Han
...,...,...,...,...,...
248,219,Lü Meng's invasion of Jing Province: Sun Quan'...,3rd century,China,Han
249,220,Guan Yu is executed by Sun Quan,3rd century,China,Han
250,220,Cao Cao dies at Luoyang and is succeeded by hi...,3rd century,China,Han
251,220,The Nine-rank system is implemented,3rd century,China,Han


In [14]:
# Inspect all rows for one year to check that several events under the same
# year are all kept.
df[df["year"]=="65 BC"]

Unnamed: 0,year,description,period,current_country,country
97,65 BC,Han forces under Feng Fenshi force the king of...,1st century BC,China,Han
98,65 BC,Han vassalizes Qiuci,1st century BC,China,Han
99,65 BC,The Qiang revolt in eastern Tibet,1st century BC,China,Han


In [15]:
# Spot-check the century label for a BC year.
convert_year_period("202 BC")

'3rd century BC'

In [16]:
# Count missing values per column.
df.isnull().sum()

year               0
description        0
period             0
current_country    0
country            0
dtype: int64

In [17]:
# df.to_csv("china.csv")

### Testing with another dynasty

In [34]:
# Download and parse the Sui-dynasty timeline page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed as if it were the timeline.
URL = "https://en.wikipedia.org/wiki/Timeline_of_the_Sui_dynasty"
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
# Every <table> element that carries the "wikitable" CSS class.
tables = soup.find_all("table", class_="wikitable")

In [35]:
# Flatten every table of the current page into column lists, one entry per
# event. This resets data_dict, so rows collected earlier are discarded.
data_dict = {
    "year": [],
    "description": [],
    "period": [],
    "current_country": [],
    "country": []
}

COUNTRY = "China"
country_name = "Sui"
# idx_title is incremented once per table but is never read in this cell.
idx_title = 0
for table in tables:
    # table_data maps a year string to the list of events for that year.
    table_data = add_row_data(table)
    for key in table_data:
        for content in table_data[key]:
            data_dict["year"].append(key)
            data_dict["description"].append(content)
            data_dict["period"].append(convert_year_period(key))
            data_dict["current_country"].append(COUNTRY)
            data_dict["country"].append(country_name)
    idx_title += 1

In [20]:
# Turn the column lists into a DataFrame and display it.
df = pd.DataFrame(data_dict)
df

Unnamed: 0,year,description,period,current_country,country
0,581,Yang Jian (Emperor Wen of Sui) replaces the No...,6th century,China,Sui
1,582,Emperor Xuan of Chen dies and is succeeded by ...,6th century,China,Sui
2,583,Emperor Wen of Sui moves into Daxingcheng (Xi'...,6th century,China,Sui
3,584,Digs the Guangtong Canal,6th century,China,Sui
4,587,Annexes Western Liang,6th century,China,Sui
5,588,Launches expedition against the Chen dynasty,6th century,China,Sui
6,589,Takes Jiankang and annexes the Chen dynasty; s...,6th century,China,Sui
7,590,Yang Su crushes rebellions in annexed Chen ter...,6th century,China,Sui
8,592,Emperor Wen of Sui sends out commissioners to ...,6th century,China,Sui
9,593,The Cuanman rebel in Yunnan,6th century,China,Sui


### To events.csv

In [28]:
# Reference list of names. Each dynasty's position here matches its key in the
# `dynasties` mapping (index 3 is 'Han', index 8 is 'Qing').
['Japan', 'Joseon', 'Goryeo', 'Han', 'Sui', 'Tang', 'Song', 'Ming', 'Qing']

['Japan', 'Joseon', 'Goryeo', 'Han', 'Sui', 'Tang', 'Song', 'Ming', 'Qing']

In [29]:
# Maps a dynasty id (kept as a string) to the Wikipedia timeline page to scrape.
# NOTE(review): the Song URL ends in a "#:~:text=..." fragment, presumably left
# over from copying a highlighted browser link -- confirm it is not needed.
dynasties = {
    "3": "https://en.wikipedia.org/wiki/Timeline_of_the_Han_dynasty",
    "4": "https://en.wikipedia.org/wiki/Timeline_of_the_Sui_dynasty",
    "5": "https://en.wikipedia.org/wiki/Timeline_of_the_Tang_dynasty",
    "6": "https://en.wikipedia.org/wiki/Timeline_of_the_Song_dynasty#:~:text=The%20Song%20dynasty%20is%20commonly,dynasty%20(1115%E2%80%931234).",
    "7": "https://en.wikipedia.org/wiki/Timeline_of_the_Ming_dynasty",
    "8": "https://en.wikipedia.org/wiki/Timeline_of_the_Qing_dynasty"   
}

In [30]:
# Scrape every page listed in `dynasties` and flatten all events into column
# lists, tagging each row with the id of the dynasty it came from.
data_dict = {
    "year": [],
    "description": [],
    "period": [],
    "dynasty_id": []
}

for dynasty in dynasties:
    URL = dynasties[dynasty]
    # The timeout stops one stalled download from hanging the whole loop, and
    # raise_for_status() stops the run on an HTTP error instead of quietly
    # contributing zero rows for that dynasty.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table", class_="wikitable")
    # idx_title is incremented once per table but is never read in this cell.
    idx_title = 0
    for table in tables:
        # table_data maps a year string to the list of events for that year.
        table_data = add_row_data(table)
        for key in table_data:
            for content in table_data[key]:
                data_dict["year"].append(key)
                data_dict["description"].append(content)
                data_dict["period"].append(convert_year_period(key))
                data_dict["dynasty_id"].append(dynasty)
        idx_title += 1

In [31]:
# Turn the column lists into a DataFrame and display it.
df = pd.DataFrame(data_dict)
df

Unnamed: 0,year,description,period,dynasty_id
0,202 BC,Liu Bang becomes emperor of the Han dynasty (p...,3rd century BC,3
1,202 BC,Emperor Gaozu of Han moves the capital from Lu...,3rd century BC,3
2,201 BC,Battle of Baideng: Emperor Gaozu of Han's army...,3rd century BC,3
3,201 BC,"Xin, King of Han defects to the Xiongnu",3rd century BC,3
4,197 BC,The Xiongnu invade Dai Commandery with the hel...,2nd century BC,3
...,...,...,...,...
1847,1907,The territories of Manchuria are reorganized i...,20th century,8
1848,1907,Anhui governor Enming is assassinated by the a...,20th century,8
1849,1907,Empress Dowager Cixi declares her intention to...,20th century,8
1850,1907,An edict is passed to disband provincial banne...,20th century,8


In [32]:
# NOTE(review): mode='a' appends and header=False writes no column names, so
# every re-run of this cell adds the same rows to china.csv again. The default
# index is also written as the first column. Confirm this is intended.
df.to_csv("china.csv", mode='a', header=False)

In [33]:
# NOTE(review): mode='a' appends and header=False writes no column names, so
# every re-run of this cell adds the same rows to events.csv again. The default
# index is also written as the first column. Confirm this is intended.
df.to_csv("events.csv", mode='a', header=False)