<a href="https://colab.research.google.com/github/tazar09/napoleon/blob/main/napoleon_battles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [4]:
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import datetime as dt
pd.set_option('display.max_rows', 500)
from urllib.parse import quote, unquote
import requests
from bs4 import BeautifulSoup

#Accessing Napoleon's list of battles.

In [5]:
url_list_of_battles = 'https://en.wikipedia.org/wiki/Military_career_of_Napoleon'

In [6]:
list_of_battles = pd.read_html(url_list_of_battles)[1]
list_of_battles.head()

Unnamed: 0,No,Date,Battle,Conflict,Opponent,Location,Outcome
0,1.0,29 Aug-19 Dec 1793,Siege of Toulon,War of the First Coalition,,French Republic,Victory
1,2.0,24-28 Apr 1794,Saorgio,War of the First Coalition,,Kingdom of Sardinia,Victory
2,3.0,21 September 1794,First Dego,War of the First Coalition,,Kingdom of Sardinia,Victory
3,4.0,5 Oct 1795,13 Vendémiaire,French Revolution,,French Republic,Victory
4,5.0,11-12 Apr 1796,Montenotte,War of the First Coalition,,Kingdom of Sardinia,Victory


In [7]:
url = 'https://en.wikipedia.org/wiki/Military_career_of_Napoleon'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

tables = soup.find_all('table')

# Iterate through the tables and look for the one with the specific class
for table in tables:
    if 'wikitable' in table.get('class', []):
        # This is the table with the class 'wikitable'
        # Now you can extract hyperlinks from this table
        hyperlinks = [a['href'] for a in table.find_all('a', href=True)]
        break

In [8]:
words_to_filter = ['Mamluk','Ottoman', 'Mediterranean', 'Sovereign','php?title','File:Flagge', 'Duchy', 'Sweden', 'cite', 'War', 'Kingdom', 'Spain', 'Habsburg', 'Revolution', 'onarch', 'Arm%C3%A9e_des_%C3%89migr%C3%A9s', 'Campaign', 'Empire', 'Russia', 'Hundred']
filter_hyperlinks = [link for link in hyperlinks if not any(word in link for word in words_to_filter)]

In [9]:
print(len(filter_hyperlinks), len(hyperlinks))

81 226


In [10]:
battle_links = ['https://en.wikipedia.org/' + link for link in filter_hyperlinks]
battle_links[0]

'https://en.wikipedia.org//wiki/Siege_of_Toulon_(1793)'

In [42]:
battle_links

['https://en.wikipedia.org//wiki/Siege_of_Toulon_(1793)',
 'https://en.wikipedia.org//wiki/Second_Battle_of_Saorgio_(1794)',
 'https://en.wikipedia.org//wiki/First_Battle_of_Dego',
 'https://en.wikipedia.org//wiki/13_Vend%C3%A9miaire',
 'https://en.wikipedia.org//wiki/Battle_of_Montenotte',
 'https://en.wikipedia.org//wiki/Battle_of_Millesimo',
 'https://en.wikipedia.org//wiki/Second_Battle_of_Dego',
 'https://en.wikipedia.org//wiki/Battle_of_Ceva',
 'https://en.wikipedia.org//wiki/Battle_of_Mondov%C3%AC',
 'https://en.wikipedia.org//wiki/Battle_of_Fombio',
 'https://en.wikipedia.org//wiki/Battle_of_Lodi',
 'https://en.wikipedia.org//wiki/Battle_of_Borghetto',
 'https://en.wikipedia.org//wiki/Siege_of_Mantua_(1796%E2%80%9397)',
 'https://en.wikipedia.org//wiki/Battle_of_Lonato',
 'https://en.wikipedia.org//wiki/Battle_of_Castiglione',
 'https://en.wikipedia.org//wiki/Battle_of_Rovereto',
 'https://en.wikipedia.org//wiki/Battle_of_Bassano',
 'https://en.wikipedia.org//wiki/Second_Battle

In [None]:
url_test_test = 'https://en.wikipedia.org//wiki/Battle_of_Saint-Dizier'
# pd.read_html(url_test_test)[3]

#Create function that converts into the right dataframe format.

In [19]:
def make_dataframe(arg_df):

  result_dic = {}
  result_dic0 = {}
  result_dic1 = {}

  for i in range(3,len(arg_df) - 1):

    if arg_df.iloc[i,0] == arg_df.iloc[i,1]:

      if arg_df.iloc[i,0] in ['Belligerents', 'Commanders and leaders', 'Strength', 'Casualties and losses']:
          key0 = arg_df.iloc[i,0]
          value0 = arg_df.iloc[i+1,0]
          key1 = arg_df.iloc[i,1]
          value1 = arg_df.iloc[i+1,1]
          result_dic0[key0] = value0
          result_dic1[key1] = value1

      else:
          pass

    else:
      key00 = arg_df.iloc[i,0]
      value00 = arg_df.iloc[i,1]
      result_dic[key00] = value00


  row0 = list(result_dic0.values())
  row1 = list(result_dic1.values())
  row00 = list(result_dic.values())

  keyy0 = list(result_dic0.keys())
  keyy1= list(result_dic1.keys())
  keyy00 = list(result_dic.keys())

  df_date_loc_result = pd.DataFrame(columns = ['a', 'b', 'c'], data = [keyy00[:3], row00[:3], row00[:3]]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_bell_comm_str_cas = pd.DataFrame(columns = ['d', 'e','f', 'g'], data = [keyy0, row0, row1]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_battle_first = pd.DataFrame(columns = ['What','First team', 'Second team'], data = [['Battle', arg_df.columns[0], arg_df.columns[0]]])

  df_no = pd.concat([df_battle_first, df_date_loc_result,df_bell_comm_str_cas], axis = 0)
  idx = [df_no.iloc[0,1].lower().replace(' ', '_')] * df_no.shape[0]
  df_no.index = idx
  return df_no

#Process each battle URL.

In [13]:
list_of_battles_dataframes = []

for battle in battle_links:

  try:
    print(f'Processed URL: {battle}')
    tables_list = pd.read_html(battle)
    table_battle = pd.DataFrame()

    len0 = tables_list[0].shape[0]
    len1 = tables_list[1].shape[0]
    len2 = tables_list[2].shape[0]
    len3 = tables_list[3].shape[0]

    lungimea = [len0, len1, len2, len3]
    max_index = lungimea.index(max(lungimea))

    table_battle = tables_list[max_index]     #try tables[0]

    list_of_battles_dataframes.append(table_battle)
    print(f"Processed URL: {battle}\n")

  except Exception as e:
    print(f'---------> Error processing the URL: {battle}: {e}')



Processed URL: https://en.wikipedia.org//wiki/Siege_of_Toulon_(1793)
Processed URL: https://en.wikipedia.org//wiki/Siege_of_Toulon_(1793)

Processed URL: https://en.wikipedia.org//wiki/Second_Battle_of_Saorgio_(1794)
Processed URL: https://en.wikipedia.org//wiki/Second_Battle_of_Saorgio_(1794)

Processed URL: https://en.wikipedia.org//wiki/First_Battle_of_Dego
Processed URL: https://en.wikipedia.org//wiki/First_Battle_of_Dego

Processed URL: https://en.wikipedia.org//wiki/13_Vend%C3%A9miaire
Processed URL: https://en.wikipedia.org//wiki/13_Vend%C3%A9miaire

Processed URL: https://en.wikipedia.org//wiki/Battle_of_Montenotte
Processed URL: https://en.wikipedia.org//wiki/Battle_of_Montenotte

Processed URL: https://en.wikipedia.org//wiki/Battle_of_Millesimo
Processed URL: https://en.wikipedia.org//wiki/Battle_of_Millesimo

Processed URL: https://en.wikipedia.org//wiki/Second_Battle_of_Dego
Processed URL: https://en.wikipedia.org//wiki/Second_Battle_of_Dego

Processed URL: https://en.wikip

In [16]:
list_of_battles_dataframes[6]

Unnamed: 0,Second Battle of Dego,Second Battle of Dego.1
0,Part of the Italian campaigns in the War of th...,Part of the Italian campaigns in the War of th...
1,The second battle of Dego,The second battle of Dego
2,"Date14–15 April 1796LocationDego, present-day ...","Date14–15 April 1796LocationDego, present-day ..."
3,Date,14–15 April 1796
4,Location,"Dego, present-day Italy.mw-parser-output .geo-..."
5,Result,French victory
6,Belligerents,Belligerents
7,French Republic,Habsburg monarchy Kingdom of Sardinia
8,Commanders and leaders,Commanders and leaders
9,".mw-parser-output .plainlist ol,.mw-parser-out...",Eugène Argenteau Josef Vukassovich


In [17]:
for i, loop in enumerate(list_of_battles_dataframes):
  print(i, (loop.shape[1] == 2))

0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
33 True
34 True
35 True
36 True
37 True
38 True
39 True
40 True
41 True
42 True
43 True
44 True
45 True
46 True
47 True
48 True
49 True
50 True
51 True
52 True
53 True
54 True
55 True
56 True
57 True
58 True
59 True
60 True
61 True
62 True
63 True
64 True
65 True
66 True
67 True
68 True
69 True
70 True
71 True
72 True
73 True


#testing

In [20]:
list_of_battles_dataframes_processed = [make_dataframe(battle) for battle in list_of_battles_dataframes[:2]]
pd.concat(list_of_battles_dataframes_processed, axis = 0)

Unnamed: 0,What,First team,Second team
siege_of_toulon,Battle,Siege of Toulon,Siege of Toulon
siege_of_toulon,Date,29 August – 19 December 1793,29 August – 19 December 1793
siege_of_toulon,Location,"Toulon, France.mw-parser-output .geo-default,....","Toulon, France.mw-parser-output .geo-default,...."
siege_of_toulon,Result,French Republican victory,French Republican victory
siege_of_toulon,Belligerents,French Republic,French Royalists French Federalists Great Bri...
siege_of_toulon,Commanders and leaders,Jean François Carteaux Jacques François Dugom...,Baron d'Imbert Samuel Hood Charles O'Hara (P...
siege_of_toulon,Strength,"32,000[1]","1,500 8,000 & 37 Ships 7,000 & 32 Ships 6,50..."
siege_of_toulon,Casualties and losses,"1,700 dead or wounded[2] 9 Ships of the Line s...","1,200 killed or wounded 700 killed or wounded..."
battle_of_saorgio,Battle,Battle of Saorgio,Battle of Saorgio
battle_of_saorgio,Date,24 to 28 April 1794,24 to 28 April 1794


In [21]:
make_dataframe(list_of_battles_dataframes[23])

ValueError: ignored

In [22]:
idx = [23, 25, 26, 29, 30, 36, 50]

In [27]:
error_battles = [list_of_battles_dataframes[i] for i in idx]
error_battles[0]

Unnamed: 0,French invasion of Malta,French invasion of Malta.1
0,Part of the Mediterranean campaign of 1798,Part of the Mediterranean campaign of 1798
1,Engraving depicting Malta's capitulation to Na...,Engraving depicting Malta's capitulation to Na...
2,Date10–12 June 1798 (2 days)LocationMalta and ...,Date10–12 June 1798 (2 days)LocationMalta and ...
3,Date,10–12 June 1798 (2 days)
4,Location,"Malta and Gozo.mw-parser-output .geo-default,...."
5,Result,French victory
6,Territorial changes,French occupation of Malta
7,Belligerents,Belligerents
8,France,Malta Knights Hospitaller / Order of St. John
9,Commanders and leaders,Commanders and leaders


In [None]:
url_test_test = 'https://en.wikipedia.org/wiki/French_invasion_of_Malta'

In [40]:
df_test = pd.read_html(url_test_test)
dftest = df_test[0]

In [41]:
make_dataframe(dftest)

ValueError: ignored

In [43]:
dftest

Unnamed: 0,French invasion of Malta,French invasion of Malta.1
0,Part of the Mediterranean campaign of 1798,Part of the Mediterranean campaign of 1798
1,Engraving depicting Malta's capitulation to Na...,Engraving depicting Malta's capitulation to Na...
2,Date10–12 June 1798 (2 days)LocationMalta and ...,Date10–12 June 1798 (2 days)LocationMalta and ...
3,Date,10–12 June 1798 (2 days)
4,Location,"Malta and Gozo.mw-parser-output .geo-default,...."
5,Result,French victory
6,Territorial changes,French occupation of Malta
7,Belligerents,Belligerents
8,France,Malta Knights Hospitaller / Order of St. John
9,Commanders and leaders,Commanders and leaders


In [None]:
def make_dataframe(arg_df):

  result_dic = {}
  result_dic0 = {}
  result_dic1 = {}

  for i in range(3,len(arg_df) - 1):      #skip the first 3 rows

    if arg_df.iloc[i,0] == arg_df.iloc[i,1]:  #these rows have data underneath and must be reshaped for collection

      if arg_df.iloc[i,0] in ['Belligerents', 'Commanders and leaders', 'Strength', 'Casualties and losses']:

          key0 = arg_df.iloc[i,0]
          value0 = arg_df.iloc[i+1,0]

          key1 = arg_df.iloc[i,1]
          value1 = arg_df.iloc[i+1,1]

          result_dic0[key0] = value0
          result_dic1[key1] = value1

      else:
          pass

    else:                                 #if not in that list this mean that the data is structured linearly and must be collected differently
      key00 = arg_df.iloc[i,0]
      value00 = arg_df.iloc[i,1]
      result_dic[key00] = value00


  row0 = list(result_dic0.values())
  row1 = list(result_dic1.values())
  row00 = list(result_dic.values())

  keyy0 = list(result_dic0.keys())
  keyy1= list(result_dic1.keys())
  keyy00 = list(result_dic.keys())

  df_date_loc_result = pd.DataFrame(columns = ['a', 'b', 'c'], data = [keyy00[:3], row00[:3], row00[:3]]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_bell_comm_str_cas = pd.DataFrame(columns = ['d', 'e','f', 'g'], data = [keyy0, row0, row1]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_battle_first = pd.DataFrame(columns = ['What','First team', 'Second team'], data = [['Battle', arg_df.columns[0], arg_df.columns[0]]])

  df_no = pd.concat([df_battle_first, df_date_loc_result,df_bell_comm_str_cas], axis = 0)
  idx = [df_no.iloc[0,1].lower().replace(' ', '_')] * df_no.shape[0]
  df_no.index = idx
  return df_no

#testing

In [28]:
errors = []
for i, abc in enumerate(list_of_battles_dataframes):
  try:
    make_dataframe(abc)
    # print(f'processed dataframe: No. {i}')
  except Exception as e:
    print(f'----Error at dataframe No. {i}')
    errors.append(i)
print(f'\n {errors}')

----Error at dataframe No. 23
----Error at dataframe No. 25
----Error at dataframe No. 26
----Error at dataframe No. 29
----Error at dataframe No. 30
----Error at dataframe No. 36
----Error at dataframe No. 50

 [23, 25, 26, 29, 30, 36, 50]


In [None]:
# concat_list = []
# for datafr in list_of_df:
#   concat_list.append(datafr)
# df_partial = pd.concat(concat_list, axis = 0)
# pd.set_option('display.max_rows', 500)
# df_partial

In [None]:
# from urllib.parse import quote
# enconded_url = quote(url1_battles[3], safe = ':/')
# enconded_url

# df = pd.read_html(url_1)[0]
# make_dataframe(df)

In [None]:
def make_dataframe(arg_df):

  result_dic = {}
  result_dic0 = {}
  result_dic1 = {}

  for i in range(3,len(arg_df) - 1):

    if arg_df.iloc[i,0] == arg_df.iloc[i,1]:

      if arg_df.iloc[i,0] in ['Belligerents', 'Commanders and leaders', 'Strength', 'Casualties and losses']:
          key0 = arg_df.iloc[i,0]
          value0 = arg_df.iloc[i+1,0]
          key1 = arg_df.iloc[i,1]
          value1 = arg_df.iloc[i+1,1]
          result_dic0[key0] = value0
          result_dic1[key1] = value1

      else:
          pass

    else:
      key00 = arg_df.iloc[i,0]
      value00 = arg_df.iloc[i,1]
      result_dic[key00] = value00


  row0 = list(result_dic0.values())
  row1 = list(result_dic1.values())
  row00 = list(result_dic.values())

  keyy0 = list(result_dic0.keys())
  keyy1= list(result_dic1.keys())
  keyy00 = list(result_dic.keys())

  df_date_loc_result = pd.DataFrame(columns = ['a', 'b', 'c'], data = [keyy00[:3], row00[:3], row00[:3]]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_bell_comm_str_cas = pd.DataFrame(columns = ['d', 'e','f', 'g'], data = [keyy0, row0, row1]).T.rename(columns = {0: 'What', 1:'First team', 2:'Second team'})
  df_battle_first = pd.DataFrame(columns = ['What','First team', 'Second team'], data = [['Battle', arg_df.columns[0], arg_df.columns[0]]])

  df_no = pd.concat([df_battle_first, df_date_loc_result,df_bell_comm_str_cas], axis = 0)
  idx = [df_no.iloc[0,1].lower().replace(' ', '_')] * df_no.shape[0]
  df_no.index = idx
  return df_no

In [None]:
pd.read_html('https://en.wikipedia.org/wiki/Ligny_(1815)')[1]

In [None]:
def try_read_html(url):
  try:
    table = pd.read_html(url)[0]
    return table
  except Exception as e:
    print(f'Error reading from {url}: {e}')
    return None

In [None]:
for url in url_battles:
  tables = try_read_html(url)
  if tables is not None:
    print (f'Tables extracted from {url}')