<a href="https://www.kaggle.com/code/yutodennou/eda-worldwide-epidemics-cycle?scriptVersionId=148884099" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 🗃️Import Libs

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

# 🗂️Import Data

In [2]:
df = pd.read_csv("/kaggle/input/list-of-epidemics-and-pandemics-in-world-history/Chronological Table of Epidemic and Pandemic Events in human History.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate),Ref.
0,0,1350 BC plague of Megiddo,1350 BC,"Megiddo, land of Canaan","Amarna letters EA 244, Biridiya, mayor of Megi...",Unknown,[26]
1,1,Plague of Athens,430–426 BC,"Greece, Libya, Egypt, Ethiopia","Unknown, possibly typhus, typhoid fever or vir...","75,000–100,000",[27][28][29][30]
2,2,412 BC epidemic,412 BC,"Greece (Northern Greece, Roman Republic)","Unknown, possibly influenza",Unknown,[31]
3,3,Antonine Plague,165–180 (possibly up to 190),Roman Empire,"Unknown, possibly smallpox",5–10 million,[32][33]
4,4,Jian'an Plague,217,Han Dynasty,"Unknown, possibly typhoid fever or viral hemor...",Unknown,[34][35]


# 🔍EDA

## 📏Preprocessing

### Date

In [3]:
df.Date.unique()

array(['1350 BC', '430–426 BC', '412 BC', '165–180 (possibly up to 190)',
       '217', '249–262', '541–549', '590', '627–628', '638–639',
       '664–689', '698–701', '735–737', '746–747', '1346–1353',
       '1485–1551', '1489', '1510', '1519–1520', '1545–1548', '1557–1559',
       '1561–1562', '1563–1564', '1576–1580', '1582–1583', '1592–1596',
       '1592–1593', '1596–1602', '1600–1650', '1603', '1616–1620',
       '1629–1631', '1632–1635', '1633–1634', '1634–1640', '1636–1637',
       '1633–1644', '1647–1652', '1648', '1656–1658', '1663–1664',
       '1665–1666', '1668', '1675–1676', '1676–1685', '1677–1678', '1679',
       '1681', '1687', '1693', '1699', '1702', '1702–1703', '1707–1709',
       '1710–1712', '1713–1715', '1720–1722', '1721–1722', '1730',
       '1732–1733', '1733', '1735–1741', '1738', '1738–1739', '1741',
       '1743', '1759', '1760', '1762', '1763', '1770–1772', '1772',
       '1772–1773', '1775–1776', '1775–1782', '1778', '1788', '1789–1790',
       '1793', '

In [4]:
df['Date'] = df['Date'].replace('2021-2022', '2021–2022')

In [5]:
def get_start_date(date):
    if 'BC' in date:
        _date = date.split(' BC')[0]
        if '–' in _date:
            _date = int(_date.split('–')[0])*(-1)
        else:
            _date = int(_date)*(-1)
    elif ('BC' not in date) and ('–' in date):
        _date = int(date.split('–')[0])
    else:
        _date = int(date)
    return _date

In [6]:
def get_end_date(date):
    if 'BC' in date:
        _date = date.split(' BC')[0]
        if '–' in _date:
            _date = int(_date.split('–')[1][0:3])*(-1)
        else:
            _date = int(_date[0:3])*(-1)
    elif ('BC' not in date) & ('–' in date):
        _date = date.split('–')[1]
        if _date=='present':
            _date = 2023
        else:
            _date = int(_date[0:4])
    else:
        _date = int(date)
    return _date

In [7]:
df['start_date'] = df['Date'].apply(get_start_date)
df['end_date'] = df['Date'].apply(get_end_date)

### Location

In [8]:
countries_series = map(lambda s: s.split(', '), df.Location.unique())
countries = set([x for y in countries_series for x in y])
countries

{'Africa',
 'Americas',
 'Amsterdam',
 'Angola',
 'Angola and Democratic Republic of the Congo',
 'Argentina',
 'Asia',
 'Asia-Pacific',
 'Augsburg',
 'Australia',
 'Austria',
 'Balkans',
 'Bangladesh',
 'Barcelona',
 'Bilad al-Sham',
 'Bolivia',
 'Bombay',
 'Boston',
 'Brazil',
 'Britain (England) and later continental Europe',
 'British Isles',
 'British North America',
 'Buenos Aires',
 'Byzantine Empire',
 'Cambodia',
 'Canada',
 'Canada and United States',
 'Cartagena',
 'Central America',
 'Ceylon',
 'Chad',
 'Charleston',
 'Charleston and Philadelphia',
 'Chile',
 'China',
 'Colombia',
 'Congo Basin',
 'Copenhagen',
 'Croydon',
 'Cuba',
 'Czech Kingdom',
 'Cádiz',
 'Darfur',
 'Democratic Republic of the Congo',
 'Democratic Republic of the Congo and Uganda',
 'Denmark',
 'Dominican Republic',
 'East Africa',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'England',
 'Ethiopia',
 'Eurasia and North Africa',
 'Europe',
 'Europe and West Asia',
 'Fiji',
 'Flint',
 'France',
 'Fremantle',
 

### Extract dataframe of Worldwide Diseases

In [9]:
df_world = df[df['Location'].str.contains('Worldwide')]
df_world

Unnamed: 0.1,Unnamed: 0,Event,Date,Location,Disease,Death toll (estimate),Ref.,start_date,end_date
101,101,1847–1848 influenza epidemic,1847–1848,Worldwide,Influenza,Unknown,[143],1847,1848
104,104,Third cholera pandemic,1846–1860,Worldwide,Cholera,1 million+,[145],1846,1860
109,109,Third plague pandemic,1855–1960,Worldwide,Bubonic plague,12–15 million (India and China),[150][151],1855,1960
127,127,1889–1890 pandemic,1889–1890,Worldwide,Influenza or Human coronavirus OC43 / HCoV-OC4...,1 million,[169],1889,1890
140,140,1915 encephalitis lethargica pandemic,1915–1926,Worldwide,Encephalitis lethargica,500000,[182][183][184],1915,1926
142,142,1918 influenza pandemic ('Spanish flu'),1918–1920,Worldwide,Influenza A virus subtype H1N1 H1N1 virus,17–100 million,[186][187][188],1918,1920
147,147,1929–1930 psittacosis pandemic,1929–1930,Worldwide,Psittacosis,100+,[193],1929,1930
155,155,1957–1958 influenza pandemic ('Asian flu'),1957–1958,Worldwide,Influenza A virus subtype H2N2,1–4 million,[186][199][200],1957,1958
157,157,Seventh cholera pandemic,1961–1975,Worldwide,Cholera (El Tor strain),"36,000[citation needed]",[202],1961,1975
158,158,Hong Kong flu,1968–1970,Worldwide,Influenza A virus subtype H3N2 H3N2 virus,1–4 million,[186][199][200],1968,1970


## 📈Visualize

### Histogram of the Epidemic Cycle

In [10]:
fig = px.histogram(df_world, x="start_date", 
                   nbins=100, 
                   hover_name="Event", 
                   hover_data=["Date","Disease"], 
                   labels={"start_date": "year"},
                   title="When Epidemics Happened")
fig.update_layout(xaxis={'dtick':5})
fig.show()

In [11]:
cycle = abs(df_world.start_date.diff())
print(cycle.describe())
fig = px.histogram(cycle, 
                   nbins=30, 
                   labels={"value": "Year Cycle of Epidemics"},
                   title="Histogram of the Epidemic Cycle")
fig.show()

count    20.000000
mean      8.850000
std      10.126957
min       0.000000
25%       2.000000
50%       4.000000
75%       9.500000
max      34.000000
Name: start_date, dtype: float64


### How Long the Worldwide Epidemics Continued

In [12]:
continuity = abs(df_world.end_date - df_world.start_date)
print(continuity.describe())
fig = px.histogram(continuity, 
                   nbins=100 , 
                   labels={"value": "How Long Epidemic Continued"},
                   title="Histogram of Epidemics Continuity"
                  )
fig.show()

count     21.000000
mean      10.428571
std       23.610530
min        1.000000
25%        1.000000
50%        2.000000
75%        9.000000
max      105.000000
dtype: float64


# 🌞Summary

* An Average of epidemic cycle is about 8 years.  
* Almost of all epidemics continued for less 1 year.  