In [1]:
# Copied this code cell from https://www.kaggle.com/code/shtrausslearning/bayesian-regression-house-price-prediction
# Import from the public `IPython.display` module; the former
# `IPython.core.display` location is deprecated for these names.
from IPython.display import display, HTML, Javascript

# Two-colour palette for the notebook theme: white and orange.
color_map = ['#FFFFFF','#FF5733']

# `prompt` is the last palette entry; it fills the `div.output_prompt` colour
# slot when the stylesheet template is formatted later in this cell.
prompt = color_map[-1]
# `main_color` fills every other colour slot of the template.
main_color = color_map[0]
strong_main_color = color_map[1]
custom_colors = [strong_main_color, main_color]

# CSS template for the notebook theme. It contains 11 positional `%s`
# placeholders that are filled, in order of appearance, by the `%` formatting
# applied where `notebook.css` is written further down in this cell:
#   1. an extra declaration inside `#notebook-container`
#   2. `#notebook-container` top-border colour
#   3. heading (h1-h6) colour
#   4. `div.input_area` background colour
#   5. `div.input_area` top-border colour
#   6. `div.input_prompt` colour
#   7. `div.output_prompt` colour
#   8-11. selected-cell / edit-mode marker and border colours
# Because the template goes through `%` formatting, any literal percent sign
# added to it must be written as `%%`.
css_file = '''
div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s;
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5);
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5);
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s;
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s;
    border-top: 2px solid %s;
}

div.input_prompt {
color: %s;
}

div.output_prompt {
color: %s; 
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s;
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s;
}

.edit_mode div.cell.selected:before {
background: %s;
}

.edit_mode div.cell.selected {
border-color: %s;

}
'''

def to_rgb(h):
    """Convert a six-digit hex colour string into an ``(r, g, b)`` tuple.

    Parameters
    ----------
    h : str
        Hex colour such as ``'FF5733'``. A leading ``'#'`` is accepted and
        ignored, so ``'#FF5733'`` works as well.

    Returns
    -------
    tuple of int
        The red, green and blue components, each in the range 0-255.

    Raises
    ------
    ValueError
        If any of the three two-character components is not valid hexadecimal
        (for example when the string is too short).
    """
    digits = h.lstrip('#')
    # Read the string as three consecutive two-character hex pairs.
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))

# Translucent (alpha 0.1) variant of the main colour; it fills the
# `div.input_area` background slot of the template.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % to_rgb(main_color[1:])

# Fill the template's 11 `%s` slots in order and write the stylesheet to disk.
# The `with` block guarantees the handle is closed (and the content flushed)
# before `notebook.css` is read back.
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % ('width: 95%;', main_color, main_color, main_color_rgba,
                              main_color, main_color, prompt, main_color, main_color,
                              main_color, main_color))

def nb():
    """Return the contents of ``notebook.css`` wrapped in a ``<style>`` tag.

    The result is an ``HTML`` display object, so returning it as the last
    expression of a cell injects the stylesheet into the rendered notebook.
    """
    # Close the file deterministically instead of leaving the handle open.
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
nb()

  from IPython.core.display import display, HTML, Javascript


<font size="+3" color="#000000"><b>1 <span style='color:#4285f4; font-weight:bold'>|</span> Importing Libraries </b></font><br><a id="1"></a>
- **For Data Processing**: sqlite3, numpy, pandas
- **For Data Visualization**: matplotlib, seaborn, plotly

In [2]:
import numpy as np
import pandas as pd
import sqlite3
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Hide Warnings
# NOTE: with no category or module given, this filter ignores every warning
# for the rest of the session, including deprecation notices raised by the
# libraries imported above.
import warnings
warnings.filterwarnings("ignore")



<font size="+3" color="#000000"><b>2 <span style='color:#4285f4; font-weight:bold'>|</span> Reading the Dataset </b></font><br><a id="2"></a>

In [3]:
# Read the whole `world` table into a DataFrame. The connection is closed in a
# `finally` clause so the database handle is released even if the query fails.
conn = sqlite3.connect('/kaggle/input/world-data/world.sqlite')
try:
    # `read_sql_query` already returns a DataFrame, so no re-wrapping is needed.
    df = pd.read_sql_query("SELECT * FROM world", conn)
finally:
    conn.close()

# Columns converted with `pd.to_numeric` below. The order is kept as-is because
# it also determines the row/column order of the correlation matrix later on.
numeric_columns = [
    'Density', 'ArmedForcesSize', 'CallingCode', 'Co2-Emissions',
    'Population', 'MinimumWage', 'LandArea', 'MaternalMortalityRatio',
    'LifeExpectancy', 'UnemploymentRate', 'Urban_population', 'Longitude',
    'AgriculturalLand', 'BirthRate', 'ForestedArea', 'CPI', 'CPIChange',
    'FertilityRate', 'GrossPrimaryEducationEnrollment', 'Latitude',
    'GrossTertiaryEducationEnrollment', 'GDP', 'GasolinePrice',
    'PopulationLaborForceParticipation', 'TaxRevenue', 'TotalTaxRate',
    'InfantMortality', 'OutOfPocketHealthExpenditure', 'PhysiciansPerThousand',
]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [4]:
# Show the first six countries transposed (one column per country) so every
# field is visible without horizontal scrolling, then report the frame size.
display(df.iloc[:6].T)
n_rows, n_cols = df.shape
print(f'shape: {n_rows} rows by {n_cols} columns')

Unnamed: 0,0,1,2,3,4,5
Country,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia
Density,60,105,18,26,17,104
Abbreviation,AF,AL,DZ,AO,AR,AM
AgriculturalLand,58.1,43.1,17.4,47.5,54.3,58.9
LandArea,652230,28748,2381741,1246700,2780400,29743
ArmedForcesSize,323000,9000,317000,117000,105000,49000
BirthRate,32.49,11.78,24.28,40.73,17.02,13.99
CallingCode,93,355,213,244,54,374
Capital,Kabul,Tirana,Algiers,Luanda,Buenos Aires,Yerevan
Co2-Emissions,8672,4536,150006,34693,201348,5156


shape: 107 rows by 35 columns


GDP per Capita can be incredibly useful to find patterns and draw insights from data, so I'm creating a new column for the GDP per Capita for each country.  
If you don't know what this is, here's a simple explanation I found on the internet:  
**GDP per Capita:** This is the total income of a country, divided by the number of people living in that country. GDP per capita shows how much money people make on average by working in that country.

In [5]:
# GDP per capita = total GDP divided element-wise by the population.
df['GDP_per_Capita'] = df['GDP'].div(df['Population'])

<font size="+2" color="#000000"><b style="font-weight:normal">2.1 <span style='color:#4285f4; font-weight:normal'>|</span> Dataset Statistics </b></font><br><a id="2.1"></a>

In [6]:
# Summary statistics with the first row of `describe()` dropped, transposed to
# one row per variable and shaded per row (axis=1) with a blue gradient.
(
    df.describe()
      .iloc[1:]
      .T
      .style
      .background_gradient(cmap='Blues', axis=1)
)

Unnamed: 0,mean,std,min,25%,50%,75%,max
Density,151.635514,212.896617,2.0,37.0,93.0,152.5,1380.0
AgriculturalLand,42.445794,19.683459,2.6,28.45,43.1,56.95,82.6
LandArea,891201.457944,2265370.752326,316.0,65455.0,238391.0,664404.0,17098240.0
ArmedForcesSize,188514.018692,432523.101455,1000.0,12000.0,40000.0,179000.0,3031000.0
BirthRate,20.372056,10.114282,6.4,10.95,17.95,28.5,46.08
CallingCode,352.878505,342.030858,1.0,85.0,243.0,461.0,1876.0
Co2-Emissions,222764.439252,994956.552811,495.0,6885.0,24796.0,100476.0,9893038.0
CPI,162.974393,130.434858,101.87,115.91,133.61,166.8,1344.19
CPIChange,4.838318,8.799063,-3.2,1.4,2.7,4.75,53.5
FertilityRate,2.694766,1.319017,0.98,1.665,2.22,3.535,6.91


<font size="+2" color="#000000"><b style="font-weight:normal">2.2 <span style='color:#4285f4; font-weight:normal'>|</span> Correlation Matrix </b></font><br><a id="2.2"></a>

In [7]:
# Pairwise correlations of every numeric column plus the derived GDP per capita,
# rendered as a tall heatmap.
corr_columns = numeric_columns + ["GDP_per_Capita"]
corr_matrix = df[corr_columns].corr()

fig = px.imshow(corr_matrix, color_continuous_scale="Blues")
fig.update_layout(height=1000)
fig.show()

<font size="+3" color="#000000"><b>3 <span style='color:#4285f4; font-weight:bold'>|</span> Exploratory Analysis </b></font><br><a id="3"></a>

In [8]:
# Base scatter of birth rate against physicians per thousand.
# `hover_data` takes column names; passing a DataFrame only worked because
# iterating a DataFrame yields its column names.
fig = px.scatter(df, x="PhysiciansPerThousand", y="BirthRate", hover_data=['Country'])
fig.update_traces(selector=dict(type='scatter'),
                  marker=dict(color='#ea4335'))

# Fit an OLS trendline against log(x) in a helper figure and borrow its line
# trace (index 1; index 0 is the marker trace), recoloured blue.
trendline = px.scatter(df, x="PhysiciansPerThousand", y="BirthRate",
                       trendline="ols", trendline_options=dict(log_x=True))
trendline.data[1]['line']['color'] = '#5591f5'
fig.add_trace(trendline.data[1])

fig.update_xaxes(title_text="Physicians per thousand")
fig.update_yaxes(title_text="Birth Rate")

# Single layout call: title plus the transparent "gridon" theme.
fig.update_layout(
    title='Scatterplot of Birth Rate & Physicians per Thousand',
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

**Birth Rate:**  Birth rate is an estimate of the number of individuals born per year per 1000 in the population.  
**Physicians Per Thousand:** A physician is a general term for a doctor who has earned a medical degree. This variable represents the number of physicians per thousand in the population of a country.

From this graph, we can see that there is a negative relationship between these two variables, and the correlation is strong, with $R^2=0.716194$.

But why?  
Shouldn't there be a positive relationship? If the number of physicians per thousand decreases, then the birth rate should also decrease. However, this isn't the case here. 

In [9]:
# One panel per mortality measure, both plotted against physicians per thousand.
# Each entry: (y column, y-axis title, marker colour).
panels = [
    ("InfantMortality", "Infant Mortality", '#ea4335'),
    ("MaternalMortalityRatio", "Maternal Mortality Ratio", '#34a853'),
]

fig = make_subplots(rows=1, cols=len(panels))

for col, (y_column, y_title, marker_color) in enumerate(panels, start=1):
    # Build the scatter and its OLS fit against log(x) with plotly express,
    # then copy both traces (0 = markers, 1 = trendline) into the subplot grid.
    # `hover_data` takes column names rather than a DataFrame.
    panel = px.scatter(df, x="PhysiciansPerThousand", y=y_column,
                       trendline="ols", trendline_options=dict(log_x=True),
                       hover_data=['Country'])
    fig.add_trace(go.Scatter(panel.data[0], marker_color=marker_color), row=1, col=col)
    fig.add_trace(go.Scatter(panel.data[1], line=dict(color='#5591f5')), row=1, col=col)

    fig.update_xaxes(title_text="Physicians per thousand", row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

fig.update_layout(
    showlegend=False,
    template='plotly_white',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

**Infant Mortality:** The infant mortality rate is the number of infant deaths for every 1000 live births.  
**Maternal Mortality Ratio:** The maternal mortality ratio is defined as the number of maternal deaths during a given time period per 100,000 live births during the same time period.

This one makes sense. The infant mortality rate and the maternal mortality ratio increase exponentially as the number of physicians per thousand decreases.  
The correlation is strong, $R^2 = 0.677729$  
This graph indicates that physicians are important for childbirth. Without them, the number of infant deaths and maternal deaths increases.  
Then why do countries with a low number of physicians (compared to their population) have a higher birth rate?  
Let's explore and find out!

In [10]:
# GDP per capita against two different predictors, one panel each.
# Each entry: (x column, x-axis title, fit against log(x)?, marker colour).
panels = [
    ("PhysiciansPerThousand", "Physicians per thousand", False, '#5591f5'),
    ("BirthRate", "Birth Rate", True, '#34a853'),
]

fig = make_subplots(rows=1, cols=len(panels))

for col, (x_column, x_title, use_log_x, marker_color) in enumerate(panels, start=1):
    # Build the scatter and its OLS fit with plotly express, then copy both
    # traces (0 = markers, 1 = trendline) into the subplot grid.
    # `hover_data` takes column names rather than a DataFrame.
    panel = px.scatter(df, x=x_column, y="GDP_per_Capita",
                       trendline="ols", trendline_options=dict(log_x=use_log_x),
                       hover_data=['Country'])
    fig.add_trace(go.Scatter(panel.data[0], marker_color=marker_color), row=1, col=col)
    fig.add_trace(go.Scatter(panel.data[1], line=dict(color='#ea4335')), row=1, col=col)

    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text="GDP per Capita", row=1, col=col)

fig.update_layout(
    showlegend=False,
    template='plotly_white',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

Now it's making some sense. In poor countries,
- The number of physicians per thousand is low.
- The birth rate is high.
- The infant mortality rate and maternal mortality ratio are high.

In poor countries, birth rates are high because children are needed to work and bring in an income for the family. Children are also required to look after elderly parents because of a lack of pensions.  
This is why, in poor countries, even though the number of physicians is low, the birth rate is high, leading to a high infant mortality rate and a high maternal mortality ratio.

In [11]:
# Natural log of GDP per capita, used only to colour the points. The vectorized
# `np.log` replaces a per-row `math.log` lambda, which raises ValueError as soon
# as one value is zero; `np.log` yields -inf for zero and NaN for negatives.
df['Log_GDP_per_Capita'] = np.log(df['GDP_per_Capita'])

# `hover_data` takes column names rather than a DataFrame.
fig = px.scatter(df, x="BirthRate", y="LifeExpectancy", color="Log_GDP_per_Capita",
                 trendline="ols", color_continuous_scale="sunset", hover_data=['Country'])

# The colour axis is in log units, so place ticks at the logs of round dollar
# amounts and label them with the un-logged values.
color_scale_label = "GDP per Capita<br>(log scale)<br> ."
color_scale_ticks = [math.log(val) for val in [100, 1000, 10000, 100000]]
color_scale_ticklabels = ["100", "1,000", "10,000", "100,000"]
fig.update_coloraxes(
    colorbar_title=color_scale_label,
    colorbar_ticks="outside",
    colorbar_tickvals=color_scale_ticks,
    colorbar_ticktext=color_scale_ticklabels,
)

fig.update_xaxes(title_text="Birth Rate")
fig.update_yaxes(title_text="Life Expectancy")

fig.update_layout(
    title='Scatterplot of BirthRate & LifeExpectancy',
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

There is a negative linear relationship between life expectancy and birth rate. This is because countries with higher birth rates tend to have a lower GDP per Capita, so the people in these countries cannot afford the essentials needed to maintain a good lifestyle, such as healthcare and a clean environment. Therefore, the life expectancy is low.  
Sierra Leone and Nigeria are the 2 countries with the lowest life expectancies, just 54.3 years!  

In [12]:
# Base scatter only. No trendline is requested here: the single OLS line is
# added below from a helper figure. Requesting one here as well drew the same
# fit twice (once in the default colour, once in red).
# `hover_data` takes column names rather than a DataFrame.
fig = px.scatter(df, x="GDP_per_Capita", y="LifeExpectancy", hover_data=['Country'])

# Fit life expectancy against log(GDP per capita) and borrow the line trace
# (index 1; index 0 is the marker trace), recoloured red.
trendline = px.scatter(df, x="GDP_per_Capita", y="LifeExpectancy",
                       trendline="ols", trendline_options=dict(log_x=True))
trendline.data[1]['line']['color'] = '#ea4335'
fig.add_trace(trendline.data[1])

fig.update_xaxes(title_text="GDP per Capita")
fig.update_yaxes(title_text="Life Expectancy")

fig.update_layout(
    title='Scatterplot of GDP per Capita & Life Expectancy',
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

Life expectancy and GDP per Capita have a strong correlation.   
Or in other words, poor countries have a lower life expectancy compared to rich countries.

In [13]:
# Ten countries with the highest GDP per capita, largest first.
data = (
    df[['Country', 'GDP_per_Capita']]
    .sort_values('GDP_per_Capita', ascending=False)
    .head(10)
)
colors = ['#a1c2fa', '#f28e86', '#fde396', '#aedcba', '#a1c2fa', '#f28e86', '#fcd050', '#5db975', '#5591f5', '#ea4335']

# Horizontal bars are drawn bottom-to-top, so reverse the ranking once to put
# the richest country at the top of the chart.
bottom_up = data.iloc[::-1]
bar_labels = bottom_up['GDP_per_Capita'].map(lambda x: "$" + str(round(x / 1000)) + 'k')

fig = go.Figure(go.Bar(
    x=bottom_up['GDP_per_Capita'],
    y=bottom_up['Country'],
    marker=dict(color=colors),
    orientation='h',
    text=bar_labels,
    textposition='outside',
))
fig.update_layout(
    title='Top 10 Richest Countries in the World (by GDP per Capita)',
    xaxis_title="GDP per Capita",
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

Why isn't USA present on this list? The author didn't include USA in this dataset, that's why (if you're the author, it would be great if you could add it to the dataset). Otherwise USA would be second on this list, with a GDP per Capita of 70,000 USD.

In [14]:
# Ten most populous countries, largest first.
data = (
    df[['Country', 'Population']]
    .sort_values('Population', ascending=False)
    .head(10)
)
colors = ['#a1c2fa', '#f28e86', '#fde396', '#aedcba', '#a1c2fa', '#f28e86', '#fcd050', '#5db975', '#5591f5', '#ea4335']

# Horizontal bars are drawn bottom-to-top, so reverse the ranking once to put
# the most populous country at the top of the chart.
bottom_up = data.iloc[::-1]
# Label each bar with the population truncated to whole millions.
bar_labels = bottom_up['Population'].map(lambda x: str(int(x / 1000000)) + "M")

fig = go.Figure(go.Bar(
    x=bottom_up['Population'],
    y=bottom_up['Country'],
    marker=dict(color=colors),
    text=bar_labels,
    textposition='outside',
    orientation='h',
))
fig.update_layout(
    title='Top 10 Most Populated Countries in the World',
    xaxis_title="Population Size",
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

# Split the total population into the `top_n` largest countries plus a single
# "Others" slice that sums everyone else.
top_n = 2
df_sorted = df.sort_values('Population', ascending=False)
top_countries = df_sorted.iloc[:top_n]
other_population = df_sorted['Population'].iloc[top_n:].sum()

others_row = pd.DataFrame({'Country': ['Others'], 'Population': [other_population]})
new_df = pd.concat([top_countries, others_row])

colors = ['#4285f4', '#ea4335', '#34a853']
donut = go.Pie(
    labels=new_df['Country'],
    values=new_df['Population'],
    textinfo="label+percent",
    pull=[0, 0, 0],
    showlegend=False,
    marker_colors=colors,
)
fig = go.Figure(data=[donut])

# A hole turns the pie into a donut, leaving room for the centred annotation.
fig.update_traces(hole=0.6)

centre_note = dict(
    text="Almost Half of the Earth's <br>Population lives in <br> India & China alone!!",
    x=0.5, y=0.5,
    font_size=13,
    showarrow=False,
)
fig.update_layout(
    title='Population distribution of Countries',
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    annotations=[centre_note],
)

fig.show()

In [15]:
# CO2 emissions against urban and total population, one panel each.
# Each entry: (x column, which is also used as the x-axis title, marker colour).
panels = [
    ("Urban_population", '#34a853'),
    ("Population", '#4285f4'),
]

fig = make_subplots(rows=1, cols=len(panels))

for col, (x_column, marker_color) in enumerate(panels, start=1):
    # Build the scatter and its linear OLS fit with plotly express, then copy
    # both traces (0 = markers, 1 = trendline) into the subplot grid.
    # `hover_data` takes column names rather than a DataFrame.
    panel = px.scatter(df, x=x_column, y="Co2-Emissions",
                       trendline="ols", trendline_options=dict(log_x=False),
                       hover_data=['Country'])
    fig.add_trace(go.Scatter(panel.data[0], marker_color=marker_color), row=1, col=col)
    fig.add_trace(go.Scatter(panel.data[1], line=dict(color='#ea4335')), row=1, col=col)

    fig.update_xaxes(title_text=x_column, row=1, col=col)
    fig.update_yaxes(title_text="Co2-Emissions", row=1, col=col)

fig.update_layout(
    showlegend=False,
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

Both of these two plots have a strong positive correlation.  
- $R^2$ for urban population = 0.880822
- $R^2$ for population = 0.697078

Nothing unusual here... the greater the size of a country's population, the greater its carbon dioxide emissions.  
As the correlation is stronger for urban population, it is a better indicator of the carbon dioxide emissions of a country.  
If you hover over the regression line on the graphs, you can see the equation of the lines.
- Gradient of regression line for `Urban_population` = 0.00980952
- Gradient of regression line for `Population` = 0.004327502

This means that a country's carbon dioxide emissions increase by a larger factor (about 2.3x) per additional urban resident than per additional resident of the general population.  
Therefore, as expected, urbanization is more responsible for the carbon dioxide emissions of a country.

In [16]:
# Ten largest countries by land area, largest first.
data = (
    df[['Country', 'LandArea']]
    .sort_values('LandArea', ascending=False)
    .head(10)
)
colors = ['#a1c2fa', '#f28e86', '#fde396', '#aedcba', '#a1c2fa', '#f28e86', '#fcd050', '#5db975', '#5591f5', '#ea4335']

# Horizontal bars are drawn bottom-to-top, so reverse the ranking once to put
# the largest country at the top of the chart.
bottom_up = data.iloc[::-1]

fig = go.Figure(go.Bar(
    x=bottom_up['LandArea'],
    y=bottom_up['Country'],
    marker=dict(color=colors),
    orientation='h',
))
fig.update_layout(
    title='Top 10 Biggest Countries in the World by Land Area',
    xaxis_title="Land Area in square kilometers",
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(l=220)  # widen the left margin so long country names fit
)
fig.show()

In [17]:
# Histogram of fertility rate with a marginal box plot above it.
# `hover_data` takes column names; passing a DataFrame only worked because
# iterating a DataFrame yields its column names.
fig = px.histogram(df, x="FertilityRate", marginal="box", nbins=20, hover_data=['Country'])

# Marginal box plot: show every point plus the mean and standard deviation.
fig.update_traces(
    selector=dict(type='box'),
    boxpoints='all',
    boxmean='sd',
    marker=dict(color='#ea4335'),
)

# Histogram bars in blue.
fig.update_traces(
    selector=dict(type='histogram'),
    marker_color='#4285f4',
)

fig.update_layout(
    title='Boxplot of Fertility Rate of Countries',
    xaxis_title="Fertility Rate",
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig.show()

From [Britannica](https://www.britannica.com/topic/fertility-rate),
> The single most important factor in population growth is the <b><mark>total fertility rate</mark></b> (TFR). If, on average, women give birth to <b><mark>2.1</mark></b> children and these children survive to the age of 15, any given woman will have replaced herself and her partner upon death. A TFR of 2.1 is known as the replacement rate. Generally speaking, when the TFR is greater than 2.1, the population in a given area will increase, and when it is less than 2.1, the population in a given area will eventually decrease, though it may take some time because factors such as age structure, emigration, or immigration must be considered. 

From the boxplot:
- Median = $2.22$
- Mean $ \pm $ standard deviation = $2.69\pm 1.31$

Both the mean and the median are above 2.1 (the replacement rate). But can we conclude that this is evidence that human civilization is on the path to overpopulation?  
No. We could have drawn that conclusion if the population of each country were the same, which isn't true. We need to calculate the average fertility rate across every person on Earth to draw this conclusion.  
So, we need to weight the fertility rate of each country by its population size and then take the average of the weighted values to get the average fertility rate per human on Earth.  

In [18]:
# Population-weighted mean fertility rate.
# Only countries where both values are known may contribute. Without the mask,
# a country with a missing fertility rate is skipped in the numerator (NaN
# products are ignored by `sum`) but its population still counts in the
# denominator, which biases the mean downwards.
has_both = df['FertilityRate'].notna() & df['Population'].notna()
weighed_values = (df.loc[has_both, 'FertilityRate'] * df.loc[has_both, 'Population']).sum()
total_population = df.loc[has_both, 'Population'].sum()
mean = weighed_values / total_population
print(mean)

2.4374499265489247


And we've got our answer!!  
The average fertility rate of each human on Earth is $2.437$, which is higher than the replacement rate ($2.1$).  
Therefore, this is evidence that we are on the road to population growth.

We can use the data about each country's fertility rate to predict which countries will experience a population growth in the future.  
Let's subtract $2.1$ from fertility rate and call it 'population growth indicator'.  
If this value is more than zero, the country will experience a population growth, otherwise if this value is less than zero the country's population will decrease.  

In [19]:
# Indicator = fertility rate minus the replacement rate (2.1), rounded to three
# significant digits so the bar labels stay short.
raw_indicator = df['FertilityRate'] - 2.1
df['population_growth_indicator'] = raw_indicator.apply(lambda x: float('{:.3g}'.format(x)))

# One horizontal bar per country, labelled with its indicator value.
fig = px.bar(df, x='population_growth_indicator', y='Country', text='population_growth_indicator', orientation='h')

# Red for countries below the replacement rate, blue for all others.
bar_colors = (
    df['population_growth_indicator']
    .lt(0)
    .map({True: '#ea4335', False: '#4285f4'})
    .tolist()
)
fig.update_traces(marker_color=bar_colors, textposition='outside')

# Tall figure so every country gets its own readable row.
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    height=2600,
    title="Population Growth Indicator for Countries",
    template='gridon',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(l=220)  # widen the left margin so long country names fit
)

fig.show()

You can use this graph to predict which countries will experience a population growth in the next few decades, if the fertility rate remains approximately the same.  
You cannot, however, use this graph to predict which countries will face an <b><mark>overpopulation crisis</mark></b> or an <b><mark>underpopulation crisis</mark></b> because there are other factors to consider. For example, a country might have a high fertility rate, but that doesn't mean it will experience an overpopulation crisis, because it might have a large elderly population who will pass away soon, or it might have a large land area that can accommodate a large number of people. And this is just one example where land area or the age distribution of the population is a factor to consider. There can be many more factors to consider when making these kinds of predictions.

*That's it for now...
Yes I know I haven't done any analysis on some variables, for example:*
- *Agricultural Land*
- *Armed Forces Size*
- *Gasoline Price*
- *Tax Revenue*  
etc...

*I'll do it later when I get some time.  
If you're reading this, and it's been a while since I've published this notebook, it is possible that I forgot to update it. Maybe you can remind me through a comment?  
Thanks for reading by the way. Hope you enjoyed it!*