![image](https://www.hamiltonbook.com/media/image_full/1975250A.JPG)

### In this project, we'll work with data from the CIA World Factbook, which has statistics about all of the countries on Earth. The Factbook contains demographic information like:

    * population - The population as of 2023.
    * population_growth - The annual population growth rate
    * migration_rate - The annual rate of migration

In [1]:
import mysql.connector as sql

In [2]:
#create a connection to the db
conn = sql.connect(host = 'localhost', user = 'root', password = '', database = 'factbook')

In [3]:
#create cursor
cur = conn.cursor()

#show the tables
cur.execute('SHOW TABLES')

for x in cur:
    print(x)

('fact_table',)
('facts',)
('net_immigration',)
('population',)
('population_growth',)
('the_facts',)


In [4]:
#to show the structure of the tables
import pandas as pd

immigration = pd.read_sql_query('select * from net_immigration',conn)
pop = pd.read_sql_query('select * from population',conn)
pop_growth = pd.read_sql_query('select * from population_growth',conn)

In [5]:
immigration.head(2)

Unnamed: 0,name,slug,value,date_of_information,ranking,region
0,Syria,syria,45.78,2023 est.,1,Middle East
1,South Sudan,south-sudan,19.98,2023 est.,2,Africa


In [6]:
pop.head(2)

Unnamed: 0,name,slug,value,date_of_information,ranking,region
0,China,china,1413142846,2023 est.,1,East and Southeast Asia
1,India,india,1399179585,2023 est.,2,South Asia


In [7]:
#change population value from string to int
pop.value = pop.value.str.replace(',','')
pop.value = pop.value.apply(pd.to_numeric, errors='coerce')

In [8]:
pop_growth.head(2)

Unnamed: 0,name,slug,value,date_of_information,ranking,region
0,Syria,syria,6.39,2023 est.,1,Middle East
1,South Sudan,south-sudan,4.78,2023 est.,2,Africa


In [9]:
# join the three datasets to get one

query4 = '''SELECT p.name AS country,p.value AS population,p.ranking AS population_ranking,
            p.region AS region,n.value AS migration_rate,n.ranking AS migration_ranking,
            g.value AS population_growth,g.ranking AS growth_ranking
            FROM 
            population p
            JOIN population_growth g on p.name = g.name
            JOIN net_immigration n on n.name = p.name
            '''
df= pd.read_sql_query(query4,conn)
df.head()

Unnamed: 0,country,population,population_ranking,region,migration_rate,migration_ranking,population_growth,growth_ranking
0,Syria,22933531,59,Middle East,45.78,1,6.39,1
1,South Sudan,12118379,80,Africa,19.98,2,4.78,2
2,Niger,25396840,56,Africa,-0.62,129,3.66,3
3,Burundi,13162952,77,Africa,6.84,12,3.59,4
4,Equatorial Guinea,1737695,154,Africa,13.01,5,3.36,5


In [10]:
#change population from string to int
df.population = df.population.str.replace(',','')
df.population = df.population.apply(pd.to_numeric, errors='coerce')

# save the joined dataset to the database to work with it


In [11]:
cur.execute('DROP table fact_table')
conn.commit()

In [12]:
import pymysql
from sqlalchemy import create_engine

user = 'root'
password = ''
host = '127.0.0.1'
port = 3306
database = 'factbook'

engine = create_engine("mysql+pymysql://" + user + ":" + password + "@" + host + ":" + str(port) + "/" + database)

df.to_sql('fact_table', con = engine, if_exists = 'replace', index=False)

## Some analysis

In [13]:
#checking the maximum and minimum populations
query5 = '''SELECT MIN(population) AS min_pop,
            MAX(population) AS max_pop,
            MIN(population_growth) AS min_pop_growth,
            MAX(population_growth) max_pop_growth 
            FROM fact_table;
            '''
df1= pd.read_sql_query(query5,conn)
df1

Unnamed: 0,min_pop,max_pop,min_pop_growth,max_pop_growth
0,1440,1413142846,-2.31,6.39


In [14]:
#information about country with the max population
query6 = '''SELECT * 
            FROM fact_table
            WHERE population = (SELECT MAX(population) FROM fact_table)
            '''
df2= pd.read_sql_query(query6,conn)
df2

Unnamed: 0,country,population,population_ranking,region,migration_rate,migration_ranking,population_growth,growth_ranking
0,China,1413142846,1,East and Southeast Asia,-0.11,104,0.18,181


In [15]:
#information about top 10 countries in population
query7 = '''SELECT *
            FROM fact_table
            ORDER BY population DESC LIMIT 10
            '''
df3= pd.read_sql_query(query7,conn)
df3

Unnamed: 0,country,population,population_ranking,region,migration_rate,migration_ranking,population_growth,growth_ranking
0,China,1413142846,1,East and Southeast Asia,-0.11,104,0.18,181
1,India,1399179585,2,South Asia,0.12,79,0.7,127
2,United States,339665118,3,North America,3.01,39,0.68,129
3,Indonesia,279476346,4,East and Southeast Asia,-0.71,133,0.76,118
4,Pakistan,247653551,5,South Asia,-1.01,148,1.91,44
5,Nigeria,230842743,6,Africa,-0.21,112,2.53,20
6,Brazil,218689757,7,South America,-0.19,108,0.64,134
7,Bangladesh,167184465,8,South Asia,-2.88,180,0.91,99
8,Russia,141698923,9,Central Asia,1.7,53,-0.24,213
9,Mexico,129875529,10,North America,-0.77,136,0.61,141


In [16]:
#information about bottom 10 countries in population
query8 = '''SELECT *
            FROM fact_table
            ORDER BY population LIMIT 10
            '''
df4= pd.read_sql_query(query8,conn)
df4

Unnamed: 0,country,population,population_ranking,region,migration_rate,migration_ranking,population_growth,growth_ranking
0,Paracel Islands,1440,234,East and Southeast Asia,-0.66,131,0.75,120
1,Tokelau,1647,233,Australia and Oceania,-3.84,191,-0.01,198
2,Svalbard,2926,229,Europe,-5.57,207,-0.03,200
3,Saint Pierre and Miquelon,5195,227,North America,-7.12,216,-1.19,234
4,Montserrat,5440,226,Central America and the Caribbean,0.0,93,0.46,155
5,Saint Barthelemy,7093,225,Central America and the Caribbean,-1.27,152,-0.11,205
6,"Saint Helena, Ascension, and Tristan da Cunha",7935,224,Africa,0.0,96,0.11,186
7,Cook Islands,7939,223,Australia and Oceania,-26.2,230,-2.31,237
8,Nauru,9852,222,Australia and Oceania,-10.15,222,0.42,160
9,Tuvalu,11639,221,Australia and Oceania,-6.36,212,0.81,111


In [17]:
#top populated countries per region
query9= '''WITH new_table AS(
            SELECT country,region,population,
            row_number() OVER(
            PARTITION BY region ORDER BY population DESC) AS row_num
            FROM fact_table)
            SELECT country,region,population
            FROM
            new_table
            WHERE row_num=1
            ORDER BY population DESC
            '''
df5 = pd.read_sql_query(query9, conn)
df5

Unnamed: 0,country,region,population
0,China,East and Southeast Asia,1413142846
1,India,South Asia,1399179585
2,United States,North America,339665118
3,Nigeria,Africa,230842743
4,Brazil,South America,218689757
5,Russia,Central Asia,141698923
6,Iran,Middle East,87590873
7,Germany,Europe,84220184
8,Australia,Australia and Oceania,26461166
9,Guatemala,Central America and the Caribbean,17980803


In [18]:
#bottom populated countries per region
query10 = '''WITH new_table AS(
            SELECT country,region,population,
            row_number() OVER(
            PARTITION BY region ORDER BY population) AS row_num
            FROM fact_table)
            SELECT country,region,population
            FROM
            new_table
            WHERE row_num=1
            ORDER BY population
            '''
df6= pd.read_sql_query(query10,conn)
df6

Unnamed: 0,country,region,population
0,Paracel Islands,East and Southeast Asia,1440
1,Tokelau,Australia and Oceania,1647
2,Svalbard,Europe,2926
3,Saint Pierre and Miquelon,North America,5195
4,Montserrat,Central America and the Caribbean,5440
5,"Saint Helena, Ascension, and Tristan da Cunha",Africa,7935
6,Maldives,South Asia,389568
7,Suriname,South America,639759
8,Bahrain,Middle East,1553886
9,Turkmenistan,Central Asia,5690818


In [19]:
#countries with highest migration rates
query11 = '''SELECT country, population, migration_rate
            FROM fact_table
            ORDER BY migration_rate DESC LIMIT 10
            '''
df7 = pd.read_sql_query(query11,conn)
df7

Unnamed: 0,country,population,migration_rate
0,Syria,22933531,45.78
1,South Sudan,12118379,19.98
2,Venezuela,30518260,13.55
3,British Virgin Islands,39369,13.21
4,Equatorial Guinea,1737695,13.01
5,Cayman Islands,65483,12.34
6,Luxembourg,660924,11.39
7,Monaco,31597,10.89
8,Anguilla,19079,10.48
9,Turks and Caicos Islands,59367,8.42


In [20]:
#countries with highest population growth rates
query12 = '''SELECT country, population, population_growth
            FROM fact_table
            ORDER BY population_growth DESC LIMIT 10
            '''
df8 = pd.read_sql_query(query12,conn)
df8

Unnamed: 0,country,population,population_growth
0,Syria,22933531,6.39
1,South Sudan,12118379,4.78
2,Niger,25396840,3.66
3,Burundi,13162952,3.59
4,Equatorial Guinea,1737695,3.36
5,Angola,35981281,3.34
6,Benin,14219908,3.31
7,Uganda,47729952,3.22
8,"Congo, Democratic Republic of the",111859928,3.13
9,Chad,18523165,3.05


#### The top two countries in migration rate also have the highest population growth rates
#### All the top ten countries with highest growth rates are in Africa except Syria

In [21]:
# calculating the population density(first get the size of the countries)
area = pd.read_csv('area.csv')
area.head()

Unnamed: 0,name,slug,value,date_of_information,ranking,region
0,Russia,russia,17098242,,1,Central Asia
1,Antarctica,antarctica,14200000,,2,Antarctica
2,Canada,canada,9984670,,3,North America
3,United States,united-states,9833517,,4,North America
4,China,china,9596960,,5,East and Southeast Asia


In [22]:
#turn the area values into numeric 
area.value = area.value.str.replace(',','')
area.value =area.value.apply(pd.to_numeric, errors='coerce')

#take only needed columns
area = area[['name','value','ranking']]

#rename columns
area.rename(columns={'name':'country','ranking':'area_rank','value':'country_size'},inplace=True)
area.head()

Unnamed: 0,country,country_size,area_rank
0,Russia,17098242,1
1,Antarctica,14200000,2
2,Canada,9984670,3
3,United States,9833517,4
4,China,9596960,5


In [23]:
# combine with the population dataframe(pop)
pop.rename(columns ={'name':'country'}, inplace=True)
combined = pd.merge(pop,area, on='country')
combined.head()

Unnamed: 0,country,slug,value,date_of_information,ranking,region,country_size,area_rank
0,China,china,1413142846,2023 est.,1,East and Southeast Asia,9596960,5
1,India,india,1399179585,2023 est.,2,South Asia,3287263,8
2,United States,united-states,339665118,2023 est.,3,North America,9833517,4
3,Indonesia,indonesia,279476346,2023 est.,4,East and Southeast Asia,1904569,16
4,Pakistan,pakistan,247653551,2023 est.,5,South Asia,796095,37


In [24]:
#change country_size from string to numbers
import numpy as np
combined.country_size = combined.country_size.astype('int')

combined['density']= combined.value.div(combined.country_size).replace(np.inf,0)

In [25]:
# top 10 densely populated countries
combined.nlargest(10,'density')

Unnamed: 0,country,slug,value,date_of_information,ranking,region,country_size,area_rank,density
168,Macau,macau,639971,2023 est.,169,East and Southeast Asia,28,236,22856.107143
215,Monaco,monaco,31597,2023 est.,216,Europe,2,254,15798.5
112,Singapore,singapore,5975383,2023 est.,113,East and Southeast Asia,719,190,8310.685675
104,Hong Kong,hong-kong,7288167,2023 est.,105,East and Southeast Asia,1108,183,6577.768051
150,Gaza Strip,gaza-strip,2037744,2023 est.,151,Middle East,360,206,5660.4
216,Gibraltar,gibraltar,29629,2023 est.,217,Europe,7,244,4232.714286
154,Bahrain,bahrain,1553886,2023 est.,155,Middle East,760,187,2044.586842
173,Malta,malta,467138,2023 est.,174,Europe,316,208,1478.28481
201,Bermuda,bermuda,72576,2023 est.,202,North America,54,230,1344.0
209,Sint Maarten,sint-maarten,45677,2023 est.,210,Central America and the Caribbean,34,235,1343.441176


In [26]:
# bottom 10 densely populated countries
combined.nsmallest(10,'density')

Unnamed: 0,country,slug,value,date_of_information,ranking,region,country_size,area_rank,density
234,Holy See (Vatican City),holy-see-vatican-city,1000,2022 est.,235,Europe,0,257,0.0
205,Greenland,greenland,57777,2023 est.,206,North America,2166086,13,0.026673
228,Svalbard,svalbard,2926,January 2021 est.,229,Europe,62045,125,0.047159
227,Falkland Islands (Islas Malvinas),falkland-islands-islas-malvinas,3662,2021 est.,228,South America,12173,163,0.30083
236,Pitcairn Islands,pitcairn-islands,50,2021 est.,237,Australia and Oceania,47,233,1.06383
132,Mongolia,mongolia,3255468,2023 est.,133,East and Southeast Asia,1564116,20,2.081347
140,Namibia,namibia,2777232,2023 est.,141,Africa,824292,35,3.369233
53,Australia,australia,26461166,2023 est.,54,Australia and Oceania,7741220,7,3.418217
176,Iceland,iceland,360872,2023 est.,177,Europe,103000,108,3.503612
165,Guyana,guyana,791739,2023 est.,166,South America,214969,85,3.683038
