# Data Visualization

In [None]:
# uv venv 
# uv pip install pandas matplotlib seaborn scipy scikit-learn statsmodels palmerpenguins pip ipykernel ipywidgets setuptools jinja2 vega_datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy, sklearn, statsmodels
from palmerpenguins import load_penguins

## Data loading
Read the data in the `gapminder.csv` file.

In [None]:
# Load the Gapminder table from the CSV file (path is relative to the working directory).
gapminder = pd.read_csv("gapminder.csv")
# A bare name as the last expression makes Jupyter render the frame.
gapminder

## Line plot
Generate a line plot of the life expectancy, over time, for a country of your choice. 
<br>What do you notice?

In [None]:
# List the distinct country names so one can be picked for the line plot.
gapminder['country'].unique()

In [None]:
# Keep only the United Kingdom rows, then draw life expectancy against year.
uk = gapminder.loc[gapminder['country'] == 'United Kingdom']
uk.plot(x='year', y='lifeExp')
# Equivalent call using matplotlib directly:
# plt.plot(uk['year'], uk['lifeExp'])

Generate a line plot of the world population over time.
<br>What do you notice?

In [None]:
# Add up the population of all countries within each year: one world total per year.
world_pop = gapminder.groupby('year')['pop'].sum()
# The grouped result is indexed by year, so a plain Series plot puts year on the x axis.
world_pop.plot()
# Equivalent call using matplotlib directly:
# plt.plot(world_pop.index, world_pop)

## Scatter plot

Generate a scatter plot of the life expectancy (vertical axis) versus the GDP per capita (horizontal axis).
<br>What do you notice?
<br>What happens if you use `plt.plot` instead of `plt.scatter`?

In [None]:
# One marker per row: GDP per capita on the horizontal axis, life expectancy on the vertical.
gapminder.plot.scatter(x='gdpPercap', y='lifeExp')
# Equivalent call using matplotlib directly:
# plt.scatter(gapminder['gdpPercap'], gapminder['lifeExp'])

In [None]:
# Same scatter plot, but with a logarithmic horizontal axis.
ax = gapminder.plot.scatter(x='gdpPercap', y='lifeExp')
# Switch the x axis of the Axes just drawn to a log scale.
ax.set_xscale('log')

In [None]:
# Line plot of the same two columns: consecutive rows of the table are joined by line segments.
gapminder.plot.line(x='gdpPercap', y='lifeExp')

## Histogram
Plot a histogram and the density of the life expectancy for the whole dataset.
<br>What happens if you omit `density=True`?

In [None]:
# density=True rescales the bars to unit total area, so they share a scale
# with the kernel density estimate drawn on the same Axes.
ax = gapminder['lifeExp'].plot.hist(density=True)
gapminder['lifeExp'].plot.density(ax=ax)

In [None]:
# Without density=True the bars show raw counts, while the density curve is
# still drawn on the same Axes; compare with the normalised version.
ax = gapminder['lifeExp'].plot.hist()
gapminder['lifeExp'].plot.density(ax=ax)

Generate histograms of the life expectancy for the first and last dates of the data. 
<br>
How do they differ?

In [None]:
# Earliest and latest years present in the data.
years = gapminder['year']
first_date, last_date = years.min(), years.max()

# Life expectancy restricted to each of those two years.
life_first = gapminder.loc[years == first_date, 'lifeExp']
life_last = gapminder.loc[years == last_date, 'lifeExp']

# Both normalised histograms first, then both density curves, all on one Axes.
ax = life_first.plot.hist(density=True)
life_last.plot.hist(density=True, ax=ax)

life_first.plot.density(ax=ax)
life_last.plot.density(ax=ax)

Generate a histogram of the population, for the whole dataset. 
<br>
How does it compare with the previous histogram?

In [None]:
# Histogram (raw counts) of the population column over every row of the dataset.
gapminder['pop'].plot.hist()

## Barplot
Generate a bar chart of the population by continent, for the last date available.

In [None]:
# Boolean mask selecting the rows of the most recent year.
# `i` is also read by the per-country bar chart cells that follow.
last_date = gapminder['year'].max()
i = gapminder['year'] == last_date

# Total population per continent for that year, smallest first, as horizontal bars.
(
    gapminder[i]
    .groupby('continent')['pop']
    .sum()
    .sort_values()
    .plot.barh()
)

Generate a bar chart of the population by country, for the last date available.

In [None]:
# One horizontal bar per country for the last year (`i` is the last-year mask from the previous cell).
gapminder[i].plot.barh(x='country', y='pop')

In [None]:
# Sort the last-year rows by population and keep the final 20, i.e. the 20 largest.
top20 = gapminder[i].sort_values('pop').tail(20)
top20.plot.barh(x='country', y='pop')

## Nicer plots
Pick one of the earlier plots, label the axes, and add a title.
<br>Change the colour and thickness of the lines.
<br>Change the size, shape, and opacity of the markers.
<br>Add a legend.

In [None]:
# Life expectancy for the earliest and the latest year in the data.
years = gapminder['year']
first_date, last_date = years.min(), years.max()
life_first = gapminder.loc[years == first_date, 'lifeExp']
life_last = gapminder.loc[years == last_date, 'lifeExp']

# Two side-by-side panels: default styling on the left, customised on the right.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(8, 4), layout='constrained', dpi=300)

# Left panel: pandas defaults, histograms first and density curves on top.
for series in (life_first, life_last):
    series.plot.hist(density=True, ax=ax_before)
for series in (life_first, life_last):
    series.plot.density(ax=ax_before)

# Right panel: (data, legend label, bar fill colour, outline / curve colour) per year.
panels = [
    (life_first, first_date, 'lightblue', 'tab:blue'),
    (life_last, last_date, 'pink', 'tab:red'),
]
# Semi-transparent bars so that the two years remain visible where they overlap.
for series, _, face, edge in panels:
    ax_after.hist(series, density=True, facecolor=face, edgecolor=edge, alpha=.5)
# Thick density curves carry the legend entries.
for series, year, _, edge in panels:
    series.plot.density(color=edge, linewidth=5, ax=ax_after, label=year)

ax_after.legend()
ax_after.set_xlabel("Life expectancy (years)")
# Remove the y label, the y ticks and every spine except the bottom one.
ax_after.set_ylabel(None)
ax_after.set_yticks([])
ax_after.spines[['left', 'top', 'right']].set_visible(False)

ax_before.set_title("Before")
ax_after.set_title("After")

plt.show()

## Pair plot
Generate a pair plot for the data.
<br>Do you notice anything?

In [None]:
# INSERT CODE

## Extra exercise
Find other interesting visualizations of this dataset

In [None]:
# INSERT CODE

## Extra exercise
Find interesting visualizations of the penguin dataset. 



In [None]:
# Load the penguins dataset through palmerpenguins.load_penguins (imported at the top).
penguins = load_penguins()
# A bare name as the last expression makes Jupyter render the result.
penguins

In [None]:
# INSERT CODE