# Lecture 3: Visualization

## Zhentao Shi


<img src="graph/Minard.png" width="1000">


## Graphs

* "One picture is worth ten thousand words".
* Modern graphs: web-based, interactive.

* Academia
* Journalism: Economist, SCMP, ...

In [None]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt

In [None]:
# Load the Acemoglu-Johnson-Robinson (AJR) example data.
d0 = pd.read_csv("data_example/AJR.csv")

# Scatter log GDP per capita (1995) against `avexpr`.
# In the AJR data `avexpr` is the average protection against expropriation
# risk (an index of institutional quality), so the x-axis is labelled with
# that meaning rather than "exploitation".
plt.scatter(d0['avexpr'], d0['logpgp95'])
plt.xlabel('average protection against expropriation risk')
plt.ylabel('log gdp per capita 1995')
plt.show()

In [None]:

# Load data_example/bank-full.csv; its fields are separated by ';'.
bank_0 = pd.read_csv("data_example/bank-full.csv", sep=";")

# Print the frame itself first, then its column labels.
for printable in (bank_0, bank_0.columns):
    print(printable)

In [None]:
# Scatter plot of balance against age, with an x-tick every 5 years.
plt.figure(figsize=(8,5))
plt.scatter(bank_0['age'], bank_0['balance'])

plt.xlabel('Age', fontsize=12, weight='bold')
plt.ylabel('Balance', fontsize=12, weight='bold')

# Set x-ticks to every 5 years for readability, starting at the youngest
# observed age and stopping at (or just before) the oldest.
age_min = int(bank_0['age'].min())
age_max = int(bank_0['age'].max())
plt.xticks(list(range(age_min, age_max + 1, 5)))
plt.show()

In [None]:
# Scatter plot with groups
import seaborn as sns

# Balance against age; `hue` gives each value of 'education' its own colour
# and alpha=0.5 draws the markers half-transparent.
scatter_options = dict(x='age', y='balance', hue='education', alpha=0.5)
sns.scatterplot(data=bank_0, **scatter_options)
plt.show()

In [None]:
# Grid of age-vs-balance scatter plots: one row per marital status and one
# column per education level, each panel annotated with its sample size.
import seaborn as sns
sns.set_theme(style='whitegrid', palette='muted')

# Create FacetGrid with fixed panel size and shared axes
g = sns.FacetGrid(bank_0, col='education', row='marital', height=3.5, aspect=1.2, sharex=True, sharey=True)
g.map_dataframe(sns.scatterplot, x='age', y='balance', color='steelblue', alpha=0.6, s=20)

# Set clear axis labels
g.set_axis_labels('Age', 'Balance')
# Make facet titles more descriptive
g.set_titles(col_template='Education: {col_name}', row_template='Marital: {row_name}')

# Annotate each facet with its sample size. The counts for every
# (marital, education) pair are computed once here instead of filtering the
# whole frame again for each panel.
facet_counts = bank_0.groupby(['marital', 'education']).size()
for i, row_val in enumerate(g.row_names):
    for j, col_val in enumerate(g.col_names):
        ax = g.axes[i, j]
        count = int(facet_counts.get((row_val, col_val), 0))  # 0 for an empty panel
        ax.text(0.02, 0.95, f'n={count}', transform=ax.transAxes, ha='left', va='top', fontsize=9, bbox=dict(facecolor='white', alpha=0.6, edgecolor='none'))

# Improve x-axis text: the same ticks (every 5 years) and font size on every panel
xmin = int(bank_0['age'].min())
xmax = int(bank_0['age'].max())
xticks = list(range(xmin, xmax+1, 5))
for ax in g.axes.flat:
    ax.set_xticks(xticks)
    ax.tick_params(axis='x', labelsize=9, labelrotation=0)

# Add overall title and adjust layout.
# tight_layout() recomputes the subplot margins, so it has to run BEFORE
# subplots_adjust(); in the reverse order the top margin reserved for the
# title would be overwritten.
g.fig.suptitle('Balance vs Age — by Education and Marital Status', fontsize=16)
plt.tight_layout()
g.fig.subplots_adjust(top=0.90)

plt.show()

In [None]:
# Bar plot of the number of rows at each age, with 'education' as hue.

sns.set_theme(style='whitegrid')
plt.figure(figsize=(10,5))

# countplot draws `x` on a categorical axis: the k-th category is drawn at
# position k, not at its age value. Pass the category order explicitly so the
# position of every age is known when the ticks are placed below.
ages = sorted(bank_0['age'].dropna().unique())
ax = sns.countplot(data=bank_0, x='age', hue='education', palette='muted', order=ages)

ax.set_xlabel('Age', fontsize=12, weight='bold')
ax.set_ylabel('Count', fontsize=12, weight='bold')

# Label one age in every 5 years, counted from the youngest age. Tick
# locations are bar positions (indices into `ages`); the labels are the ages
# themselves, set explicitly so location and label cannot drift apart.
tick_positions = [pos for pos, age in enumerate(ages) if (age - ages[0]) % 5 == 0]
ax.set_xticks(tick_positions)
ax.set_xticklabels([str(ages[pos]) for pos in tick_positions])
ax.tick_params(axis='x', labelsize=10, rotation=0, pad=6)

# Place the legend outside the axes, to the right of the plot
plt.legend(title='Education', fontsize=9, title_fontsize=10, bbox_to_anchor=(1.02,1), loc='upper left')
plt.tight_layout()
plt.show()

### Prepare Data for Graphs

In [None]:
# Load data_example/PWT100.csv (this rebinds d0).
d0 = pd.read_csv("data_example/PWT100.csv")

# Preview: the first five rows, then the column labels.
print(d0.head(n=5))
print(d0.columns)

In [None]:
# Keep four countries and the columns needed to compute GDP per capita.
selected_countries = ['CHN', 'RUS', 'JPN', 'USA']
selected_columns = ['countrycode', 'year', 'rgdpe', 'pop']

# Select rows and columns in a single .loc call and take an explicit copy, so
# that the column assignment below writes to an independent frame instead of
# a slice of d0 (which can raise pandas' SettingWithCopyWarning).
d1 = d0.loc[d0['countrycode'].isin(selected_countries), selected_columns].copy()

# Create new column 'gdpcapita' as rgdpe divided by pop
d1['gdpcapita'] = d1['rgdpe'] / d1['pop']

# Print the dataframe
print(d1)

In [None]:
# rgdpe against year as a scatter plot, one colour per 'countrycode'
rgdpe_axes = dict(x='year', y='rgdpe', hue='countrycode')
sns.scatterplot(data=d1, **rgdpe_axes)
plt.show()

In [None]:
# gdpcapita against year as a line plot, one line per 'countrycode'
line_axes = dict(x='year', y='gdpcapita', hue='countrycode')
sns.lineplot(data=d1, **line_axes)
plt.show()

In [None]:
# Reshape long -> wide: take the three columns of interest, then spread
# 'year' across the columns with 'pop' as the cell values.
#   index   -> row labels of the result
#   columns -> column labels of the result
#   values  -> entries of the result
s1 = (
    d1.loc[:, ['countrycode', 'year', 'pop']]
    .pivot(index='countrycode', columns='year', values='pop')
)

print(s1)

In [None]:
# Reshape wide -> long: turn 'countrycode' back into a regular column, then
# gather the per-year columns into ('year', 'pop') pairs. Each country appears
# once for every year column.
s1 = pd.melt(s1.reset_index(), id_vars='countrycode', var_name='year', value_name='pop')

print(s1)

## Interactive Graphs

* [Plotly Express](https://plotly.com/graphing-libraries/)
  * [Youtube](https://www.youtube.com/watch?v=_b2KXL0wHQg)


In [None]:
import pandas as pd
import plotly.express as px

# Interactive version of the AJR scatter plot (this rebinds d0 to the AJR data).
d0 = pd.read_csv("data_example/AJR.csv")
fig = px.scatter(data_frame=d0, x='avexpr', y='logpgp95', title='Scatter plot')
fig.show()




* Shiny for Python [posit](https://shiny.posit.co/py/docs/overview.html): [gallery](https://shiny.posit.co/py/gallery/)
* [Shiny Express](https://shiny.posit.co/blog/posts/shiny-express/)
  * Demo: [Shenzhen housing](https://zhentao-shi.shinyapps.io/ShenzhenHousing-Shiny/)
  * Web scraper: `data_example/Scrape_Lianjia.ipynb`
