In [None]:
# Objective: Create pie charts, box plots, scatter plots, and bubble charts

import numpy as np
import pandas as pd


In [None]:
from js import fetch
import io

URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Canada.xlsx'
resp = await fetch(URL)
text = io.BytesIO((await resp.arrayBuffer()).to_py())

df_can = pd.read_excel(
    text,
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2)
print('Data downloaded and read into a dataframe!')

In [None]:
# Preview the first rows of the freshly loaded dataframe.
df_can.head()

In [None]:
# Clean up the dataset: drop columns we do not need (e.g. REG).
# NOTE: this cell mutates df_can in place, so it can only be run once per
# freshly loaded dataframe (a second run fails because the columns are gone).
df_can.drop(['AREA', 'REG', 'DEV', 'Type', 'Coverage'], axis=1, inplace=True)

# Rename the columns so that they make sense.
df_can.rename(columns={'OdName':'Country', 'AreaName':'Continent','RegName':'Region'}, inplace=True)

# For the sake of consistency, make all column labels strings
# (so the year columns can be addressed as '1980', '1981', ...).
df_can.columns = list(map(str, df_can.columns))

# Set the country name as index - useful for quickly looking up countries using .loc.
df_can.set_index('Country', inplace=True)

# Add a total column. numeric_only=True is required: the frame still holds
# non-numeric columns (e.g. Continent, Region), and pandas >= 2.0 no longer
# drops them silently when summing across a row - it raises a TypeError instead.
df_can['Total'] = df_can.sum(axis=1, numeric_only=True)

# Years that we will be using in this lesson - useful for plotting later on.
years = list(map(str, range(1980, 2014)))
print('data dimensions:', df_can.shape)

In [None]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

# Optional: switch to the ggplot-like style sheet for every figure below.
plt.style.use('ggplot')

# Report the installed Matplotlib version (the lesson expects >= 2.0.0).
print('Matplotlib version: ', mpl.__version__)

In [None]:
# Pie charts

# Group countries by continent.
# Note: groupby() on its own returns a GroupBy object; it cannot be used
# further until an aggregation (e.g. sum()) is applied to it.
# The deprecated `axis=0` keyword is dropped - rows are grouped by default.
continent_groups = df_can.groupby('Continent')
print(type(continent_groups))

# numeric_only=True restricts the aggregation to the numeric columns; newer
# pandas versions no longer discard non-numeric columns automatically.
df_continents = continent_groups.sum(numeric_only=True)

# Only rendered when it is the last expression of a cell; kept for interactive use.
df_continents.head()

df_continents['Total'].plot(kind='pie',
                            figsize=(5,6),
                            autopct='%1.1f%%', # add in percentages
                            startangle=90, # start angle 90 (Africa)
                            shadow=True, # add shadow
                           )

plt.title('Immigration to Canada by Continent [1980 - 2013]')
plt.axis('equal') # Sets the pie chart to look like a circle.

plt.show()

In [None]:
# The previous pie chart is not very clear: numbers and labels overlap in
# places. This version makes a few modifications to improve readability:
#
# 1. Remove the text labels from the wedges (labels=None) and show them in a
#    separate legend instead.
# 2. Push the percentages just outside the pie via the pctdistance parameter.
# 3. Use a custom set of colors for the continents via the colors parameter.
# 4. Explode some wedges to emphasize the smallest continents via the
#    explode parameter.

colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']

# Offset ratio per wedge, in the order of df_continents' index. Positions 0, 4
# and 5 are pulled out. NOTE(review): these are meant to be the three smallest
# continents - confirm against the index order of df_continents.
explode_list = [0.1, 0, 0, 0, 0.1, 0.1]

pie_ax = df_continents['Total'].plot(
    kind='pie',
    figsize=(15,6),
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
    labels=None,          # turn off labels on the pie chart itself
    pctdistance=1.12,     # ratio between the wedge center and the start of the autopct text
    colors=colors_list,   # custom colors
    explode=explode_list, # pull selected wedges outwards
)

# Raise the title by 12% so it clears the percentages pushed out by pctdistance.
pie_ax.set_title('Immigration to Canada by Continent [1980 - 2013]', y=1.12)
pie_ax.axis('equal')

# Continent names go into a legend instead of onto the wedges.
pie_ax.legend(labels=df_continents.index, loc='upper left')
plt.show()


In [1]:
# Box plots

# Yearly immigration from Japan as a single column, one row per year.
df_japan = df_can.loc[['Japan'], years].transpose()

# Only rendered when it is the last expression of a cell; kept for interactive use.
df_japan.head()

# figsize must be passed as a keyword argument: `figsize(8,6)` was a syntax
# error (a positional argument following a keyword argument).
df_japan.plot(kind='box', figsize=(8, 6))

plt.title('Box plot of Japanese Immigrants from 1980 - 2013')
plt.ylabel('Number of Immigrants')

plt.show()

NameError: name 'df_can' is not defined

In [None]:
# Horizontal box plots

# df_CI is not defined in any earlier cell, so build it here the same way
# df_japan is built: one column per country, one row per year.
# Assumes 'China' and 'India' are index labels of df_can - TODO confirm.
df_CI = df_can.loc[['China', 'India'], years].transpose()

# vert=False lays the boxes out horizontally.
df_CI.plot(kind='box', figsize=(10, 7), color='blue', vert=False)

plt.title('Box plots of Immigrants from China and India (1980 - 2013)')
plt.xlabel('Number of Immigrants')

plt.show()

In [None]:
# Subplots
# To visualize multiple plots together, create a figure (the overall canvas)
# and divide it into subplots, each containing one plot. With subplots we
# usually work with the artist layer (Axes methods) instead of the scripting
# layer (plt.* functions).

# Step 1: Create a figure. The size is set once here rather than on each
# plot call; the old `figssize=` typo on the line plot was passed through as
# an unknown plotting keyword and raised an error.
fig = plt.figure(figsize=(20, 6))

# Step 2: Create subplots
ax0 = fig.add_subplot(1, 2, 1) # add subplot 1 (1 row, 2 columns, first plot)
ax1 = fig.add_subplot(1, 2, 2) # add subplot 2 (1 row, 2 columns, second plot)

# Subplot 1: Box plot
df_CI.plot(kind='box', color='blue', vert=False, ax=ax0) # add to subplot 1
ax0.set_title('Box Plots of Immigrants from China and India (1980 - 2013)')
ax0.set_xlabel('Number of Immigrants')
ax0.set_ylabel('Countries')

# Subplot 2: Line plot
df_CI.plot(kind='line', ax=ax1) # add to subplot 2
ax1.set_title('Line Plots of Immigrants from China and India (1980 - 2013)')
ax1.set_ylabel('Number of Immigrants')
ax1.set_xlabel('Years')

plt.show()



In [None]:
# Question: Create a box plot to visualize the distribution of the top 15
# countries (based on total immigration) grouped by the decades 1980s, 1990s
# and 2000s.

# Step 1: the 15 countries with the largest 'Total'.
df_top15 = df_can.sort_values(['Total'], ascending=False, axis=0).head(15)

# Step 2: column labels for every year in the 80s, 90s and 00s.
years_80s = list(map(str, range(1980, 1990)))
years_90s = list(map(str, range(1990, 2000)))
years_00s = list(map(str, range(2000, 2010)))

# Step 3: slice df_top15 per decade and sum across the years, giving one
# series (indexed by country) per decade.
df_80s = df_top15.loc[:, years_80s].sum(axis=1)
df_90s = df_top15.loc[:, years_90s].sum(axis=1)
df_00s = df_top15.loc[:, years_00s].sum(axis=1)

# Step 4: combine the three series into one frame, one column per decade.
new_df = pd.DataFrame({'1980s' : df_80s, '1990s': df_90s, '2000s': df_00s})

# Step 5: draw the box plot the question asks for (one box per decade).
new_df.plot(kind='box', figsize=(10, 6))

plt.title('Immigration from top 15 countries for decades 80s, 90s and 2000s')
plt.ylabel('Number of Immigrants')

plt.show()

# Kept as the last expression so the table is still displayed.
new_df.head()



In [None]:
# Scatter plots

# Goal: visualize the trend of total immigration to Canada (all countries
# combined) for the years 1980 - 2013.

# Step 1: Get the dataset - sum every year column over all countries and
# turn the resulting series into a one-column dataframe.
yearly_totals = df_can[years].sum(axis=0)
df_tot = yearly_totals.to_frame()

# The year labels are strings; convert them to int (useful for the
# regression later on).
df_tot.index = [int(label) for label in df_tot.index]

# Move the years out of the index and into a regular column.
df_tot = df_tot.reset_index()

# Give both columns meaningful names.
df_tot.columns = ['year', 'total']

# View the final dataframe.
df_tot.head()


In [None]:
# Step 2: Plot the data - one point per year.
df_tot.plot(kind='scatter', x='year', y='total', figsize=(10,6), color='darkblue')

plt.title('Total Immigration to Canada from 1980 - 2013')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

# plt.show must be called; a bare `plt.show` only references the function
# and leaves its repr as the cell output.
plt.show()

In [None]:
# Fit a regression line: year on the x-axis, total on the y-axis.
x, y = df_tot['year'], df_tot['total']

# A degree-1 polynomial fit yields two coefficients; the plotting cell uses
# fit[0] as the slope and fit[1] as the intercept.
fit = np.polyfit(x, y, deg=1)

fit

In [None]:
# Scatter plot of the yearly totals with the fitted regression line on top.
scatter_ax = df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), color='darkblue')

scatter_ax.set_title('Total Immigration to Canada from 1980 - 2013')
scatter_ax.set_xlabel('Year')
scatter_ax.set_ylabel('Number of Immigrants')

# Line of best fit: slope * year + intercept (recall that x holds the years).
slope, intercept = fit[0], fit[1]
scatter_ax.plot(x, slope * x + intercept, color = 'red')

# Write the fitted equation onto the chart at data coordinates (2000, 150000).
scatter_ax.annotate('y={0:.0f} x + {1:.0f}'.format(slope, intercept), xy=(2000, 150000))

plt.show()

# Show the line of best fit as the cell's output value.
'No. Immigrants = {0:.0f} * Year + {1:.0f}'.format(slope, intercept)

In [None]:
# Bubble plots

# Step 1: Get the data. Transpose the year columns so that every country
# becomes a column and every year a row (Brazil and Argentina are selected
# from this frame in the following cells).
df_can_t = df_can[years].transpose()

# The year labels are strings; convert the index to int.
df_can_t.index = [int(label) for label in df_can_t.index]

# Name the index 'Year' so that it becomes the column name once the index is
# reset, then bring it in as a regular column.
df_can_t = df_can_t.rename_axis('Year').reset_index()

# View the changes.
df_can_t.head()

In [2]:
def _min_max_scale(series):
    """Rescale `series` linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    highest = series.max()
    return (series - lowest) / (highest - lowest)


# Min-max normalized yearly immigration, used as bubble weights later on.
norm_brazil = _min_max_scale(df_can_t['Brazil'])
norm_argentina = _min_max_scale(df_can_t['Argentina'])

NameError: name 'df_can_t' is not defined

In [None]:
# Bubble sizes: scale the normalized values (0..1) by 2000 and add 10 so the
# smallest bubble is still visible.
brazil_sizes = norm_brazil * 2000 + 10
argentina_sizes = norm_argentina * 2000 + 10

# Brazil: creates the axes that both series are drawn on.
ax0 = df_can_t.plot(
    kind='scatter',
    x='Year',
    y='Brazil',
    figsize=(14,8),
    alpha=0.5,          # transparency
    color='green',
    s=brazil_sizes,     # pass in weights
    xlim=(1975,2015),
)

# Argentina: drawn onto the same axes via ax=ax0.
ax1 = df_can_t.plot(
    kind='scatter',
    x='Year',
    y='Argentina',
    figsize=(14,8),
    alpha=0.5,          # transparency
    color='blue',
    s=argentina_sizes,  # pass in weights
    ax=ax0,
)

ax0.set_ylabel('Number of Immigrants')
ax0.set_title('Immigration from Brazil and Argentina from 1980 to 2013')
ax0.legend(['Brazil', 'Argentina'], loc='upper left', fontsize='x-large')