# Data Visualization Tutorial 1

## Import Required Libraries

In [None]:
# import required library functions
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

## Load UN Canadian Immigration Data
### The data covers yearly, country-wise immigration into Canada from around the globe between 1980 and 2013

In [None]:
# Read the 'Canada by Citizenship' sheet of the workbook. The first 20 rows
# and the last 2 rows are skipped as they do not contain relevant data.
df_canada = pd.read_excel(
    'data/canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=20,  # an int skips that many leading rows, same as range(20)
    skipfooter=2,
)

## Explore Dimensions of the Data

In [None]:
# top 5 rows: head() with no argument returns the first five rows, a quick
# way to check the column layout right after loading
df_canada.head()

In [None]:
# bottom 5 rows: tail() with no argument returns the last five rows, useful
# to confirm that no footer rows were left in the data
df_canada.tail()

In [None]:
# schema information: info() prints every column with its non-null count and
# dtype, plus the memory usage of the frame
df_canada.info()

In [None]:
# size of data frame as a (number of rows, number of columns) tuple
df_canada.shape

## Clean and Prepare the Data

In [None]:
# show the column labels and the row index as plain Python lists; print() is
# required because a notebook cell only echoes its last expression, so a bare
# .tolist() call in the middle of a cell is silently discarded
print(df_canada.columns.tolist())
print(df_canada.index.tolist())

# remove unnecessary columns
# (drop(columns=...) is the explicit spelling of drop(..., axis=1))
df_canada.drop(columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'], inplace=True)

# rename some columns to make better sense
df_canada.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)

In [None]:
# convert all column names to strings so that the year columns can be
# addressed uniformly as '1980' ... '2013'
df_canada.columns = [str(label) for label in df_canada.columns]

# full range of the time series, as column labels
years = [str(year) for year in range(1980, 2014)]

# add Total column: sum the year columns only. A bare df_canada.sum(axis=1)
# would also run over the text columns (Country, Continent, Region, ...);
# current pandas versions no longer skip non-numeric columns silently and
# raise TypeError there. Restricting the sum to `years` also keeps the cell
# safe to re-run, because an existing 'Total' is never added into itself.
df_canada['Total'] = df_canada[years].sum(axis=1)

In [None]:
# size of data frame as a (number of rows, number of columns) tuple
df_canada.shape

## Validate Data

In [None]:
# check for missing data: number of null values per column. Printed
# explicitly because a notebook cell only echoes its last expression, so the
# bare call would otherwise produce no output.
print(df_canada.isnull().sum())

# data numerical summary
df_canada.describe()

## Index the Data
### Enables lookup by country

In [None]:
# index data by country: the 'Country' column becomes the row index, so a
# country's row can be selected by name with df_canada.loc['<country>']
df_canada.set_index('Country', inplace=True)

In [None]:
# print sample data: the full row for one country, selected by index label
print(df_canada.loc['Japan'])
# a single value can be read the same way; the year must be given as a string
# because the column names were converted to strings:
#print(df_canada.loc['Japan', '2000'])

## Visualize Continuous Time Series

### Visualize immigration from Haiti to Canada from 1980 to 2013.

In [None]:
# immigration pattern for Haiti: one value per year of the time series
haiti_by_year = df_canada.loc['Haiti', years]

# draw the line chart and label it through the Axes that plot() returns
ax = haiti_by_year.plot(kind='line')
ax.set_title('Immigration from Haiti')
ax.set_ylabel('Number of immigrants')
ax.set_xlabel('Years')

plt.show()

### Visualize immigration from India and China to Canada from 1980 to 2013.

In [None]:
# immigration pattern for India and China: the selection has countries as
# rows and years as columns, so it is flipped to draw one line per country
# with the years along the x axis
df_IndiaChina = df_canada.loc[['India', 'China'], years]
ax = df_IndiaChina.T.plot(kind='line')
ax.set_title('Immigration from India and China')
ax.set_ylabel('Number of immigrants')
ax.set_xlabel('Years')

plt.show()

## Visualize Discrete Data

### Visualize the top 5 contributors to immigration into Canada

In [None]:
# order the countries from the largest to the smallest Total
df_canada.sort_values(by='Total', axis=0, ascending=False, inplace=True)

# pick top 5 contributors: the 'Total' values of the first five rows
df_top5 = df_canada['Total'].head(5)

# print data.
df_top5.head()

In [None]:
# bar chart of the top 5 totals, one bar per country; labels are set through
# the Axes that plot() returns
ax = df_top5.plot(kind='bar')
ax.set_title('Top 5 Immigration Contributors')
ax.set_ylabel('Number of immigrants')
ax.set_xlabel('Countries')

plt.show()

## Assignment

### Visualize total year-wise immigration into Canada from all sources, from 1980 to 2013