# Data Visualization Tutorial 4

## Load and Setup the Data

In [None]:
# import required library functions
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the 'Canada by Citizenship' sheet. The top 20 and bottom 2 rows are
# skipped because they do not contain relevant data.
df_canada = pd.read_excel(
    'data/canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)

# remove unnecessary columns
# in pandas axis=0 represents rows (default) and axis=1 represents columns.
df_canada.drop(['AREA', 'REG', 'DEV', 'Type', 'Coverage'], axis=1, inplace=True)

# rename some columns to make better sense
df_canada.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)

# convert all column names to strings so that the year columns can be
# selected with the string labels stored in `years` below
df_canada.columns = list(map(str, df_canada.columns))

# full range of the time series
years = list(map(str, range(1980, 2014)))

# add Total column
# Sum only the year columns. Summing every column (axis=1) would also pull in
# the non-year label columns (Country, Continent, Region, ...); pandas >= 2.0
# no longer drops non-numeric columns silently and raises a TypeError instead.
# NOTE(review): assumes the year columns are the only numeric columns left
# after the drop above -- confirm against the spreadsheet.
df_canada['Total'] = df_canada[years].sum(axis=1)

# index data by country
df_canada.set_index('Country', inplace=True)

## Visualize Frequency Distribution

### Visualize immigration into Canada from Scandinavia

In [None]:
# pick the three Scandinavian countries (rows) across the full range of
# years (columns), then preview the result
df_scandinavia = df_canada.loc[
    ['Denmark', 'Norway', 'Sweden'],
    years,
]

df_scandinavia.head()

In [None]:
# transpose once so that every country becomes a column (one series each)
scandinavia_by_year = df_scandinavia.transpose()

# split the values into 15 bins; the bin edges are reused as x-tick values
count, bin_edges = np.histogram(scandinavia_by_year, 15)

# generate histogram
hist_options = {
    'kind': 'hist',
    'figsize': (10, 6),
    'bins': 15,
    'alpha': 0.6,
    'xticks': bin_edges,
}
scandinavia_by_year.plot(**hist_options)

# annotate the figure
plt.title('Histogram of Immigration from Denmark, Norway, and Sweden from 1980 - 2013')
plt.xlabel('Number of Immigrants')
plt.ylabel('Number of Years')

plt.show()

## Visualize Data Distribution

### Visualize Box Plot of Immigration to Canada from India

In [None]:
# selecting with a list of labels (['India']) keeps the result a DataFrame;
# transposing it puts the years on the index and 'India' as the single column
df_india = df_canada.loc[['India'], years].T

df_india.head()

In [None]:
# show summary statistics for the India column before plotting its distribution
df_india.describe()

In [None]:
# draw the distribution of the India series as a box plot
box_plot_options = {'kind': 'box', 'figsize': (8, 6)}
df_india.plot(**box_plot_options)

# annotate the figure
plt.ylabel('Number of Immigrants')
plt.title('Box plot of Indian Immigrants from 1980 - 2013')

plt.show()

## Visualize Word Clouds

In [None]:
# special library for Word Clouds
from wordcloud import WordCloud, STOPWORDS

### Prepare word histogram data

In [None]:
# load list of common words to remove from consideration
stopwords = set(STOPWORDS)

# read the whole novel into a string; the `with` block guarantees the file
# handle is closed as soon as reading finishes (a bare open().read() leaves
# it open until the garbage collector gets to it)
with open('data/alice.txt', 'r') as novel_file:
    novel = novel_file.read()

# instantiate a word cloud object
novel_wc = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords,
)

# generate the word cloud data
novel_wc.generate(novel)

### Visualize the occurrence frequency of words in the novel

In [None]:
# display the word cloud
# draw the generated word cloud as an image, using bilinear interpolation
plt.imshow(novel_wc, interpolation='bilinear')
# hide the axes
plt.axis('off')
plt.show()

### Visualize the prominent countries contributing to immigration to Canada

In [None]:
# take the 'Total' column on its own: one value per row of df_canada
df_country = df_canada.loc[:, 'Total']

df_country.head()

In [None]:
# build an empty word cloud with a white background ...
wc_country = WordCloud(background_color='white')

# ... then size each entry of df_country by its frequency value
wc_country = wc_country.generate_from_frequencies(df_country)

In [None]:
# display the word cloud
# draw the country word cloud as an image, using bilinear interpolation
plt.imshow(wc_country, interpolation='bilinear')
# hide the axes
plt.axis('off')
plt.show()