<title>Intro to Data Science by Blueprint</title>

<h1>Intro to Data Science by Blueprint</h1>


<h2>Import Libraries</h2>

In [1]:
# Ignore all warnings because we live dangerously like that
import warnings
warnings.filterwarnings('ignore')

import itertools # Iteration tools
import matplotlib.pyplot as plt # plotting library
import numpy as np # linear algebra, number processing
import pandas as pd # data processing, data frames. CSV file I/O (e.g. pd.read_csv)
import statsmodels.api as sm # statistic models i.e. linear regression
import geopandas as gpd # data processing with geospatial data
from geopandas.plotting import (plot_linestring, plot_point, norm_cmap) # Plotting geospatial data

# Bokeh is a visualization library that leverages JS and is similar to D3.js
from bokeh.io import show, output_notebook, push_notebook 
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool

from collections import defaultdict

# Additional imports for matplotlib
from matplotlib.lines import Line2D
from matplotlib.colors import Normalize
from matplotlib import cm
from sklearn import preprocessing # data preprocessing

<h2>Set up Notebook</h2>

In [2]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Configure Bokeh so that show() displays its plots inside this notebook
output_notebook()

# Default matplotlib styling, applied through rcParams to figures created
# after this cell has run
from pylab import rcParams
params = {
    'figure.figsize': (16, 10),  # default figure size as (width, height)
    'legend.fontsize': 15        # default font size for legend text
}
rcParams.update(params)

<h2>Define Helper Functions</h2>

In [3]:
def plot_polygon(ax, poly, facecolor='red', edgecolor='black', alpha=0.5, linewidth=1):
    """ Plot a single Polygon geometry

        The polygon is filled with a descartes ``PolygonPatch`` and then
        its exterior ring and every interior ring (hole) are traced as
        lines on top of the fill.

        Parameters
        ----------

        ax : matplotlib axes
            Axes to draw on.

        poly : Polygon
            Geometry exposing an ``exterior`` ring and an iterable of
            ``interiors`` rings, each with a ``coords`` sequence.

        facecolor : color (default 'red')
            Fill colour of the patch.

        edgecolor : color (default 'black')
            Colour of the ring outlines.

        alpha : float (default 0.5)
            Transparency of the filled patch.

        linewidth : float (default 1)
            Width of the ring outlines.
    """
    from descartes.patch import PolygonPatch
    # without Descartes, we could make a Patch of exterior
    ax.add_patch(PolygonPatch(poly, facecolor=facecolor, alpha=alpha))
    # Read the vertices through `.coords`: converting the ring object itself
    # with np.asarray() relies on an array interface that newer Shapely
    # releases no longer provide.
    rings = [poly.exterior] + list(poly.interiors)
    for ring in rings:
        xy = np.asarray(ring.coords)
        # Only the first two columns are drawn, so rings carrying a z
        # coordinate are handled as well.
        ax.plot(xy[:, 0], xy[:, 1], color=edgecolor, linewidth=linewidth)

def plot_multipolygon(ax, geom, facecolor='red', edgecolor='black', alpha=0.5, linewidth=1):
    """ Can safely call with either Polygon or Multipolygon geometry

        A Polygon is drawn directly; a MultiPolygon is drawn one member
        polygon at a time.  Any other geometry type is silently ignored.
        All styling arguments are forwarded unchanged to `plot_polygon`.
    """
    # `geom_type` is used instead of the deprecated `type` attribute.
    kind = geom.geom_type
    if kind == 'Polygon':
        polygons = [geom]
    elif kind == 'MultiPolygon':
        polygons = geom.geoms
    else:
        polygons = []
    for poly in polygons:
        plot_polygon(ax, poly, facecolor=facecolor, edgecolor=edgecolor, alpha=alpha, linewidth=linewidth)


def plot_geodataframe(s, column=None, colormap=None, alpha=0.5,
                   categorical=False, legend=False, axes=None, linewidth=1):
    """ Plot a GeoDataFrame

        Generate a plot of a GeoDataFrame with matplotlib.  If a
        column is specified, the plot coloring will be based on values
        in that column: the distinct values are sorted and each
        geometry is coloured by the rank of its value.  Otherwise every
        geometry in the `geometry` column is coloured by its position
        in the frame.

        Parameters
        ----------

        s : GeoDataFrame
            The GeoDataFrame to be plotted.  Currently Polygon,
            MultiPolygon, LineString, MultiLineString and Point
            geometries can be plotted; other geometry types are
            skipped.

        column : str (default None)
            The name of the column to be plotted.

        colormap : str (default 'Set1')
            The name of a colormap recognized by matplotlib.

        alpha : float (default 0.5)
            Alpha value for polygon fill regions.  Has no effect for
            lines or points.

        categorical : bool (default False)
            Currently unused; kept so existing callers keep working.

        legend : bool (default False)
            Plot a legend with up to five entries: the minimum, 25th,
            50th and 75th percentile and maximum of the sorted values.

        axes : matplotlib.pyplot.Artist (default None)
            axes on which to draw the plot

        linewidth : float (default 1)
            Width of the polygon outlines.


        Returns
        -------

        matplotlib axes instance
    """

    if colormap is None:
        colormap = 'Set1'

    if column is None:
        # No column to colour by: give every geometry its own colour index.
        categories = list(range(len(s)))
        values = list(categories)
    else:
        categories = sorted(set(s[column].values))
        # Map each distinct value to its rank among the sorted values.
        valuemap = dict((cat, rank) for rank, cat in enumerate(categories))
        values = [valuemap[k] for k in s[column]]

    if axes is None:
        fig = plt.gcf()
        fig.add_subplot(111, aspect='equal')
        ax = plt.gca()
    else:
        ax = axes

    if not values:
        # Nothing to draw, and a colour scale cannot be built from no values.
        return ax

    cmap = norm_cmap(values, colormap, Normalize, cm)
    for geom, value in zip(s.geometry, values):
        color = cmap.to_rgba(value)
        kind = geom.geom_type
        if kind == 'Polygon' or kind == 'MultiPolygon':
            plot_multipolygon(ax, geom, facecolor=color, alpha=alpha, linewidth=linewidth)
        elif kind == 'LineString':
            plot_linestring(ax, geom, color=color)
        elif kind == 'MultiLineString':
            # Only plot_linestring is imported in this notebook, so draw the
            # member lines one at a time.
            for line in geom.geoms:
                plot_linestring(ax, line, color=color)
        elif kind == 'Point':
            plot_point(ax, geom, color=color)

    if legend:
        size = len(categories)
        # Legend shows the min, 25th, 50th, 75th and max values.  Floor
        # division keeps the positions valid list indices on Python 3, and
        # the set removes repeated positions when there are few categories.
        positions = sorted(set([0, size // 4, size // 2, (3 * size) // 4, size - 1]))
        labels = [categories[pos] for pos in positions]
        patches = [Line2D([0], [0], linestyle="none",
                          marker="o", alpha=alpha,
                          markersize=15, markerfacecolor=cmap.to_rgba(pos))
                   for pos in positions]
        ax.legend(patches, labels, numpoints=1, loc='best')

    return ax

In [4]:
def plot_world_happiness_by_continent(world_happiness_df):
    """ Show a Bokeh scatter plot of happiness score against GDP per capita

        One set of circles is drawn per continent, each with its own
        colour and legend entry.  Continents are processed in sorted
        order so that a continent gets the same colour on every run.

        Parameters
        ----------

        world_happiness_df : DataFrame
            Must contain the columns 'geometry', 'continent',
            'Happiness Score' and 'Economy (GDP per Capita)'.
    """
    p = figure(plot_width=600, plot_height=400, tools=['pan' ,'box_zoom','reset', 'wheel_zoom'],
           title="Happiness vs GDP")
    colors = itertools.cycle(["#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"])
    # The drop/rename does not depend on the continent, so do it once
    # instead of once per loop iteration.
    plot_df = (world_happiness_df
               .drop('geometry', axis=1)
               .rename(columns={'Happiness Score': 'happiness_score'}))
    # Rows without a continent never matched the equality filter below, so
    # dropping them first only avoids an empty glyph.
    for continent in sorted(set(plot_df['continent'].dropna())):
        rows = plot_df.loc[plot_df['continent'] == continent]
        # next(colors) works on Python 2 and 3; colors.next() is Python 2 only.
        p.circle(rows['Economy (GDP per Capita)'], rows['happiness_score'], legend=continent, fill_color=next(colors), size=10)
    show(p)
    
def create_hover_tool():
    """ Build the HoverTool used by the scatter plots

        Each tooltip row pairs a display label with an ``@field``
        reference naming the data column whose value is shown.
    """
    tooltip_rows = (
        ("Country", "@Country"),
        ("Happiness Score", "@happiness_score"),
        ("GDP Per Capita", "@gdp_per_cap"),
        ("Continent", "@continent"),
        ("Region", "@Region"),
    )
    return HoverTool(tooltips=list(tooltip_rows))

def process_dataframe_for_bokeh(df):
    """ Return `df` without its 'geometry' column and with the
        'Happiness Score' column renamed to 'happiness_score'.

        The input frame itself is left unchanged.
    """
    column_renames = {'Happiness Score': 'happiness_score'}
    without_geometry = df.drop('geometry', axis=1)
    return without_geometry.rename(columns=column_renames)

<h2>Initial Exploration</h2>

In [5]:
# Read in the data


In [6]:
# Show the first 5 rows


<h2>Determining Important Features</h2>

In [7]:
# Keep columns that come after the 6th column (including the 6th)
# Retrieve a single column containing the happiness score


In [8]:
# Normalize values to be in between 0 and 1


In [9]:
# Preview the first rows of X. X is not defined anywhere in the visible cells,
# so this cell raises a NameError until the cells above are filled in to create it.
X.head()

NameError: name 'X' is not defined

In [None]:
# Run Linear regression using the ordinary least squares methods
# which minimizes the error between each point and our line
# The F-statistic looks at how well the model fits the data: if Prob (F-statistic)
# were above 0.05 we would reject the model; we have 2.33e-112, so it fits
# A high R-squared in general points to a good fit


<h1>Visualizing Data on World Map</h1>

We want map data so we can visualize how things appear on a global map. Let's use GeoPandas to help us with this task.

In [None]:
# Ignore outlier countries
# Calculate gdp per capita (notice gdp is in million dollars)


In [None]:
# What do the colours mean? no legend? no title?


In [None]:
# Use our helper function to add legend and thicken colors


In [None]:
# Merge data frames to get happiness data on to our dataset with map data


<h2>Happiness vs GDP Scatterplots</h2>

<h2>Happiness Grouped by Continent</h2>

In [None]:
# Group by continent and take the average score per continent


In [None]:
# create array of numbers from 0 to len(df) - 1
# plot scores at positions on plot
# set tick values to the continent names
