# <span style="color:red"> Introduction to Python, Part 4 </span>

In [19]:
# !pip install datascience   # You need to un-comment this line when you
                             # run the notebook on Azure

In [20]:
# Import code libraries or "modules" in Python lingo

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import sys
from datascience import *
sns.set_style("whitegrid")
%matplotlib inline

A summary of the methods of the Table class is [here](http://data8.org/datascience/tables.html) <br>
A tutorial for the datascience module is [here](http://data8.org/datascience/tutorial.html) <br>
A cheatsheet for the datascience module is [here](https://github.com/wstuetzle/STAT180/blob/master/Computing/data8_sp17_midterm_ref_sheet.pdf)

**Note**

If you want to run this notebook on Azure, it's easiest to upload the notebook and the file "datascience_extensions.py" (on Canvas) and then replace the cell below with a code cell containing the single line

from datascience_extensions import * 

In [21]:
# Import some new methods for class Table from the local module
# "datascience_extensions".
#
# NOTE: course_dir is an absolute path on the author's machine. Edit it
# (or replace this cell, as described in the note above) before running
# the notebook anywhere else.

course_dir = "/Users/wxs/Dropbox/IDS/Git-reps/STAT180/"
computing_dir = course_dir + "Computing"

# Add the directory to the module search path, but only once, so that
# re-running this cell does not append duplicate entries
if computing_dir not in sys.path:
    sys.path.append(computing_dir)

from datascience_extensions import *

# Reload the extensions after we make a change
# Importing it again does not work - a module is imported only once
module_name = "datascience_extensions"
importlib.reload(sys.modules[module_name])

<module 'datascience_extensions' from '/Users/wxs/Dropbox/IDS/Git-reps/STAT180/Lectures/datascience_extensions.py'>

In [22]:
# Read titanic3-table.csv from the course GitHub repository
titanic_table_url = "https://github.com/wstuetzle/STAT180/raw/master/Data/titanic3-table.csv"
titanic = Table.read_table(titanic_table_url)
titanic.show(2)    # Display the first 2 rows
titanic.shape()    # [number of rows, number of columns]

pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.338,B5,S,2,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


[1309, 14]

In [23]:
# Let's make a subset of the Titanic data that has only the variables
# of interest to us, and only complete rows

# drop() removes the nine listed columns, leaving 5 of the original 14
titanic_sub = titanic.drop("name", "sibsp", "parch", "ticket", "fare", "cabin", "boat", 
                           "body", "home.dest").take_complete_rows()
titanic_sub.show(3)
titanic_sub.shape()   # Fewer rows than titanic: incomplete rows are gone

pclass,survived,sex,age,embarked
1,1,female,29.0,S
1,1,male,0.9167,S
1,0,female,2.0,S


[1044, 5]

### <span style="color:blue"> For statements </span>

Perform a computation for every element in a list or array.

In [24]:
# Example: loop over the integers 0, 1, ..., 4 stored in a numpy array

sequence = np.arange(5)
for index in sequence:
    print(index)       # Body of the loop: executed once per element
print("Done")          # Not indented, so it runs once, after the loop ends

0
1
2
3
4
Done


In [25]:
# Example: loop over the elements of a list of strings

weekdays = ["M", "T", "W", "Th", "F"]
for day in weekdays:
    print(day)         # "day" takes each list element in turn

M
T
W
Th
F


In [26]:
type(weekdays)

list

**Note:** The indented statements below the "for ...." line are called the body of the loop. The body has to be indented (tab or 4 spaces). A non-indented statement indicates the end of the body.

In [27]:
# Example: For the titanic data, compute the conditional distribution
# of survived, given pclass

# Table of counts: one row per value of pclass. Column 0 holds the pclass
# values; the remaining columns hold the counts for each value of survived
cond_dist = titanic_sub.pivot("survived", "pclass")
num_rows = cond_dist.num_rows
num_columns = cond_dist.num_columns
column_labels = cond_dist.labels
row_sum = np.full(num_rows, 0)  # Makes an array with num_rows elements, 
                                # all equal to 0

# Add up the count columns (column 0 is skipped) to get the row totals
for i in np.arange(1, num_columns):
    row_sum = row_sum + cond_dist.column(i)

# Replace each count column by counts divided by row totals, so that
# the entries in each row add up to 1
for column_index in np.arange(1, num_columns):
    the_column = cond_dist.column(column_index)
    rel_freq = the_column / row_sum
    cond_dist = cond_dist.with_column(column_labels[column_index], rel_freq)
    
conditional_given_pclass = cond_dist
conditional_given_pclass

pclass,0.0,1.0
1,0.365248,0.634752
2,0.559387,0.440613
3,0.738523,0.261477


### <span style="color:blue"> Defining functions </span>

Functions are a means of packaging computations. Above is a block of Python code that computes the conditional distributions of survived, given pclass. We might also be interested in the conditional distributions of survived, given sex, or the conditional distributions of sex given pclass, or....

The code to calculate any of these would be essentially the same - only the names of the features would change.
We could copy the code block and change the feature names, but that would be error prone and needlessly repetitive.

In [28]:
# Here is a simple function definition

def double(arg):
    '''Return twice the argument (2 * arg)'''
    return 2 * arg

double(5)
    

10

In [37]:
# Show signature and docstring
double?

In [42]:
# A more complex example

def double_with_twist(x):
    '''Return 2 * x, and show that a function has its own symbol table.

    The names "x" and "a" are assigned inside the function and their
    local values are printed; the result is computed from the value of
    x that was passed in.
    '''
    a = x                     # local "a" holds the value passed in
    result = 2 * x            # computed before "x" is rebound below
    x = "changed argument"    # rebinds the local name "x"
    print("x inside function = " + str(x),
          "a inside function = " + str(a),
          sep="\n")
    return result

In [43]:
a = 7
x = 5
y = double_with_twist(x)
print("y = " + str(y))
print("a = " + str(a))
print("x = " + str(x))

x inside function = changed argument
a inside function = 5
y = 10
a = 7
x = 5


**Note**<br> Even though x and a were assigned values "changed argument" and 5 inside the function, that did not change the global values of x and a.

**Conceptually** <br>

* The function "double_with_twist" has its own symbol table containing variables named "x", "a", "result".
* When "double_with_twist" is called, the value of "x" is set to the value passed in by the function call (5, in our case)

* The statements in the body of "double_with_twist" are executed

* The result is passed back, and the values of the symbols in the symbol table of "double_with_twist" are forgotten




In [32]:
# An example for what not to do

def multiply_by_factor(arg):
    '''Multiply arg by the free variable "factor" (deliberately bad style).

    "factor" is neither an argument nor assigned inside the function, so
    its value has to come from outside the function when the call is made.
    '''
    return factor * arg

In [33]:
# x = 5
# result = multiply_by_factor(x)
# print(result)

# That bombs because factor is not an argument and is not defined inside the function
# The variable named "factor" is a "free variable" in "multiply_by_factor"

In [34]:
# Let's define a global variable named "factor"

x = 5
factor = 10
result = multiply_by_factor(x)
print(result)


50


**What's happening?** <br>
The value of "factor" in the symbol table of "multiply_by_factor" is undefined. So the interpreter looks for a variable named "factor" in the global symbol table and uses its value.

This is **bad**: The result returned by a function should depend only on its arguments.

**Note** There are exceptions, and the explanation is only correct if all the functions are defined in the global environment - no functions defined within functions.

### <span style="color:blue"> A more realistic example for a function definition </span>

Let's package the computation of conditional distributions into a function

In [35]:
def conditional_distribution(table, feature, given_feature):
    '''Conditional distributions of one categorical feature, given another.

    table          the dataset
    feature        label of the feature whose distribution is wanted
    given_feature  label of the feature that is conditioned on

    Returns the pivot table of feature by given_feature in which every
    column after the first has been divided by the row totals, so the
    entries in each row are relative frequencies.
    '''
    counts = table.pivot(feature, given_feature)
    labels = counts.labels
    # Column 0 is skipped in both loops below; only the remaining
    # columns are summed and rescaled
    value_columns = range(1, counts.num_columns)

    # Row totals over all value columns
    totals = np.zeros(counts.num_rows, dtype=int)
    for j in value_columns:
        totals = totals + counts.column(j)

    # Replace each column of counts by counts / row totals
    result = counts
    for j in value_columns:
        result = result.with_column(labels[j], result.column(j) / totals)
    return result


In [36]:
conditional_distribution(titanic_sub, "survived", "pclass")

pclass,0.0,1.0
1,0.365248,0.634752
2,0.559387,0.440613
3,0.738523,0.261477
