# Week 4 - NumPy and Pandas

In [None]:
import pandas as pd
import numpy as np
import random

## NumPy
* NumPy gives Python fast, homogeneous n-dimensional arrays.
* NumPy arrays are distinct from Python's base `list`

In [None]:
# Python's built-in `list` is happy to hold heterogeneous values:
# here a bool, an int, a str and a dict all share one container.
python_list = [
    True,
    0,
    'my string',
    {1: 2, 3: 4},
]

In [None]:
# NumPy arrays are most performant when they are homogeneous,
# i.e. when every element shares one dtype.
numpy_arr = np.array([1, 2, 3, 4, 5])
# Show the values first, then the dtype NumPy inferred for them.
print(numpy_arr, numpy_arr.dtype, sep="\n")

In [None]:
# A NumPy array has exactly one dtype. When the elements share no common
# native type (here: a bool, an int, a str and a dict, taken from
# `python_list` defined in an earlier cell), NumPy falls back to the generic
# 'object' dtype. :(
# Note that mixing only numeric types does not do this; those are upcast
# instead (e.g. ints mixed with floats become floats).
# (Object arrays are OK, but we use NumPy for fast things, not objects)
numpy_fail = np.array(python_list)
print(numpy_fail.dtype)

In [None]:
# Arithmetic applies to every element of a NumPy array in one step,
# with no explicit Python loop.
my_arr = np.arange(100)
print("Just some integers: ", my_arr[:10], sep="")

# In-place addition shifts all 100 values at once.
my_arr += 10
print("Integers plus 10: ", my_arr[:10], sep="")

# In-place modulo likewise rewrites the whole array.
my_arr %= 3
print("Integers plus 10 mod 3: ", my_arr[:10], sep="")

In [None]:
# NumPy also gives you some helper functions
# Draw ten random integers in [-1000, 1000] and summarise them.
my_arr = np.array([random.randint(-1000, 1000) for _ in range(10)])
print(my_arr)
print(my_arr.max())
print(my_arr.min())
print(np.average(my_arr))
print(np.median(my_arr))
print(np.cumsum(my_arr))
# The running product of ten values of magnitude up to 1000 can reach ~1e30,
# far beyond what a 64-bit integer can hold (~9.2e18). Fixed-width integers
# wrap around silently, so accumulate in float64 to get meaningful
# (if approximate) magnitudes instead of garbage.
running_product = np.cumprod(my_arr, dtype=np.float64)
print(running_product)


In [None]:
# Arrays are not limited to one dimension.
# Stack ten rows of ten consecutive integers (0-9, 10-19, ..., 90-99)
# into a single 10x10 array.
arr_2d = np.array([np.arange(lo, lo + 10) for lo in range(0, 100, 10)])
print(arr_2d)

In [None]:
# The same 100 values of `arr_2d` can be viewed under any shape whose
# dimensions multiply to 100; print it as 20x5 and then as 2x50.
for new_shape in ((20, 5), (2, 50)):
    print(arr_2d.reshape(new_shape))

In [None]:
# We can do slices on 1d and 2d numpy arrays
# 1-D: `[5:]` keeps everything from index 5 to the end.
print(np.arange(10)[5:])
# 2-D: one slice per axis, separated by a comma. `[5:, 5:]` keeps rows 5+
# and columns 5+, i.e. the lower-right 5x5 corner of the 10x10 grid.
print(np.arange(100).reshape(10, 10)[5:,5:])

In [None]:
# We can also do slices on n-dimensional arrays.
# (This gets confusing, but is *extremely* useful for high-dimensioned
#  datasets)
# reshape(20, 5, 10) produces three axes, so this is a 3-D array and the
# slice below needs three components: one per axis.
arr_3d = np.arange(1000).reshape(20, 5, 10)
print(arr_3d[9:, 3:, 7:])

In [None]:
# Copying matters: np.copy gives `b` its own data, so the in-place
# update of `a` below is not reflected in `b`.
a = np.arange(10)
b = np.copy(a)
a += 10  # changes `a` only
# `a` now holds 10..19 while `b` still holds the original 0..9.
print(a, b, sep="\n")

In [None]:
# Universal functions ("ufuncs") apply an operation element-wise
# across a whole array.
a = np.arange(10)

# Plain arithmetic operators work element-wise: scale by 10, then shift by 5.
b = 5 + a * 10
print(b)

# So do NumPy's math functions, here the sine of each element divided by pi.
c = np.sin(a / np.pi)
print(c)

## Pandas
* Allows us to work on labeled data very quickly
* Mirrors a lot of the practices of core Python, but with a focus on data
* Has a very *fast* backend to perform operations much quicker than our naive approaches

In [None]:
# A pandas 'Series' can be thought of as a one-dimensional NumPy array
# with labels attached to the values.
# Notice how 's' is indexed in the output below: no index was passed in,
# so pandas supplies a default integer index next to the values.
s = pd.Series(np.arange(10, 20))
s

In [None]:
# We can do similar array-level operations as NumPy
# NOTE: both operations below modify `s` in place, so re-running this cell
# without first re-running the cell that creates `s` applies them again.
s += 10
print(s)
# True division yields floating-point values, so the printed dtype changes
# from an integer dtype to float64 here.
s /= 5
print(s)

In [None]:
# The DataFrame is the central pandas object.
# Think of it as an interactive SQL table: you can select columns, index and
# reindex the rows, run groupby operations, and much more.

my_df = pd.DataFrame(
    {
        'name': ('Tyler', 'Ben'),
        'school': ('UIUC', 'UIUC'),
        'likes_data': (True, True),
    }
)
# Print the table, then its per-column dtypes: each column is a Series,
# and Series carry dtypes just like NumPy arrays do.
print(my_df, my_df.dtypes, sep="\n")

In [None]:
# Build a synthetic table of students to experiment with in the cells below.
n_students = 100
other_df = pd.DataFrame({
        # Sample IDs without replacement from 0..10000. Independent randint
        # draws can repeat, and a repeated ID makes a poor key once the
        # column is used as the index.
        'student_id': random.sample(range(10001), n_students),
        # random.random() is in [0, 1), so GPAs fall in [0, 4).
        'gpa': [random.random() * 4 for _ in range(n_students)],
        'major': [random.choice(['CS', 'CS+Math', 'Stats']) for _ in range(n_students)],
        'in_state': [random.choice([True, False]) for _ in range(n_students)]
    })
# Peek at the first few rows rather than dumping the whole frame.
other_df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of `other_df`.
other_df.describe()

In [None]:
# We can drill-down into columns easily
# Attribute access and bracket access select the same column. Only the
# value of the last expression in a cell is rendered, so the output shown
# comes from the bracket form.
other_df.gpa
other_df['gpa']

In [None]:
# Rows can be filtered by column values: build a boolean mask with a
# comparison, then index the frame with that mask.
is_cs = other_df['major'] == 'CS'
has_high_gpa = other_df['gpa'] > 3
print(other_df[is_cs])
print(other_df[has_high_gpa])

In [None]:
# We can also combine this with a groupby to get more interesting stats
# Select the 'gpa' column *before* aggregating: this averages only the
# column we care about, instead of averaging every column (student IDs
# included) and discarding the rest, and it keeps working if a non-numeric
# column is later added to the frame.
print(other_df.groupby('major')['gpa'].mean())

In [None]:
# We can combine selections too
# Masks are combined element-wise with `&` (and) / `|` (or). Each comparison
# must be wrapped in parentheses because `&` binds more tightly than `>`
# and `==` in Python.
other_df[(other_df.gpa > 3) & (other_df.major == 'Stats')]

In [None]:
# It can sometimes be useful to alter the index of a dataframe
# set_index returns a new DataFrame indexed by 'student_id'. The result is
# only displayed here, not assigned, so `other_df` itself keeps its
# original index.
other_df.set_index('student_id')

## Challenges:

In [None]:
# Load the Titanic passenger dataset used by the challenges below.
# The CSV is fetched over the network from the seaborn-data repository.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv'
titanic_df = pd.read_csv(TITANIC_CSV_URL)
# Preview the first rows to see which columns are available.
titanic_df.head()

1) Calculate the survival rates of passengers by class (First, Second, Third)


2) Calculate the average fare paid by those who survived compared to the fare paid by those who didn't


3) Plot the ages of the female survivors that embarked at Cherbourg