# Python Libraries 1

_October 28, 2020_ 

Agenda today:
- Introduction to Numpy: array math
- Introduction to Pandas: importing, indexing, and math

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Part I. Numpy
The basic data structure in NumPy is the NumPy array. NumPy arrays are very similar to Python lists. The __difference__ between a Python list and a NumPy array is that a list can hold a mix of data types, whereas an array can only contain a single data type.

But what are the benefits of using NumPy arrays instead of base Python lists?
- Speed 
- Broadcasting Property

In [None]:
# A plain Python list and the equivalent NumPy array holding the same strings.
names_list = ['Bob', 'John', 'Sally']
# np.array works for strings as well as numbers (it picks a fixed-width unicode
# dtype). The legacy np.char.array / chararray type exists only for backwards
# compatibility and is not recommended for new code.
names_array = np.array(['Bob', 'John', 'Sally'])
print(names_list)
print(names_array)

In [None]:
# what happens if I put characters and integers in the same array?

In [None]:
import time

size_of_seq = 100000

def pure_python_version(size=None):
    """Time an element-wise sum of two integer sequences in pure Python.

    Parameters
    ----------
    size : int, optional
        Number of elements to add. When omitted, the notebook-level
        ``size_of_seq`` is used.

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    if size is None:
        size = size_of_seq
    # perf_counter is monotonic and high resolution, which makes it the right
    # clock for short intervals; time.time() can be coarse and can jump.
    tic = time.perf_counter()
    X = range(size)
    Y = range(size)
    # The list is built only so that the work is actually performed and timed.
    Z = [x + y for x, y in zip(X, Y)]
    toc = time.perf_counter()
    return toc - tic

def numpy_version(size=None):
    """Time an element-wise sum of two integer arrays using NumPy.

    Parameters
    ----------
    size : int, optional
        Number of elements to add. When omitted, the notebook-level
        ``size_of_seq`` is used.

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    if size is None:
        size = size_of_seq
    # perf_counter is monotonic and high resolution, which makes it the right
    # clock for short intervals; time.time() can be coarse and can jump.
    tic = time.perf_counter()
    X = np.arange(size)
    Y = np.arange(size)
    # Vectorised addition: the whole sum is a single array operation.
    Z = X + Y
    toc = time.perf_counter()
    return toc - tic


# Run each version once and report the elapsed times in seconds.
t1 = pure_python_version()
t2 = numpy_version()
print("python: " + str(t1), "numpy: "+ str(t2))
# A very fast run can measure as exactly 0.0 on a coarse clock; guard the
# ratio so the cell reports that instead of raising ZeroDivisionError.
if t2 > 0:
    print("Numpy is in this example " + str(t1/t2) + " times faster!")
else:
    print("Numpy finished too quickly to measure a speed-up ratio.")

In [None]:
## broadcasting and array math


In [None]:
# list does not broadcast

In [None]:
## simulation with numpy - draws from the standard normal distribution
# A seeded, local RandomState makes the histogram identical on every
# Restart & Run All without touching NumPy's global random state.
rand = np.random.RandomState(42).randn(100000)
plt.hist(rand, bins=200)
plt.title("100,000 draws from the standard normal distribution")
plt.xlabel("value")
plt.ylabel("count")
plt.show()

There are many other wondrous things NumPy can do; you will encounter them later in the course of the program.

## Part II. Pandas
The name pandas is derived from "panel data", and it is the most popular library data scientists use to manipulate, clean, and organize datasets in Python. The most fundamental data structure in pandas is the **DataFrame**.

In [2]:
## importing data and look at optional parameters
# Reads 'auto-mpg.csv' into the DataFrame `df` that the cells below work on.
# NOTE(review): the tables displayed further down show an "Unnamed: 0" column,
# which suggests the CSV carries a saved index column; passing index_col=0
# would absorb it - confirm before changing.
df = pd.read_csv('auto-mpg.csv')

In [None]:
# examine and read the data (a bare `df` as the last expression renders the frame)
df

In [None]:
# examine a random sample of 25 rows; random_state pins the draw so the same
# rows come back on every re-run of the notebook
df.sample(25, random_state=42)

In [None]:
# examine the column labels of the dataframe
df.columns

In [None]:
# examine the row labels (the index) of the dataframe
df.index

In [None]:
# examine the datatype of each column of the dataframe
# NOTE(review): a later cell replaces '?' in 'horsepower' before converting it,
# so that column presumably loads as object rather than a numeric dtype - confirm.
df.dtypes

In [None]:
# talk about series and dataframe 


In [None]:
# series: a single column of the dataframe, here reached with attribute access
df.mpg

In [None]:
# indexing and subsetting: bracket access with a column label returns that column
df['mpg']
# index by label with .loc and by integer position with .iloc


In [None]:
# rows at positions 2 through 10 (0-based; the slice end 11 is exclusive), first 3 columns
# NOTE(review): counted from 1 these are the 3rd-11th rows; if "2nd - 10th row"
# was meant 1-based, the row slice would need to be 1:10 - confirm the intent.
df.iloc[2:11, :3]

In [None]:
# keep only the rows whose weight exceeds 3000
df.loc[df['weight'].gt(3000)]

In [None]:
# examine whether we have missing values - they could really affect the data!
# isna has to be *called*; summing the resulting boolean frame gives the number
# of NaN entries per column.
# NOTE: placeholder strings are not NaN and are not counted here (a later cell
# treats '?' in 'horsepower' as missing).
df.isna().sum()

In [None]:
# exercise: list the car names with mpg below 18 and weight above 3500
# (this cell displays the full matching rows)
df.loc[df['mpg'].lt(18) & df['weight'].gt(3500)]

In [13]:
# Names of cars heavier than 3000 with horsepower below 150.
# errors='coerce' turns every non-numeric entry (such as the '?' placeholder)
# into NaN; NaN < 150 is False, so those rows are simply excluded.
# A single .loc call selects rows and column together instead of chaining.
df.loc[
    (pd.to_numeric(df['horsepower'], errors='coerce') < 150)
    & (df['weight'] > 3000),
    'car name',
]

0              chevrolet chevelle malibu
4                            ford torino
34             plymouth satellite custom
35             chevrolet chevelle malibu
36                       ford torino 500
                     ...                
363                        buick century
364                oldsmobile cutlass ls
365                      ford granada gl
366               chrysler lebaron salon
387    oldsmobile cutlass ciera (diesel)
Name: car name, Length: 100, dtype: object

In [None]:
# (more advanced) exercise - get the cars that are heavier than 3000 but have a horsepower of less than 150


In [7]:
# solution one: boolean mask for the rows and the column label, in one .loc call
df.loc[df['mpg'].lt(18) & df['weight'].gt(3500), 'car name']

1              buick skylark 320
5               ford galaxie 500
6               chevrolet impala
7              plymouth fury iii
8               pontiac catalina
                 ...            
285    chevrolet caprice classic
286              ford ltd landau
287        mercury grand marquis
289      buick estate wagon (sw)
290     ford country squire (sw)
Name: car name, Length: 91, dtype: object

In [None]:
# solution two: filter the rows first, then turn the name column into a plain list
res = df.loc[df['mpg'].lt(18) & df['weight'].gt(3500)]
res['car name'].tolist()

In [None]:
  df.loc[(df['mpg'] < 18) & (df['weight'] > 3500), 'car name']
res = df[(df['mpg'] < 18) & (df['weight'] > 3500)]
list(res['car name'])

      df[(df['mpg'] < 18) & (df['weight'] > 3500)]

In [6]:
# filter the rows with a combined mask, then pick the name column from the result
df.loc[df['mpg'].lt(18) & df['weight'].gt(3500)]['car name']

1              buick skylark 320
5               ford galaxie 500
6               chevrolet impala
7              plymouth fury iii
8               pontiac catalina
                 ...            
285    chevrolet caprice classic
286              ford ltd landau
287        mercury grand marquis
289      buick estate wagon (sw)
290     ford country squire (sw)
Name: car name, Length: 91, dtype: object

In [8]:
# all columns for the rows with mpg below 18 and weight above 3500
df.loc[df['mpg'].lt(18) & df['weight'].gt(3500)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
...,...,...,...,...,...,...,...,...,...
285,17.0,8,305.0,130,3840,15.4,79,1,chevrolet caprice classic
286,17.6,8,302.0,129,3725,13.4,79,1,ford ltd landau
287,16.5,8,351.0,138,3955,13.2,79,1,mercury grand marquis
289,16.9,8,350.0,155,4360,14.9,79,1,buick estate wagon (sw)


In [9]:
# Rows with mpg below 18 and weight below 3500, selected with one combined mask.
# Chaining df[a][b] applies a mask built on the full frame to an already
# filtered frame, which makes pandas warn that the boolean Series key will be
# reindexed; combining the conditions with & avoids that.
df[(df['mpg'] < 18) & (df['weight'] < 3500)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
13,14.0,8,455.0,225,3086,10.0,70,1,buick estate wagon (sw)
34,16.0,6,225.0,105,3439,15.5,71,1,plymouth satellite custom
35,17.0,6,250.0,100,3329,15.5,71,1,chevrolet chevelle malibu
98,16.0,6,250.0,100,3278,18.0,73,1,chevrolet nova custom
121,15.0,8,318.0,150,3399,11.0,73,1,dodge dart custom
128,15.0,6,250.0,100,3336,17.0,74,1,chevrolet nova
154,15.0,6,250.0,72,3432,21.0,75,1,mercury monarch
155,15.0,6,250.0,72,3158,19.5,75,1,ford maverick
