# Python Libraries 1

_September 16, 2020_ 

Agenda today:
- Introduction to Numpy: array math
- Introduction to Pandas: importing, indexing, and math

In [None]:
!pip install pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Part I. Numpy
The basic data structure that exist in numpy is called numpy arrays. NP arrays are very similar to python lists. The __difference__ between a python list and a Numpy array is that list can only a mix of data types but array can only contain the same data type.

But what is the benefits of using NumPy array instead of the base python lists?
- Speed 
- Broadcasting Property

In [None]:
names_list=['Bob','John','Sally']
names_array=np.char.array(['Bob','John','Sally']) #use numpy.array for numbers and numpy.char.array for strings
print(names_list)
print(names_array)

In [None]:
import time

size_of_seq = 100000

def pure_python_version():
    tic = time.time()
    X = range(size_of_seq)
    Y = range(size_of_seq)
    Z = [X[i] + Y[i] for i in range(len(X)) ]
    toc = time.time()
    return toc - tic

def numpy_version():
    tic = time.time()
    X = np.arange(size_of_seq)
    Y = np.arange(size_of_seq)
    Z = X + Y 
    toc = time.time()
    return toc - tic


t1 = pure_python_version()
t2 = numpy_version()
print("python: " + str(t1), "numpy: "+ str(t2))
print("Numpy is in this example " + str(t1/t2) + " times faster!")

In [None]:
## broadcasting and array math

np.array([2,3,4,6]) * 5

In [None]:
li = [2,3,4,6]
li * 5

In [None]:
## simulation with numpy - in normal distribution 
rand = np.random.randn(100000);
plt.hist(rand, bins = 200);

There are many other wonderous things numpy can do, you will encounter them later in the course of the program. 

## Part II. Pandas
Pandas stand for paneled data, and it is the most popular library for data scientists to manipulate, clean, and organize dataset in Python. The most fundamental data structure that exists in Pandas is called **DataFrames**. 

In [55]:
## importing data and look at optional parameters
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [None]:
# examine and read the data

In [56]:
# examine the information in this dataframe
df.info

<bound method DataFrame.info of       mpg  cylinders  displacement horsepower  weight  acceleration  \
0    18.0          8         307.0        130    3504          12.0   
1    15.0          8         350.0        165    3693          11.5   
2    18.0          8         318.0        150    3436          11.0   
3    16.0          8         304.0        150    3433          12.0   
4    17.0          8         302.0        140    3449          10.5   
..    ...        ...           ...        ...     ...           ...   
393  27.0          4         140.0         86    2790          15.6   
394  44.0          4          97.0         52    2130          24.6   
395  32.0          4         135.0         84    2295          11.6   
396  28.0          4         120.0         79    2625          18.6   
397  31.0          4         119.0         82    2720          19.4   

     model year  origin                   car name  
0            70       1  chevrolet chevelle malibu  
1        

In [57]:
# examine the datatypes of the dataframe
df.describe

<bound method NDFrame.describe of       mpg  cylinders  displacement horsepower  weight  acceleration  \
0    18.0          8         307.0        130    3504          12.0   
1    15.0          8         350.0        165    3693          11.5   
2    18.0          8         318.0        150    3436          11.0   
3    16.0          8         304.0        150    3433          12.0   
4    17.0          8         302.0        140    3449          10.5   
..    ...        ...           ...        ...     ...           ...   
393  27.0          4         140.0         86    2790          15.6   
394  44.0          4          97.0         52    2130          24.6   
395  32.0          4         135.0         84    2295          11.6   
396  28.0          4         120.0         79    2625          18.6   
397  31.0          4         119.0         82    2720          19.4   

     model year  origin                   car name  
0            70       1  chevrolet chevelle malibu  
1      

In [None]:
# talk about series and dataframe 
type(df)

In [None]:
# series

In [None]:
# indexing and subsetting 

# index by values


In [58]:
# want weight greater than 3000
df[(df["weight"] > 3000)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
364,26.6,8,350.0,105,3725,19.0,81,1,oldsmobile cutlass ls
365,20.2,6,200.0,88,3060,17.1,81,1,ford granada gl
366,17.6,6,225.0,85,3465,16.6,81,1,chrysler lebaron salon
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [59]:
# examine whether we have missing value - it could really affect the data!
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [None]:
# exercise - want weight greater than 3000 and mpg less than 18 


In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [9]:
def stringtoint(string):
    return float(string)

In [49]:
# exercise - get the cars that are heavier than 3000 but has a horsepower less than 150 
# should be 150 cars that satisfy the criteria 
df = df.replace("?",0)
df["horsepower"] = df.horsepower.astype('int')
newdf = df[(df["weight"] > 3000) & (df["horsepower"] < 150)]
newdf.info

<bound method DataFrame.info of       mpg  cylinders  displacement  horsepower  weight  acceleration  \
0    18.0          8         307.0         130    3504          12.0   
4    17.0          8         302.0         140    3449          10.5   
34   16.0          6         225.0         105    3439          15.5   
35   17.0          6         250.0         100    3329          15.5   
36   19.0          6         250.0          88    3302          15.5   
..    ...        ...           ...         ...     ...           ...   
364  26.6          8         350.0         105    3725          19.0   
365  20.2          6         200.0          88    3060          17.1   
366  17.6          6         225.0          85    3465          16.6   
374  23.0          4         151.0           0    3035          20.5   
387  38.0          6         262.0          85    3015          17.0   

     model year  origin                           car name  
0            70       1          chevrolet

In [19]:
# seems like we have a problem! Can we try to debug this?

In [None]:
# get rid of anomaly


In [53]:
# exercise - get a list of car name where the mpg is less than 18 and weight is greater than 3500
badcars = df[(df["mpg"] < 18) & (df["weight"] > 3500)]["car name"]
badcars

1              buick skylark 320
5               ford galaxie 500
6               chevrolet impala
7              plymouth fury iii
8               pontiac catalina
                 ...            
285    chevrolet caprice classic
286              ford ltd landau
287        mercury grand marquis
289      buick estate wagon (sw)
290     ford country squire (sw)
Name: car name, Length: 91, dtype: object

In [54]:
list(badcars)

['buick skylark 320',
 'ford galaxie 500',
 'chevrolet impala',
 'plymouth fury iii',
 'pontiac catalina',
 'amc ambassador dpl',
 'dodge challenger se',
 "plymouth 'cuda 340",
 'chevrolet monte carlo',
 'ford f250',
 'chevy c20',
 'dodge d200',
 'hi 1200d',
 'chevrolet impala',
 'pontiac catalina brougham',
 'ford galaxie 500',
 'plymouth fury iii',
 'dodge monaco (sw)',
 'ford country squire (sw)',
 'pontiac safari (sw)',
 'chevrolet impala',
 'pontiac catalina',
 'plymouth fury iii',
 'ford galaxie 500',
 'amc ambassador sst',
 'mercury marquis',
 'buick lesabre custom',
 'oldsmobile delta 88 royale',
 'chrysler newport royal',
 'amc matador (sw)',
 'chevrolet chevelle concours (sw)',
 'ford gran torino (sw)',
 'plymouth satellite custom (sw)',
 'buick century 350',
 'amc matador',
 'chevrolet malibu',
 'ford gran torino',
 'dodge coronet custom',
 'mercury marquis brougham',
 'chevrolet caprice classic',
 'ford ltd',
 'plymouth fury gran sedan',
 'chrysler new yorker brougham',
 'b