# Learning the Pandas Library

In [1]:
import pandas as pd
pd.__version__

'1.4.4'

## Series

In [None]:
# pandas Series are 1D indexed arrays of data
mySeries = pd.Series([0, 1, 2, 4.2, "hi"])

print("mySeries:", mySeries, end="\n\n")

print("values:", mySeries.values, end="\n\n")

print("index:", mySeries.index)
print("index-start:", mySeries.index.start, end="\n\n")

print("access index 0:", mySeries[0], end="\n\n")

print("access range 0-1:", mySeries[0:2].values, end="\n\n")

print("reverse mySeries:", mySeries[::-1].values, end="\n\n")

## we can explicitly list indices, so they can be thought of as a python dictionary
mySeries = pd.Series([0, 1, 2, 4.2, "hi"], index=[2, 4, 6, 8, 'a'])

print("explicitly indexed:", mySeries)

print("access index 'a':", mySeries['a'])

## DataFrames

In [11]:
# DataFrames are two dimensional arrays, with flexible columns and rows

ice_cream_price_dict = {'Chocolate': 2.4, 'Vanilla': 2.1, 'Coffee': 2.6, 'Coconut': 1.8}
ice_cream_prices = pd.Series(ice_cream_price_dict)

ice_cream_popularity_dict = {'Chocolate': 82, 'Vanilla': 94, 'Coffee': 71, 'Coconut': 57}
ice_cream_popularities = pd.Series(ice_cream_popularity_dict)

ice_cream = pd.DataFrame({'price': ice_cream_prices, 'popularity': ice_cream_popularities})

print(ice_cream, end="\n\n")
print("indexes:", ice_cream.index, end="\n\n")
print("columns:", ice_cream.columns, end="\n\n")
print("prices:", ice_cream['price'], end="\n\n")
print("popularity:", ice_cream['popularity'], end="\n\n")

# if keys are missing from a column, then it will be filled in by NaN
# DataFrames can be created from:
#   - from a single Series object
#   - from a list of dicts
#   - from a dictionary of Series objects
#   - from a two-dimesnional NumPy array
#   - from a NumPy structured array

           price  popularity
Chocolate    2.4          82
Vanilla      2.1          94
Coffee       2.6          71
Coconut      1.8          57

indexes: Index(['Chocolate', 'Vanilla', 'Coffee', 'Coconut'], dtype='object')

columns: Index(['price', 'popularity'], dtype='object')

prices: Chocolate    2.4
Vanilla      2.1
Coffee       2.6
Coconut      1.8
Name: price, dtype: float64

popularity: Chocolate    82
Vanilla      94
Coffee       71
Coconut      57
Name: popularity, dtype: int64



## Index

In [23]:
# pandas Index is an immutable array/ordered set

ind = pd.Index([1, 2, 3, 5, 8])
print(ind)
print("index 0:", ind[0])
print("size:", ind.size)
print("shape:", ind.shape)

print()

ind1 = pd.Index([1, 4, 8])
print("intersection:", ind.intersection(ind1))
print("union:", ind.union(ind1))
print("symmetric difference:", ind.difference(ind1))

Int64Index([1, 2, 3, 5, 8], dtype='int64')
index 0: 1
size: 5
shape: (5,)

intersection: Int64Index([1, 8], dtype='int64')
union: Int64Index([1, 2, 3, 4, 5, 8], dtype='int64')
symmetric difference: Int64Index([2, 3, 5], dtype='int64')


## Vectorized Operations (String Manipulation)

In [5]:
name_list = ['Taylor', 'molly', None, 'fElix', 'Maggie', 'meiMei']

names = pd.Series(name_list)
names = names.str.capitalize()
print(names)

## built-in to pandas
# len()	lower()	translate()	islower()
# ljust()	upper()	startswith()	isupper()
# rjust()	find()	endswith()	isnumeric()
# center()	rfind()	isalnum()	isdecimal()
# zfill()	index()	isalpha()	split()
# strip()	rindex()	isdigit()	rsplit()
# rstrip()	capitalize()	isspace()	partition()
# lstrip()	swapcase()	istitle()	rpartition()


## use regular expressions:
#match()
#extract()
#findall()
#replace()
#contains()
#count()
#split()
#rsplit()

## misc
#get()
#slice()
#slice_replace()
#cat()
#repeat()
#normalize()
#pad()
#wrap()
#join()
#get_dummies()

0    Taylor
1     Molly
2      None
3     Felix
4    Maggie
5    Meimei
dtype: object


## Time

In [15]:
## native python: not perfect for large arrays
from datetime import datetime
from dateutil import parser

date = datetime(year=2022, month=10, day=4)
print(date)
print(date == parser.parse("October 4th, 2022"))
print(date.strftime('%Y'))

2022-10-04 00:00:00
True
2022


In [18]:
## numpy: not the best for precision (64-bit)
import numpy as np
date = np.array('2022-10-04', dtype=np.datetime64)
print(f'date: {date}')
print(f'date, arange: {date + np.arange(12)}')

date: 2022-10-04
date, arange: ['2022-10-04' '2022-10-05' '2022-10-06' '2022-10-07' '2022-10-08'
 '2022-10-09' '2022-10-10' '2022-10-11' '2022-10-12' '2022-10-13'
 '2022-10-14' '2022-10-15']


In [41]:
## pandas
date = pd.to_datetime("October 4th, 2022")
print(date)
print(date.strftime('%A'))
print(date + pd.to_timedelta(np.arange(12), 'D'))

print()

## index data by timestamp
index = pd.DatetimeIndex(['2007-02-03', '2004-07-21', '1971-01-13', '1974-08-15'])
birthdays = pd.Series(['Molly', 'Taylor', 'Felix', 'Maggie'], index=index)
print(birthdays)
print(birthdays['2007'])
print(index.to_period('Y'))   # PeriodIndex from DatetimeIndex
print(index - index[0])   # TimedeltaIndex from DatetimeIndex
print(f"Date Range: {pd.date_range('2022-05-18', periods=4, freq='M')}")



2022-10-04 00:00:00
Tuesday
DatetimeIndex(['2022-10-04', '2022-10-05', '2022-10-06', '2022-10-07',
               '2022-10-08', '2022-10-09', '2022-10-10', '2022-10-11',
               '2022-10-12', '2022-10-13', '2022-10-14', '2022-10-15'],
              dtype='datetime64[ns]', freq=None)

2007-02-03     Molly
2004-07-21    Taylor
1971-01-13     Felix
1974-08-15    Maggie
dtype: object
2007-02-03    Molly
dtype: object
PeriodIndex(['2007', '2004', '1971', '1974'], dtype='period[A-DEC]')
TimedeltaIndex(['0 days', '-927 days', '-13170 days', '-11860 days'], dtype='timedelta64[ns]', freq=None)
Date Range: DatetimeIndex(['2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31'], dtype='datetime64[ns]', freq='M')


ModuleNotFoundError: No module named 'pandas_datareader'

## Eval()

In [46]:
## pd.eval('') runs operations much more quickly
import math

%timeit math.factorial(25)
%timeit pd.eval('math.factorial(25)')

## df.eval() can do the same thing as pd.eval() in less code

398 ns ± 33.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
418 µs ± 60.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
