# Pandas
#### Chapter 5 (Python Data Analysis)

---
# Data Structures

In [6]:
import pandas as pd
from pandas import Series, DataFrame

## 1. Series

In [2]:
# Build a Series from a plain list. No index is supplied, so pandas labels the
# values with the default integer positions.
s = Series(data=[2, 5, 1, 6])
s

0    2
1    5
2    1
3    6
dtype: int64

In [3]:
s.values

array([2, 5, 1, 6])

In [4]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Same constructor, this time with explicit string labels in place of the
# default integer index.
s2 = Series(data=[2, 5, 9, 1], index=list('abcd'))
s2

a    2
b    5
c    9
d    1
dtype: int64

In [7]:
s2['b']

5

In [8]:
s2[['b', 'a']]

b    5
a    2
dtype: int64

In [9]:
s2[s2 > 2]

b    5
c    9
dtype: int64

## 2. DataFrame

In [4]:
# Column-oriented source data: each key names a column and each list holds
# that column's values, one entry per row.
# NOTE(review): "Fedral" looks like a misspelling of "Federal"; it is kept
# unchanged here so the recorded outputs below still match.
data = dict(
    City=["Ibd", "Lhr", "Khi", "Pew", "Quetta"],
    Province=["Fedral", "Punjab", "Sindh", "KPK", "Balochistan"],
)
print(type(data))
print(data)

<class 'dict'>
{'City': ['Ibd', 'Lhr', 'Khi', 'Pew', 'Quetta'], 'Province': ['Fedral', 'Punjab', 'Sindh', 'KPK', 'Balochistan']}


In [7]:
frame = pd.DataFrame(data)
print(type(frame))
frame

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,City,Province
0,Ibd,Fedral
1,Lhr,Punjab
2,Khi,Sindh
3,Pew,KPK
4,Quetta,Balochistan


A column can be retrieved as a Series either by dict-like indexing or by attribute access.

In [11]:
type(frame['City'])

pandas.core.series.Series

In [14]:
frame.City

0       Ibd
1       Lhr
2       Khi
3       Pew
4    Quetta
Name: City, dtype: object

In [16]:
frame.loc[1]

City           Lhr
Province    Punjab
Name: 1, dtype: object

In [2]:
import numpy as np

In [23]:
np.arange(100, 500, 100)

array([100, 200, 300, 400])

### Creating a new column

In [8]:
# Assigning to a label that does not exist yet creates a new column.
# The assigned array must have exactly one value per row, so its length is
# derived from the frame instead of hard-coding a range that happens to yield
# five values. For the five-row frame this gives 0, 100, 200, 300, 400.
frame['population'] = np.arange(len(frame)) * 100
frame

Unnamed: 0,City,Province,population
0,Ibd,Fedral,0
1,Lhr,Punjab,100
2,Khi,Sindh,200
3,Pew,KPK,300
4,Quetta,Balochistan,400


In [9]:
frame.columns

Index(['City', 'Province', 'population'], dtype='object')

### Use `del` to delete columns

In [10]:
# `del` removes the column from `frame` itself rather than returning a new
# object. Running this cell a second time therefore raises KeyError, because
# 'population' is already gone.
del frame['population']
# Show the remaining column labels to confirm the deletion.
frame.columns

Index(['City', 'Province'], dtype='object')

### dict of dicts

#### Outer dict: columns
#### Inner keys as row index

In [11]:
# Nested dict: the outer keys become columns and the inner keys become the
# row index. A year missing from one state's inner dict shows up as NaN in
# that state's column.
data = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
# The order in which the inner keys are merged into the row index differs
# between pandas versions, so sort explicitly to keep the years ascending.
frame2 = pd.DataFrame(data).sort_index()
frame2

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


#### Transpose just like numpy

In [12]:
frame2.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


#### The `values` attribute returns the DataFrame's data as a 2D array

In [20]:
frame2

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [19]:
frame2.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [21]:
frame.values

array([['Ibd', 'Fedral'],
       ['Lhr', 'Punjab'],
       ['Khi', 'Sindh'],
       ['Pew', 'KPK'],
       ['Quetta', 'Balochistan']], dtype=object)

## 3. Index Objects
Index objects hold the axis labels and other metadata (axis name, etc.).

In [23]:
# Three consecutive integers labelled 'a', 'b', 'c'; the labels are what the
# next cells pull out as an Index object.
obj = pd.Series(data=range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int64

In [25]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [26]:
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable (can't be modified by user).

Makes it safer to share among data structures

In [28]:
# An Index can be built directly from an array; here it holds 0, 1, 2.
labels = pd.Index(data=np.arange(0, 3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [29]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [31]:
obj2.index is labels

True

A pandas Index can contain duplicate labels.

In [32]:
# Unlike a Python set, an Index keeps repeated labels as they are.
dup_labels = pd.Index(
    data=['foo', 'foo', 'bar', 'bar'],
)
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

---
# Essential Functionality

### Reindexing

Create new object with data conformed to a new index.

In [37]:
# Values 1..4 under deliberately unsorted labels, so the effect of reindexing
# is visible in the next cell.
obj = pd.Series([1, 2, 3, 4], index=list('dbac'))
obj

d    1
b    2
a    3
c    4
dtype: int64

In [38]:
# Rearrange `obj` to follow the new label order. 'e' has no counterpart in
# `obj`, so it is filled with NaN and the result becomes float64.
obj2 = obj.reindex(index=list('abcde'))
obj2

a    3.0
b    2.0
c    4.0
d    1.0
e    NaN
dtype: float64

Use `method='ffill'` (forward fill) to fill the gaps that reindexing introduces, e.g. when interpolating ordered data.

In [40]:
# Colours labelled with the even integers 0, 2, 4, leaving gaps at 1, 3, 5
# for the forward-fill example.
obj3 = pd.Series({0: 'blue', 2: 'purple', 4: 'yellow'})
obj3

0      blue
2    purple
4    yellow
dtype: object

In [41]:
# Expand the index to 0..5; each new label takes the value of the closest
# preceding existing label (forward fill).
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

`reindex` can alter rows, columns, or both. By default it reindexes the rows.

In [42]:
# 3x3 grid of the integers 0..8. The row labels skip 'b', which leaves a gap
# for reindexing to fill.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [43]:
# Reindex the rows only. 'b' does not exist in `frame`, so that row is all
# NaN and the integer columns are upcast to float.
frame2 = frame.reindex(index=list('abcd'))
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


Columns can be reindexed with the `columns` keyword.

In [44]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [46]:
# Select rows and columns by label when some labels ('b', 'Utah') do not
# exist in `frame`. Passing a list with missing labels to `.loc` is deprecated
# and raises KeyError in current pandas; `reindex` accepts missing labels and
# fills the corresponding row/column with NaN.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### Dropping Entries from an Axis

In [47]:
obj

d    1
b    2
a    3
c    4
dtype: int64

In [48]:
obj.drop('c')

d    1
b    2
a    3
dtype: int64

In [49]:
# 4x4 grid of the integers 0..15, with state names as row labels and number
# words as column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [50]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [51]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [52]:
data.drop('three', axis='columns')

Unnamed: 0,one,two,four
Ohio,0,1,3
Colorado,4,5,7
Utah,8,9,11
New York,12,13,15


The drop can also be done in place by passing `inplace=True`.

## Indexing, Selection, and Filtering

Series indexing works like NumPy array indexing.

In [53]:
obj

d    1
b    2
a    3
c    4
dtype: int64

In [55]:
# Boolean mask: keep only the entries whose value is even.
obj[(obj % 2).eq(0)]

b    2
c    4
dtype: int64

In [54]:
# Label-based slices are inclusive at both ends: the entry labelled 'c' is
# part of the result.
obj.loc['b':'c']

b    2
a    3
c    4
dtype: int64

In [56]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [58]:
data[['two', 'four']]

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [62]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


#### Selection with loc and iloc
Select a subset of rows and columns from a DataFrame:

- `loc` selects by axis labels
- `iloc` selects by integer position

In [64]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [65]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [66]:
data.iloc[1, [1, 3]]

two     5
four    7
Name: Colorado, dtype: int64

In [69]:
# Keep the rows where column 'three' exceeds 5 and only the first three
# columns. Doing both in one `.loc` call avoids chaining a second `[...]`
# onto the `iloc` result, and gives the same frame.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14
