# Pandas
## How to use

In [195]:
import pandas as pd
import numpy as np

In [196]:
s_data = pd.Series([0.25, 0.50, 0.75, 1.0])

In [197]:
print(s_data)
print(type(s_data))

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [198]:
s_data[1]

0.5

In [199]:
print(s_data.values)
print(s_data.index)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [200]:
s_data = pd.Series([0.25, 0.50, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [201]:
s_data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [202]:
s_data['b']

0.5

In [203]:
s_data = pd.Series([0.25, 0.50, 0.75, 1.0], index=[2, 5, 3, 7])

In [204]:
s_data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [205]:
s_data[5]

0.5

### Series as a specialized dictionary

In [206]:
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illionis': 12882135
}

In [207]:
population = pd.Series(population_dict)

In [208]:
type(population)

pandas.core.series.Series

In [209]:
population['California']

38332521

In [210]:
population['Texas':'Florida']

Texas       26448193
New York    19651127
Florida     19552860
dtype: int64

In [211]:
population.sort_values()

Illionis      12882135
Florida       19552860
New York      19651127
Texas         26448193
California    38332521
dtype: int64

In [212]:
# Create Seris object using Scalar value
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [213]:
pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])

3    c
2    a
dtype: object

### Create data frame

In [214]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illionis      12882135
dtype: int64

In [215]:
area_dict = {
    'California': 423967,
    'Texas': 695692,
    'New York': 141297,
    'Florida': 170312,
    'Illionis': 149995
}

In [216]:
area = pd.Series(area_dict)

In [217]:
states = pd.DataFrame({
    'Population': population, 
    'Area': area
})

In [218]:
states

Unnamed: 0,Population,Area
California,38332521,423967
Texas,26448193,695692
New York,19651127,141297
Florida,19552860,170312
Illionis,12882135,149995


In [219]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illionis'], dtype='object')

In [220]:
states.columns

Index(['Population', 'Area'], dtype='object')

In [221]:
states['Area']

California    423967
Texas         695692
New York      141297
Florida       170312
Illionis      149995
Name: Area, dtype: int64

### Constructing DataFrame object
- From a single Series object
- From a list of dictionaries
- From a dictionary of Series objects
- From a dimensional NumPy array
- From a NumPy stucted array

In [222]:
print(type(population))
print(type(states))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [223]:
# Create a datafram with a list of dictionaries
data = [{
    'a': 1,
    'b': 2 * i
} for i in range(3)]

In [224]:
print(data)
print(type(data))

[{'a': 1, 'b': 0}, {'a': 1, 'b': 2}, {'a': 1, 'b': 4}]
<class 'list'>


In [225]:
print(pd.DataFrame(data))
print(type(pd.DataFrame(data)))

   a  b
0  1  0
1  1  2
2  1  4
<class 'pandas.core.frame.DataFrame'>


In [226]:
# Dataframe can handle missing values
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [227]:
# Creat Dataframe using two-dimensional NumPy array
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'], 
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.823871,0.454365
b,0.522143,0.248935
c,0.524139,0.081091


### Index as orderedd set

In [228]:
# Create an Index from the list of integers
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [229]:
ind[1]

3

In [230]:
# Index does not support mutable operations
ind[0] = 1

TypeError: Index does not support mutable operations

In [None]:
ind_a = pd.Index([1, 3, 5, 7, 9])
ind_b = pd.Index([2, 3, 5, 7, 11])

In [None]:
ind_a & ind_b

  ind_a & ind_b


Int64Index([3, 5, 7], dtype='int64')

In [None]:
ind_a.intersection(ind_b)

Int64Index([3, 5, 7], dtype='int64')

In [None]:
ind_a | ind_b

  ind_a | ind_b


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [None]:
ind_a.union(ind_b)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [None]:
ind_a ^ ind_b

  ind_a ^ ind_b


Int64Index([1, 2, 9, 11], dtype='int64')

In [None]:
ind_a.symmetric_difference(ind_b)

Int64Index([1, 2, 9, 11], dtype='int64')

### Data Indexing and Selection

- Series as dictionary

In [None]:
data = pd.Series([0.25, 0.50, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [None]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [None]:
'a' in data

True

In [None]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [None]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [None]:
data['e'] = 1.25

In [None]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [None]:
# Slicing using explicit indexing
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [None]:
# Slicing using implicit indexing
data[0:2]

a    0.25
b    0.50
dtype: float64

In [None]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [None]:
# Masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [None]:
# Fancy indexing, list of indices
data[['a', 'd']]

a    0.25
d    1.00
dtype: float64

### Indexer loc and iloc

In [None]:
data = pd.Series(
    ['a', 'b', 'c'], 
    index=[1, 3, 5]
)
data

1    a
3    b
5    c
dtype: object

In [None]:
# To access data using index it using explicit indexing
data[1]

'a'

In [None]:
# To access data using index it using implicit indexing
data[1:3]

3    b
5    c
dtype: object

In [None]:
# 'loc' use explcit indexing
data.loc[1]

'a'

In [None]:
# iloc use implicit indexing
data.iloc[1]

'b'

### Data Selection in dataframe

#### DataFram as a dictionary

In [None]:
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illionis': 12882135
})

In [None]:
area = pd.Series({
    'California': 423967,
    'Texas': 695692,
    'New York': 141297,
    'Florida': 170312,
    'Illionis': 149995
})

In [None]:
data = pd.DataFrame({
    'area': area,
    'pop': pop,
})

In [None]:
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127
Florida,170312,19552860
Illionis,149995,12882135


In [None]:
# Dictinary stlye indexing
data['area']

California    423967
Texas         695692
New York      141297
Florida       170312
Illionis      149995
Name: area, dtype: int64

In [None]:
# Attribute type
data.area

California    423967
Texas         695692
New York      141297
Florida       170312
Illionis      149995
Name: area, dtype: int64

In [None]:
data.area is data['area']

True

In [None]:
data.pop

<bound method DataFrame.pop of               area       pop
California  423967  38332521
Texas       695692  26448193
New York    141297  19651127
Florida     170312  19552860
Illionis    149995  12882135>

In [None]:
data.pop is data['pop']

False

In [None]:
# Avoid using attribute style "data.pop"
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illionis      12882135
Name: pop, dtype: int64

In [None]:
data['dens'] = data['pop'] / data['area']

In [None]:
data

Unnamed: 0,area,pop,dens
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763


In [None]:
# Get the values of the dataframe
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95692000e+05, 2.64481930e+07, 3.80171010e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [None]:
# Get first row
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [None]:
# Slicing using implicit indexing
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127


In [None]:
# Explicit index include the final index
data.loc[:'New York', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127


In [None]:
# 'loc' and 'iloc' can combine masking and fancy indexing
data.loc[data.dens > 100, ['pop', 'dens']]

Unnamed: 0,pop,dens
New York,19651127,139.076746
Florida,19552860,114.806121


In [None]:
# Modify dataframe using iloc
data.iloc[0, 2] = 90

In [None]:
data

Unnamed: 0,area,pop,dens
California,423967,38332521,90.0
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illionis,149995,12882135,85.883763


### Operating on data in Pandas

In [None]:
# Series
rng = np.random.RandomState(42)

ser = pd.Series(rng.randint(0, 10, 4))

ser

0    6
1    3
2    7
3    4
dtype: int32

In [None]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,6,3,8,2
1,4,2,6,4
2,8,6,1,3


In [None]:
# Index preserves in series
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [None]:
# Index preserves in dataframe
np.sin(df * np.pi / 4)

Unnamed: 0,a,b,c,d
0,-1.0,0.707107,-2.449294e-16,1.0
1,1.224647e-16,1.0,-1.0,1.224647e-16
2,-2.449294e-16,-1.0,0.7071068,0.7071068


### Index alignment in series

In [None]:
area = pd.Series({
    'California': 423967,
    'New York': 141297,
    'Florida': 170312,
    'Illionis': 149995
})

In [None]:
population = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Illionis': 12882135
})

In [None]:
area / population

California    0.011060
Florida            NaN
Illionis      0.011644
New York      0.007190
Texas              NaN
dtype: float64

In [None]:
area.index.union(population.index)

Index(['California', 'Florida', 'Illionis', 'New York', 'Texas'], dtype='object')

### Index alignment in dataframe

In [None]:
a = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('xy'))

In [None]:
a

Unnamed: 0,x,y
0,7,3
1,1,5


In [None]:
b = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('xyz'))

In [None]:
b

Unnamed: 0,x,y,z
0,5,9,3
1,5,1,9
2,1,9,3


In [None]:
a + b

Unnamed: 0,x,y,z
0,12.0,12.0,
1,6.0,6.0,
2,,,


In [None]:
fill = a.stack().mean()

In [None]:
a.add(b, fill_value=fill)

Unnamed: 0,x,y,z
0,12.0,12.0,7.0
1,6.0,6.0,13.0
2,5.0,13.0,7.0
