In [1]:
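# Data Indexing and Selection
#
# NumPy-style access patterns that the pandas examples below mirror:
# indexing          arr[1, 2]
# slicing           arr[1:2:3]       (start:stop:step)
# masking           arr[arr > 0]
# fancy indexing    arr[0, [1, 5]]
# combination       arr[:, [1, 5]]   (slice + fancy index)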

In [2]:
# series as dictionaries
# A dict literal maps each label to its value; insertion order becomes the index order.

import pandas as pd

data = pd.Series({"a": 0.25, "b": 0.5, "c": 0.75, "d": 1.0})
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
# dictionary-style lookup: fetch the value stored under label "b"
data["b"]

0.5

In [4]:
# membership test works on the index labels, like `key in dict`
"a" in data 

True

In [5]:
# keys() returns the Series' index labels as an Index object
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
# items() yields (label, value) pairs, like dict.items()
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
# assigning to a label that does not exist yet extends the Series in place,
# just like adding a new key to a dict
data["e"] = 1.25
data 

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [8]:
# series as one-dimensional array

# slicing by explicit index (labels): the end label "c" is included
data["a":"c"]

a    0.25
b    0.50
c    0.75
dtype: float64

In [9]:
# slicing by implicit integer index (positions): the end position 2 is excluded
data[0:2]

a    0.25
b    0.50
dtype: float64

In [11]:
# masking: keep only the entries strictly between 0.3 and 0.8
above_low = data > 0.3
below_high = data < 0.8
data[above_low & below_high]

b    0.50
c    0.75
dtype: float64

In [12]:
# fancy indexing: a list of labels selects exactly those entries, in the given order
data[["a","e"]]

a    0.25
e    1.25
dtype: float64

In [13]:
# indexers: loc - explicit, iloc - implicit
# With an integer index, plain [] is confusing to read:
#   - indexing (data[1]) uses the explicit index, i.e. the labels given via index=...
#   - slicing (data[1:3]) uses the implicit, positional index
# loc and iloc remove that ambiguity.

data = pd.Series(["a","b","c"], index=[1,3,5])
data 

1    a
3    b
5    c
dtype: object

In [14]:
# explicit index when indexing: 1 is treated as the label 1, not as position 1
data[1]

'a'

In [15]:
# implicit index when slicing: positions 1 and 2 are returned
# (labels 3 and 5), not the labels from 1 to 3
data[1:3]

3    b
5    c
dtype: object

In [17]:
# loc always uses the explicit index (the labels passed via index=...): label 1
data.loc[1]

'a'

In [19]:
# iloc always uses the implicit index (zero-based position): position 1
data.iloc[1]

'b'

In [20]:
# DataFrame as Dictionary
# Two Series keyed by the same state names, listed in the same order.
area = pd.Series(
    {
        "California": 423967,
        "Texas": 695662,
        "Florida": 170312,
        "New York": 141297,
        "Pennsylvania": 119280,
    }
)
pop = pd.Series(
    {
        "California": 39538223,
        "Texas": 29145505,
        "Florida": 21538187,
        "New York": 20201249,
        "Pennsylvania": 13002700,
    }
)


In [22]:
# build a DataFrame from a dict of Series: each key becomes a column name,
# and the shared state labels become the row index
data = pd.DataFrame({"area":area, "pop":pop})
data 

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [23]:
# a DataFrame behaves like a dictionary of columns:
# dictionary-style indexing with a column name returns that column as a Series
# (array-style indexing by row and column is shown further down with iloc/loc)
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [24]:
# attribute-style column access works here, but it is fragile: a column whose
# name matches a DataFrame method or attribute is shadowed by it
data.area  # not recommended

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [25]:
# data.pop resolves to the DataFrame.pop method, so it is not the "pop" column
data.pop is data["pop"]     # method vs attribute

False

In [26]:
# element-wise division of two columns; assigning the result to a new key
# adds a "density" column to the DataFrame in place
data["density"] = data["pop"] / data["area"]    # add new column
data 

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [27]:
# DataFrame as Two-Dimensional Array

# Array form of the table: one row per index entry, one column per DataFrame column.
# to_numpy() is the accessor the pandas documentation recommends over the older
# .values attribute; the mixed int/float columns come back as one float array.
data.to_numpy()

array([[4.23967000e+05, 3.95382230e+07, 9.32577842e+01],
       [6.95662000e+05, 2.91455050e+07, 4.18960717e+01],
       [1.70312000e+05, 2.15381870e+07, 1.26463121e+02],
       [1.41297000e+05, 2.02012490e+07, 1.42970120e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [28]:
# transpose: rows and columns swap places (state names become the columns)
data.T 

Unnamed: 0,California,Texas,Florida,New York,Pennsylvania
area,423967.0,695662.0,170312.0,141297.0,119280.0
pop,39538220.0,29145500.0,21538190.0,20201250.0,13002700.0
density,93.25778,41.89607,126.4631,142.9701,109.0099


In [29]:
# indexing the 2-D NumPy array with a single integer selects a row
# (to_numpy() is the accessor pandas recommends over the older .values attribute)
data.to_numpy()[0]

array([4.23967000e+05, 3.95382230e+07, 9.32577842e+01])

In [30]:
# passing a single "index" to a DataFrame will address a column:
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [31]:
# iloc: positional slicing - first three rows, first two columns (end positions excluded)
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [32]:
# loc: label slicing - both end labels ("Florida" and "pop") are included
data.loc[:"Florida", :"pop"]

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [33]:
# combine masking and fancy indexing in one .loc call:
#   rows    -> boolean mask (density above 120)
#   columns -> list of labels
# Bracket access for the column keeps this consistent with the earlier note
# that attribute-style access (data.area) is not recommended.
data.loc[data["density"] > 120, ["pop", "density"]]

Unnamed: 0,pop,density
Florida,21538187,126.463121
New York,20201249,142.97012


In [35]:
# modification: an indexer can also be the target of an assignment.
# This overwrites row 0, column 2 (California's density) in place,
# so every later cell sees 999 there.
data.iloc[0,2] = 999
data 

Unnamed: 0,area,pop,density
California,423967,39538223,999.0
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [36]:
# additional indexing conventions

# while a single label in [] addresses a column, a SLICE in []
# addresses the rows; a label slice includes its end label
data["Florida":"New York"]

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [37]:
# slicing with integers addresses rows by position (end position excluded):
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [38]:
# masking operations are also interpreted row-wise
# (California is included because its density was overwritten with 999 above;
# bracket access is used for the column, as attribute access is not recommended)
data[data["density"] > 120]

Unnamed: 0,area,pop,density
California,423967,39538223,999.0
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
