In [1]:
# introducing pandas objects 
import numpy as np 
import pandas as pd 

In [2]:
# The pandas Series object:
# a one-dimensional array of indexed data.
# It can be built directly from a list or array; no index is passed
# here, so pandas attaches a default integer index.

data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# .values exposes the Series' underlying data as a NumPy array
data.values 

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# .index holds the labels; with no explicit index it is a RangeIndex
data.index 

RangeIndex(start=0, stop=4, step=1)

In [6]:
# square brackets work as on a NumPy array: a single key returns a
# scalar, a slice returns a sub-Series that keeps its index labels
data[1], data[1:3]

(0.5,
 1    0.50
 2    0.75
 dtype: float64)

In [7]:
# Series as a generalized NumPy array:
# the index is defined explicitly and need not be integers --
# here every value is labelled with a string.

data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list("abcd"))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
# item access uses the string label defined in the index
data["b"]

0.5

In [9]:
# Index labels may also be non-contiguous, non-sequential integers.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [10]:
# 5 is matched against the index *labels* (2, 5, 3, 7), not used as a
# position -- the Series only has four elements
data[5]

0.5

In [11]:
# Series as Specialized Dictionary
# A Series maps index keys to values, so it can be built straight from
# a Python dict: the dict keys become the index labels.

population_dict = {
    "California": 395,
    "Texas": 291,
    "Florida": 215,
    "New York": 202,
    "Pennsylvania": 130,
}
population = pd.Series(data=population_dict)
population

California      395
Texas           291
Florida         215
New York        202
Pennsylvania    130
dtype: int64

In [12]:
# dictionary-style lookup by key
population["California"]

395

In [13]:
# unlike a dict, a Series also supports slicing by label;
# note that the stop label ("Florida") is included in the result
population["California":"Florida"]

California    395
Texas         291
Florida       215
dtype: int64

In [14]:
# Constructing Series Objects 
# from a list: the index defaults to an integer sequence
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [15]:
# from a scalar: the value is repeated to fill the given index
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [17]:
# from a dict with an explicit index: only the keys listed in `index`
# are kept (key 3 is dropped), in the order the index specifies
pd.Series({
    2:"a", 1:"b", 3:"c"
}, index= [1,2])

1    b
2    a
dtype: object

In [19]:
# The Pandas DataFrame Object
# Start with a second Series, keyed by the same state names as
# `population`, holding an area figure per state.

area_dict = {
    "California": 423,
    "Texas": 695,
    "Florida": 170,
    "New York": 141,
    "Pennsylvania": 119,
}
area = pd.Series(data=area_dict)
area

California      423
Texas           695
Florida         170
New York        141
Pennsylvania    119
dtype: int64

In [20]:
# Combine the two Series into a single two-dimensional object.
# Each dict key becomes a column name and each Series supplies that
# column's values; the state names form the shared row index.

states = pd.DataFrame(data={"population": population, "area": area})
states

Unnamed: 0,population,area
California,395,423
Texas,291,695
Florida,215,170
New York,202,141
Pennsylvania,130,119


In [22]:
# Like a Series, a DataFrame has an index attribute holding the row labels:
states.index 

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [23]:
# ...and a columns attribute: an Index object holding the column labels
states.columns

Index(['population', 'area'], dtype='object')

In [24]:
# DataFrame as Specialized Dictionary

states["area"]      # indexing by column name returns that column as a Series

# Point of confusion:
# for a two-dimensional NumPy array, arr[0] returns the first row,
# whereas for a DataFrame, df[key] looks up a *column* by its label
# (so df[0] only works if a column is actually labelled 0).
# Use df.iloc[0] or df.loc[label] to select a row instead.

California      423
Texas           695
Florida         170
New York        141
Pennsylvania    119
Name: area, dtype: int64

In [25]:
# Constructing DataFrame objects

# from a single Series object: the Series index becomes the row index
# and the result has one column, labelled "population"
pd.DataFrame(population, columns=["population"])

Unnamed: 0,population
California,395
Texas,291
Florida,215
New York,202
Pennsylvania,130


In [26]:
# From a list of dicts: every dict is one row, its keys are the columns.
# NOTE: `data` is re-bound here from a Series to a plain list.

data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [28]:
# If some keys are missing from some of the dicts,
# pandas fills the corresponding cells with NaN
# (columns "a" and "c" become floats as a result).

pd.DataFrame([{"a":1,"b":2},{"b":3,"c":4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [29]:
# from a dictionary of Series objects:
# keys become column names, the Series index becomes the row index

pd.DataFrame({"population":population, "area":area})

Unnamed: 0,population,area
California,395,423
Texas,291,695
Florida,215,170
New York,202,141
Pennsylvania,130,119


In [30]:
# from a two-dimensional numpy array, with explicit column and row labels
# NOTE(review): np.random.rand draws from NumPy's global random state and
# no seed is set in the visible cells, so the values shown will differ on
# every run -- consider a seeded np.random.default_rng for reproducibility.

pd.DataFrame(np.random.rand(3,2), columns=["foo","bar"], index=["a","b","c"])

Unnamed: 0,foo,bar
a,0.744707,0.324904
b,0.429623,0.724364
c,0.921053,0.617194


In [31]:
# From a NumPy structured array.
# Three zero-initialised records, each with a 64-bit integer field "a"
# and a 64-bit float field "b". This cell only builds and displays the
# array; it is not yet passed to pd.DataFrame.
A = np.zeros(
    shape=3,
    dtype=[("a", "i8"), ("b", "f8")],
)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('a', '<i8'), ('b', '<f8')])

In [32]:
# The Pandas Index Object
# The structure that holds the labels of a Series or DataFrame can also
# be created on its own, here from a list of integers.

ind = pd.Index(data=[2, 3, 5, 7, 11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

In [33]:
# Index as immutable array:
# it supports standard array-style indexing to retrieve a single value

ind[1]

3

In [34]:
# slicing works as on an array and returns another Index
ind[::2]

Index([2, 5, 11], dtype='int64')

In [35]:
# Index also exposes the familiar NumPy array attributes
ind.size, ind.shape, ind.ndim, ind.dtype 

(5, (5,), 1, dtype('int64'))

In [36]:
# Index objects are immutable: item assignment raises a TypeError.
# Catch only that error, so an unrelated failure (for example `ind` being
# undefined after a kernel restart) surfaces instead of being silently
# reported as "index is immutable".
try:
    ind[1] = 0
except TypeError:
    print("index is immutable")

index is immutable


In [37]:
# Index as Ordered Set
# Two Index objects that share the values 3, 5 and 7, used below to
# demonstrate set operations.

indA = pd.Index(data=[1, 3, 5, 7, 9])
indB = pd.Index(data=[2, 3, 5, 7, 11])

In [38]:
# values present in both indices
indA.intersection(indB)

Index([3, 5, 7], dtype='int64')

In [39]:
# values present in either index (each value appears once)
indA.union(indB)

Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [40]:
# values present in exactly one of the two indices
indA.symmetric_difference(indB)

Index([1, 2, 9, 11], dtype='int64')