In [2]:
import pandas as pd
import numpy as np

## Series

In [3]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# .array returns the values wrapped in a pandas array object
# (a NumpyExtensionArray in the output below).
obj.array

<NumpyExtensionArray>
[np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
Length: 4, dtype: int64

In [5]:
# No index was supplied when obj was created, so it has a default RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Build a Series with an explicit label for each value.
obj2 = pd.Series([4, 7, -5, 3], index=["a", "b", "c", "d"])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
# Look up a single value by its index label.
obj2["a"]

np.int64(4)

In [8]:
# Passing a list of labels selects several values, in the order requested.
obj2[["c", "a", "b"]]

c   -5
a    4
b    7
dtype: int64

In [13]:
# Boolean-mask filtering: keep only the entries greater than zero.
obj2[obj2 > 0]

a    4
b    7
d    3
dtype: int64

In [14]:
# Scalar multiplication is applied element-wise; the index labels are kept.
obj2 * 2

a     8
b    14
c   -10
d     6
dtype: int64

In [15]:
# Applying a NumPy function to a Series returns a Series with the same index labels.
np.exp(obj2)

a      54.598150
b    1096.633158
c       0.006738
d      20.085537
dtype: float64

In [17]:
# `in` on a Series tests membership among the index labels.
("b" in obj2, "e" in obj2)

(True, False)

In [18]:
# Convert a Python dictionary to a Series: keys become the index labels.
sdata = {
    "Ohio": 35000,
    "Texas": 71000,
    "Oregon": 16000,
    "Utah": 5000,
}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [19]:
# Convert the Series back into a plain Python dictionary.
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [24]:
# Pass an explicit index to choose which dictionary keys are used and in what
# order. Three of the values in sdata are placed at their matching labels;
# "California" has no entry in sdata, so it appears as NaN (Not a Number),
# which pandas uses to mark missing or NA values. "Utah" is not in states,
# so it is excluded from the resulting object.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [26]:
# pd.isna and pd.notna are the functions to use for detecting missing data.
pd.isna(obj4), pd.notna(obj4)

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool)

In [27]:
# Method form of the missing-data check; gives the same result as pd.isna(obj4).
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
# Show both Series together before they are added in the following cell.
obj3, obj4

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 California        NaN
 Ohio          35000.0
 Oregon        16000.0
 Texas         71000.0
 dtype: float64)

In [33]:
# Arithmetic aligns the operands by index label. "Utah" exists only in obj3 and
# "California" is NaN in obj4, so both come out as NaN in the sum.
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [34]:
# Both the Series itself and its index can carry a name, shown in the display.
obj4.name = "Population"
obj4.index.name = "state"
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [35]:
# Show obj again before its index is replaced in the following cell.
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [36]:
# A Series's index can be replaced in place by assignment.
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame

A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type (numeric, string, Boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dictionary of Series all sharing the same index.

In [38]:
# Build a DataFrame from a dictionary of equal-length lists; each key becomes
# a column.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [39]:
# The head method selects only the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [40]:
# The tail method returns the last five rows.
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
# When a sequence of columns is specified, the DataFrame's columns are
# arranged in that order.
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [42]:
# "debt" is not a key of data, so that column is created with missing values
# in every row.
pd.DataFrame(data, columns=["year", "state", "pop", "debt"])

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [47]:
# Retrieve a single column as a Series using dictionary-like notation.
frame["state"]


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [52]:
# Rows can also be retrieved by name with the special loc attribute, or by
# position with iloc.
frame.loc[1], frame.iloc[2]

(state    Ohio
 year     2001
 pop       1.7
 Name: 1, dtype: object,
 state    Ohio
 year     2002
 pop       3.6
 Name: 2, dtype: object)

In [54]:
# Assigning a scalar to a column fills every row with that value; "debt" did
# not exist in frame yet, so the column is created.
frame["debt"] = 16.5
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.5
1,Ohio,2001,1.7,16.5
2,Ohio,2002,3.6,16.5
3,Nevada,2001,2.4,16.5
4,Nevada,2002,2.9,16.5
5,Nevada,2003,3.2,16.5


In [59]:
# Overwrite the column with an array of six floats (0.0 through 5.0), one per row.
frame["debt"] = np.arange(6.0)
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,1.0
2,Ohio,2002,3.6,2.0
3,Nevada,2001,2.4,3.0
4,Nevada,2002,2.9,4.0
5,Nevada,2003,3.2,5.0


In [62]:
# Assigning a Series to a column aligns it to the DataFrame's index by label.
# val is labelled "two", "four", "five", none of which occur in frame's
# integer index (0-5), so every row of "debt" ends up missing.
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
frame["debt"] = val
frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2001,2.4,
4,Nevada,2002,2.9,
5,Nevada,2003,3.2,


In [66]:
# Add a Boolean column that is True where the state equals "Ohio".
frame["eastern"] = (frame["state"] == "Ohio")
frame, frame.columns

(    state  year  pop  debt  eastern
 0    Ohio  2000  1.5   NaN     True
 1    Ohio  2001  1.7   NaN     True
 2    Ohio  2002  3.6   NaN     True
 3  Nevada  2001  2.4   NaN    False
 4  Nevada  2002  2.9   NaN    False
 5  Nevada  2003  3.2   NaN    False,
 Index(['state', 'year', 'pop', 'debt', 'eastern'], dtype='object'))

In [65]:
# Remove the "eastern" column with del, then list the remaining columns.
# NOTE(review): this cell's execution count (65) is lower than the preceding
# cell's (66), so the two were run out of order; verify the notebook with
# Restart Kernel -> Run All.
del frame["eastern"]
frame.columns

Index(['state', 'year', 'pop', 'debt'], dtype='object')

In [68]:
# A dictionary of dictionaries: the outer keys become the columns and the
# inner keys become the row index.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
frame2 = pd.DataFrame(populations)
frame2

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [69]:
# Transpose the DataFrame: rows and columns are swapped.
frame2.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [70]:
# With an explicit index, only the listed row labels are used; 2003 has no
# data in populations, so its row is entirely missing.
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [72]:
# Build a DataFrame from a dictionary of Series. frame2's index holds integer
# year labels, so .iloc is used to make it explicit that these slices are
# positional (all but the last row / the first two rows), not label-based.
pdata = {
    "Ohio": frame2["Ohio"].iloc[:-1],
    "Nevada": frame2["Nevada"].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [73]:
# Show frame2 again: building pdata from its slices left it unchanged.
frame2

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [75]:
# The row index and the columns can each be given a name, which is shown
# when the DataFrame is displayed.
frame2.index.name = "year"
frame2.columns.name = "state"
frame2

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [76]:
# Return the data contained in the DataFrame as a two-dimensional NumPy array.
frame2.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [77]:
# With columns of different types, the result is an object-dtype array.
# NOTE(review): the recorded output has a fifth True/False column matching
# "eastern", although a cell above runs `del frame["eastern"]`. The execution
# counts (66 before 65) point to out-of-order runs; a fresh top-to-bottom run
# will presumably show only four columns. Re-run and refresh the output.
frame.to_numpy()

array([['Ohio', 2000, 1.5, nan, True],
       ['Ohio', 2001, 1.7, nan, True],
       ['Ohio', 2002, 3.6, nan, True],
       ['Nevada', 2001, 2.4, nan, False],
       ['Nevada', 2002, 2.9, nan, False],
       ['Nevada', 2003, 3.2, nan, False]], dtype=object)

### Index Objects

In [81]:
# Index objects are immutable and thus can't be modified by the user.
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
index = obj.index
index, index[1:]

(Index(['a', 'b', 'c'], dtype='object'), Index(['b', 'c'], dtype='object'))

In [83]:
# An Index can be built directly from an array of labels.
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int64')

In [84]:
# Reuse the existing Index object as the index of a new Series.
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [85]:
# The Series holds the very Index object it was given (identity, not just equality).
obj2.index is labels

True

In [86]:
# Show frame2 again; its index and columns now carry the names "year" and "state".
frame2

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [89]:
# `in` on a DataFrame tests the column labels; `in` on its index tests the
# row labels.
frame2.columns, "Ohio" in frame2, 2003 in frame2.index

(Index(['Ohio', 'Nevada'], dtype='object', name='state'), True, False)

In [90]:
# Unlike Python sets, a pandas Index can contain duplicate labels.
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [93]:
# A second, longer Index used for the set-operation examples below.
labels1 = pd.Index(np.arange(5))
labels1

Index([0, 1, 2, 3, 4], dtype='int64')

In [95]:
# Concatenate with additional Index objects, producing a new Index
# (duplicate labels are kept).
labels1.append(labels)

Index([0, 1, 2, 3, 4, 0, 1, 2], dtype='int64')

In [97]:
# Compute the set difference as an Index: labels in labels1 that are not in labels.
labels1.difference(labels)

Index([3, 4], dtype='int64')

In [98]:
# Compute the set intersection: labels present in both Index objects.
labels1.intersection(labels)

Index([0, 1, 2], dtype='int64')

In [99]:
# Compute the set union: every distinct label from either Index.
labels1.union(labels)

Index([0, 1, 2, 3, 4], dtype='int64')

## Essential Functionality