## Pandas : 
##### It contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy in Python. It is designed for working with tabular or heterogeneous data. 
##### NB : Most methods return a new object rather than modifying the original. To keep a change, assign the result back to the original variable or pass the `inplace=True` keyword

In [60]:
import pandas as pd
import numpy as np

In [61]:
# Series

obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [62]:
# getting the values of a pandas series
obj.values

array([ 4,  7, -5,  3])

In [63]:
# getting the index of the pandas series object
obj.index

RangeIndex(start=0, stop=4, step=1)

In [64]:
# creating a series obj and providing its index values
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [65]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [66]:
obj2.values

array([ 4,  7, -5,  3])

In [67]:
obj2['a']

-5

In [68]:
obj2['d'] 

4

In [69]:
obj2['c']

3

In [70]:
obj2[['c', 'd', 'a']]

c    3
d    4
a   -5
dtype: int64

In [71]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [72]:
# NumPy-style operations (boolean filtering, scalar multiplication, math functions) preserve the index-value link
obj2[obj2 > 2]

d    4
b    7
c    3
dtype: int64

In [73]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [74]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [75]:
# `in` tests membership against the index labels
'b' in obj2

True

In [76]:
'e' in obj2

False

#### A Python dictionary can be used to create a pandas Series

In [77]:
sdata = {'Ohio' : 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [78]:
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [79]:
obj3.values

array([35000, 71000, 16000,  5000])

In [80]:
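# Passing an index selects and orders the dict keys: labels not in the dict (here 'Californai') become NaN and unlisted keys ('Utah') are dropped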
states = ['Californai', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

Californai        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [81]:
# Using isnull() and notnull() to detect missing values
obj4.isnull()

Californai     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [82]:
pd.isnull(obj4)

Californai     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [83]:
pd.notnull(obj4)

Californai    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [84]:
obj4.notnull()

Californai    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [85]:
obj3


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [86]:
obj4

Californai        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [87]:
# arithmetic aligns the two Series on their index labels; a label missing from either side (or holding NaN) gives NaN
obj3 + obj4

Californai         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [88]:
obj4.name = 'population'
obj4.index.name = 'state'

In [89]:
obj4

state
Californai        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [90]:
# the index of a Series can be replaced in place by assigning a new list of labels
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [91]:
data = {
    'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year' : [2000, 2001, 2002, 2001, 2002, 2003],
    'pop' : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

In [92]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [93]:
frame2 = pd.DataFrame([np.arange(1, 7)], index=['one', 'two', 'three', 'four'])
frame2

Unnamed: 0,0,1,2,3,4,5
one,1,2,3,4,5,6
two,1,2,3,4,5,6
three,1,2,3,4,5,6
four,1,2,3,4,5,6


In [94]:
# using the head() method to select the first 5 rows
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [95]:
pd.DataFrame(data, columns=['pop', 'year', 'state'])

Unnamed: 0,pop,year,state
0,1.5,2000,Ohio
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [96]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                     index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [97]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [98]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [99]:
# attribute access does not reach the 'pop' column because DataFrame already has a pop method (see the bound method printed below); use frame2['pop'] for the column
frame2.pop

<bound method NDFrame.pop of        year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN>

In [100]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [101]:
# retrieving a row by its index label with loc
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [102]:
frame2['debt'] = 16.5

In [103]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [104]:
frame2.debt = np.arange(6)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [105]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [106]:
# assigning a Series aligns it on the frame's index labels; rows whose label is absent from val become NaN
frame2.debt = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [107]:
# adding a new boolean column by assignment (it is removed with the `del` statement in the next cell)
frame2['estern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,estern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [108]:
del frame2['estern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [109]:
pop = {
    'navada' : {2001 : 2.4, 2002:2.9},
    'Ohio' : {2000 : 1.5, 2001 : 1.7, 2002 : 3.6}
      }

frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,navada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [110]:
# use transpose() or the .T attribute to swap rows and columns
frame3.T

Unnamed: 0,2001,2002,2000
navada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [111]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [112]:
frame3.index

Int64Index([2001, 2002, 2000], dtype='int64')

###  Index Objects

In [113]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [114]:
index = obj.index

In [115]:
index

Index(['a', 'b', 'c'], dtype='object')

In [116]:
index[1:]

Index(['b', 'c'], dtype='object')

In [117]:
# Index objects are immutable and cannot be changed in place

In [118]:
index[1] = 'd' # TypeError

TypeError: Index does not support mutable operations

# Reindexing

In [134]:
obj = pd.Series([5.3, 7.2, -10.9, 8.5], index=['d', 'b', 'a', 'c'])
obj

d     5.3
b     7.2
a   -10.9
c     8.5
dtype: float64

In [135]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -10.9
b     7.2
c     8.5
d     5.3
e     NaN
dtype: float64

In [136]:
obj3 = pd.Series(['Yellow', 'Purplr', 'Blue'], index=[0,2,4])
obj3

0    Yellow
2    Purplr
4      Blue
dtype: object

In [137]:
# reindexing with method='ffill' fills each new label from the previous existing value
obj3.reindex(range(6), method='ffill')

0    Yellow
1    Yellow
2    Purplr
3    Purplr
4      Blue
5      Blue
dtype: object

In [138]:
frame = pd.DataFrame(np.arange(9).reshape(3,3), 
                    index = ['a', 'c', 'd'], 
                    columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [139]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [140]:
state = ['Texas', 'Utah','California']

In [141]:
frame.reindex(columns=states)
# frame

Unnamed: 0,Californai,Ohio,Oregon,Texas
a,,0,,1
c,,3,,4
d,,6,,7


In [156]:
# the drop method can be used to remove entries from a pandas Series

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [157]:
# applying the drop method to remove entry 'c'; drop returns a new object
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [158]:
obj.drop(['d','c'])


a    0.0
b    1.0
e    4.0
dtype: float64

In [159]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [162]:
# applying the drop method to a pd DataFrame object
data = pd.DataFrame(np.arange(16).reshape(4,4), 
                   index=['Ohio', 'Colorado', 'Utah', 'New york'],
                   columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New york,12,13,14,15


In [163]:
data.drop(['Utah', 'New york'])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [164]:
# drop entries by column name (axis=1)
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New york,12,14,15


In [165]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New york,12,14


In [166]:
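# using the inplace=True keyword argument to modify the object itself instead of returning a new one
# NOTE: the KeyError below suggests 'c' had already been removed from obj, presumably by an earlier run of this cell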
obj.drop('c', inplace=True)


KeyError: "['c'] not found in axis"

In [170]:
data.drop(['two'], axis=1,  inplace=True)
data

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New york,12,14,15


In [171]:
data.drop('Ohio', inplace=True)
data

Unnamed: 0,one,three,four
Colorado,4,6,7
Utah,8,10,11
New york,12,14,15
