In [1]:
import pandas as pd
import numpy  as np

## Pandas data structure "Series"

In [5]:
# Create Pandas datastructure "Series"
obj = pd.Series([4,5,-7,8])
obj

0    4
1    5
2   -7
3    8
dtype: int64

In [8]:
obj.values

array([ 4,  5, -7,  8], dtype=int64)

In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [13]:
# Create pandas datasturcuter "Series" with user defined indes
obj2 = pd.Series([4,5,-6,7], index = ['a','b','c','d'])

In [14]:
# Print values of the series
obj2

a    4
b    5
c   -6
d    7
dtype: int64

In [15]:
# Print indexes of teh series
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
s =  pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

In [8]:
print(s)

a    0.975727
b    0.286204
c   -2.397638
d   -0.830941
e   -1.060771
dtype: float64


In [9]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [16]:
# Using the index to select a value or a set of values

# Selecting value with index 'b'

obj2['b']

5

In [21]:
#Slecting values with index 'b', 'c','d'
obj2[['b','c','d']]

b    5
c   -6
d    7
dtype: int64

In [22]:
# Print values of obj2 which are greater than obj2
obj2[obj2 > 0]


a    4
b    5
d    7
dtype: int64

In [23]:
# Perform Scalar multipliation 
print(obj2 * 3)

a    12
b    15
c   -18
d    21
dtype: int64


In [24]:
# Using mathematical functions on obj2

print(np.exp(obj2))

a      54.598150
b     148.413159
c       0.002479
d    1096.633158
dtype: float64


In [25]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping
# of index values to data values. It can be used in many contexts where you might
# use a dict:

'b' in obj

True

In [26]:
'j' in obj

False

In [30]:
# If you have data contained in a Python dict, you can create a Series from it by passing the dict:

#defining dictionary sdata

sdata = {'Ohio':35000,'Texas':71000, 'Orgeon':16000,'Utah':5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Orgeon    16000
Utah       5000
dtype: int64

In [33]:
# When you are only passing a dict, the index in the resulting Series will have the dict’s
#keys in sorted order. You can override this by passing the dict keys in the order you
#want them to appear in the resulting Series:

states =['California', 'Ohio','Texas','Utah']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Texas         71000.0
Utah           5000.0
dtype: float64

In [38]:
#The isnull and notnull functions in pandas should be used to detect missing data:

pd.isnull(obj4)


California     True
Ohio          False
Texas         False
Utah          False
dtype: bool

In [41]:
b = pd.notnull(obj4)
b

California    False
Ohio           True
Texas          True
Utah           True
dtype: bool

In [None]:
# Data alignment features will be addressed in more detail later. If you have experience with databases, you can think about
# this as being similar to a join operation.

In [42]:
# Both the Series object itself and its index have a name attribute, which integrates with other key areas of pandas
# functionality:

obj4.name = "State"
obj4

California        NaN
Ohio          35000.0
Texas         71000.0
Utah           5000.0
Name: State, dtype: float64

In [44]:
obj4.index.name = "Population"
obj4

Population
California        NaN
Ohio          35000.0
Texas         71000.0
Utah           5000.0
Name: State, dtype: float64

In [11]:
pd.Series(np.random.randn(5))

0    1.039366
1   -0.165336
2    0.501285
3   -0.000645
4    0.654856
dtype: float64

In [3]:
d = {"a":1,"d":5,"b":2,"c":3,"e":4}
print(d)

{'a': 1, 'd': 5, 'b': 2, 'c': 3, 'e': 4}


In [4]:
dp = pd.Series(d)
print(d)

{'a': 1, 'd': 5, 'b': 2, 'c': 3, 'e': 4}


In [5]:
dp1=pd.Series(d,index = ["a","e","f","h"])
print(dp1)

a    1.0
e    4.0
f    NaN
h    NaN
dtype: float64


In [19]:
dp2 = dp1+dp
print(dp2)

a    NaN
b    NaN
c    NaN
d    NaN
e    8.0
f    NaN
g    NaN
h    NaN
dtype: float64


# Dataframe

A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row and column index; it can be thought of
as a dict of Series all sharing the same index. Under the hood, the data is stored as one
or more two-dimensional blocks rather than a list, dict, or some other collection of
one-dimensional arrays. The exact details of DataFrame’s internals are outside the
scope of this book.



In [48]:
data = { "State": ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
         "Year" : [2000, 2001, 2002, 2001, 2002, 2003],
         "pop"  : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
       }

In [53]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,State,Year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [55]:
# Displaying first 5 elements of datagrame

frame.head()

Unnamed: 0,State,Year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [61]:
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:

frame = pd.DataFrame(data , columns= ['Year','State','pop'])

In [67]:
# If you pass a column that isn’t contained in the dict, it will appear with missing values in the result:
frame = pd.DataFrame(data , columns= ['Year','State','pop','debt'])
frame

Unnamed: 0,Year,State,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [70]:
# Adding index to dataframe

frame2 = pd.DataFrame(data, columns = ['Year','State','pop','debt'], index = ['one', 'two', 'three', 'four','five','six'])
frame2

Unnamed: 0,Year,State,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [71]:
frame2.columns

Index(['Year', 'State', 'pop', 'debt'], dtype='object')

In [74]:
#   Retrieving a column as a series by dict-like notation or by attribute:
frame2['State']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: State, dtype: object

In [75]:
#   Retrieving a column as a series by dict-like notation or by attribute:
frame2.State

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: State, dtype: object

In [77]:
frame2.Year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: Year, dtype: int64

In [81]:
# Rows can also be retrieved by position or name with the special loc attribute (much more on this later):
frame2.loc['two']

Year     2001
State    Ohio
pop       1.7
debt      NaN
Name: two, dtype: object

In [88]:
#Columns can be modified by assignment. Example 1
frame2['debt'] = 16.5
frame2

Unnamed: 0,Year,State,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [90]:
#Columns can be modified by assignment.Example 2
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,Year,State,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [94]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes

# When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame

val = pd.Series([-12,1.5,-1.8], index = ['one','three', 'four'])
val               

one     -12.0
three     1.5
four     -1.8
dtype: float64

In [96]:
frame2['debt'] = val
frame2

Unnamed: 0,Year,State,pop,debt
one,2000,Ohio,1.5,-12.0
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,1.5
four,2001,Nevada,2.4,-1.8
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [105]:
# additing coloumn eastern such that if coloum state == ohio then true else false

frame2['eastern'] = frame2.State =='Ohio'
frame2

Unnamed: 0,Year,State,pop,debt,eastern
one,2000,Ohio,1.5,-12.0,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,1.5,True
four,2001,Nevada,2.4,-1.8,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False


In [106]:
# Deleting column 
del frame2 ['eastern']
frame2.columns

Index(['Year', 'State', 'pop', 'debt'], dtype='object')

#   Unlike Python sets, a pandas Index can contain duplicate labels:
# Selections with duplicate labels will select all occurrences of that label.
# Each Index has a number of methods and properties for set logic, which answer other common questions about the data it contains. Some useful ones are summarized in See page 136 of book PY Data Analysis
Table 5-2.
Table 5-2. Some Index methods and properties
Method Description
append Concatenate with additional Index objects, producing a new Index
difference Compute set difference as an Index
intersection Compute set intersection
union Compute set union
isin Compute boolean array indicating whether each value is contained in the passed collection
delete Compute new Index with element at index i deleted
drop Compute new Index by deleting passed values
insert Compute new Index by inserting element at index i
is_monotonic Returns True if each element is greater than or equal to the previous element
is_unique Returns True if the Index has no duplicate values
unique Compute the array of unique values in the Index

# Handling Missing Data

In [3]:
string_data = pd.Series(['aardvrak','artichoke',np.nan,'avacado'])
string_data

0     aardvrak
1    artichoke
2          NaN
3      avacado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
string_data[0] = None


In [9]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [10]:
# Filtering out missing data
from numpy import nan as NA


In [11]:
data = pd.Series([1,NA,3.5,NA,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [14]:
#This is equivalent to:
data = data[data.notnull()]
data

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
# With DataFrame objects, things are a bit more complex. You may want to drop rows
# or columns that are all NA or only those containing any NAs. dropna by default drops
# any row containing a missing value:

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
cleanedup = data.dropna()
cleanedup

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Filling in Missing Data
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.606566,-1.787376,0.079762
1,-0.609878,0.680511,1.384483
2,-1.41487,-0.009431,-0.831754
3,-0.382141,0.089987,1.913981
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


In [20]:
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,-0.606566,,
1,-0.609878,,
2,-1.41487,,-0.831754
3,-0.382141,,1.913981
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


In [22]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.41487,,-0.831754
3,-0.382141,,1.913981
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


In [27]:
#Filling In Missing Data
fill = df.fillna(0)
fill

Unnamed: 0,0,1,2
0,-0.606566,0.0,0.0
1,-0.609878,0.0,0.0
2,-1.41487,0.0,-0.831754
3,-0.382141,0.0,1.913981
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


In [29]:
#you can use a different fill value for each column:
fill = df.fillna({1: 0.7, 2: 1})
fill

Unnamed: 0,0,1,2
0,-0.606566,0.7,1.0
1,-0.609878,0.7,1.0
2,-1.41487,0.7,-0.831754
3,-0.382141,0.7,1.913981
4,0.426686,0.392578,-0.262397
5,0.204203,1.511146,0.312795
6,0.93293,-0.308803,-0.815864


# Data Transformations

In [40]:
#Removing Duplicates

data = data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})
data    

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [41]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [42]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
# Transforming Data Using a Function or Mapping

data = pd.DataFrame( { 'food': [  'bacon', 'pulled pork', 'bacon',
                                'Pastrami', 'corned beef', 'Bacon',
                                'pastrami', 'honey ham', 'nova lox'] ,
                   
                    'ounces' : [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [44]:
meat_to_animal = {'bacon': 'pig',
                  'pulled pork': 'pig',
                  'pastrami': 'cow',
                  'corned beef': 'cow',
                  'honey ham': 'pig',
                  'nova lox': 'salmon'                  
                 }

In [47]:
lowercased = data['food'].str.lower()
lowercased


0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [51]:
data['animal'] = lowercased.map(meat_to_animal)
data


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [55]:
transform = lambda x: x[:4].upper()
#transform

In [53]:
data.index.map(transform)

TypeError: 'int' object is not subscriptable

In [56]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.029223,-0.007083,-0.036486,-0.032272
std,1.013496,0.986304,1.037513,1.002302
min,-3.538904,-2.577276,-3.702937,-2.96249
25%,-0.635329,-0.678677,-0.748491,-0.70309
50%,-0.001293,-0.021755,-0.03461,-0.006778
75%,0.705085,0.678201,0.737629,0.608998
max,3.095668,3.169746,2.810796,4.497859


In [59]:
col = data[2]
col

0      1.687298
1      0.155310
2     -0.454548
3      1.820148
4     -1.235255
         ...   
995   -1.339003
996    1.131039
997    1.130853
998    0.297483
999    0.617552
Name: 2, Length: 1000, dtype: float64

In [60]:
col[np.abs(col) > 3]

49    -3.702937
567   -3.431025
597   -3.048345
Name: 2, dtype: float64