In [1]:
#Pandas : Library that contains high level datastructures and manipulation tools designed to make data analysis fast and easy in Python

#Pandas is built on top of NumPy and makes it easy to use NumPy-centric applications

In [2]:
#Datastructures in Pandas

In [3]:
#Series: One dimensional array like object containing an array of data (of any NumPy data type) and 
# + an associated array of data labels called its index

#DataFrame: Represents a tabular spreadsheet like data structure containing an ordered collection of columns each of which can be a different value type(numeric, string, boolean, etc)

In [4]:
#Series

In [5]:
import pandas as pd

In [6]:
from pandas import Series

In [7]:
obj = Series([4,5,-8,3])

In [8]:
obj   #Series displays the index on the left and values on the right

0    4
1    5
2   -8
3    3
dtype: int64

In [9]:
#Series with an index identifying each data point

In [14]:
# Build a Series whose four values carry explicit string labels.
obj2 = Series([4, 7, -5, 3], index=list('dbac'))
# Emit the index, a divider line, and the full Series, one per line.
print(obj2.index, "----------------------", obj2, sep="\n")

Index(['d', 'b', 'a', 'c'], dtype='object')
----------------------
d    4
b    7
a   -5
c    3
dtype: int64


In [12]:
#Compared with a regular NumPy array you can use values in the index when selecting single values or a set of values

In [13]:
obj2['a']

-5

In [14]:
obj2['d']

4

In [19]:
#NumPy array operations such as filtering with a boolean array, scalar multiplication or applying math functions will preserve the index-value link

import numpy as np

In [20]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [21]:
obj2*2

d     8
b    14
a   -10
c     6
dtype: int64

In [22]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [23]:
#Another way to think about a Series is as a fixed-length, ordered dict, since it is a mapping of index values to data values.
#It can be substituted into many functions that expect a dict

In [24]:
'b' in obj2

True

In [27]:
# Mapping of state name -> value; a Series built from a dict uses the keys as its index.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=37889)
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah      37889
dtype: int64

In [28]:
#While reading from a dictionary we can also use index as list to get items in specific order

In [29]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)

In [31]:
obj4    #If keys is not in the target dict, NaN will be returned


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [32]:
#To check for null values 

In [33]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [34]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [36]:
#Can be called as series method as well

obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [37]:
#A critical Series feature for many applications is that it automatically aligns differently-indexed data in arithmetic operations

In [38]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [39]:
#Both the series object itself and its index have a name attribute, which integrates with other key areas of pandas functionality

In [40]:
obj4.name = 'population'

In [41]:
obj4.index.name = 'state'

In [42]:
#DataFrame

In [43]:
#Tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type
#(numeric, string, boolean)

#Numerous ways to construct a dataframe 

## dict of equal length lists or NumPy arrays

In [17]:
# Column-oriented input for a DataFrame: each key is a column name and each
# value is a list of the same length (five entries per column).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

In [18]:
frame = pd.DataFrame(data)

In [19]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [20]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [21]:
#As with Series, if you pass a column that isn't contained in data, it will appear as NA values

In [22]:
df = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])

In [23]:
df

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [24]:
#Retrieving a column

df['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [25]:
#Retrieving a row

df.loc[3]

year       2001
state    Nevada
pop         2.4
debt        NaN
Name: 3, dtype: object

In [29]:
import numpy as np
#Columns can be modified by assignment  #adding values to a column

df['debt'] = np.arange(5.)

In [30]:
df

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0


In [31]:
#When assigning lists or arrays to a column, the value's length must match the length of the DataFrame
#If you assign a Series, it will instead be conformed exactly to the DataFrame's index, inserting missing values in any holes


In [32]:
# The labels 'two', 'four' and 'five' do not appear in df's index (the frame displayed
# earlier is indexed 0-4), so index alignment has nothing to match against.
# NOTE(review): presumably this leaves 'debt' entirely NaN — confirm by displaying df.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df['debt'] = val

In [33]:
val = Series([-1.2, -1.5, -1.7], index=[3,2,1])
df['debt'] = val

In [34]:
df

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,-1.7
2,2002,Ohio,3.6,-1.5
3,2001,Nevada,2.4,-1.2
4,2002,Nevada,2.9,


In [63]:
#Assigning a column that doesnt exist will create a new column.


df['eastern'] = df.state == 'Ohio'

In [64]:
df

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False


In [65]:
del df['eastern']

In [66]:
df

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [67]:
#Nested dict of dicts format

In [69]:
pop = {'Nevada': {2000: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2002: 3.6}}

In [71]:
df = pd.DataFrame(pop)

In [73]:
df

Unnamed: 0,Nevada,Ohio
2000,2.4,1.5
2002,2.9,3.6


In [74]:
df.T

Unnamed: 0,2000,2002
Nevada,2.4,2.9
Ohio,1.5,3.6


In [76]:
#Index Objects: Responsible for holding the axis labels and other metadata(like axis name or names)

In [77]:
obj = Series(range(3), index=['a','b','c'])
index = obj.index

In [78]:
obj

a    0
b    1
c    2
dtype: int64

In [79]:
index[1:]

Index(['b', 'c'], dtype='object')

In [80]:
#Index objects are immutable: they cannot be modified in place (e.g. index[1] = 'd' raises TypeError)

In [81]:
index = pd.Index(np.arange(3))

In [82]:
obj2 = Series([1.5, -2.5, 0], index=index)

In [83]:
obj2.index = index

In [84]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [85]:
#Reindexing

In [86]:
# Source Series, labelled in the order d, b, a, c.
obj = Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))

# reindex returns a new Series arranged by the requested labels; 'e' is not
# present in obj, so it is introduced as a missing value (NaN).
obj2 = obj.reindex(list('abcde'))



In [87]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [88]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [89]:
##With DataFrame, reindex can alter either the row index, columns, or both.
## When passed just a sequence, the rows are reindexed in the result

In [90]:
# 3x3 frame holding 0..8: rows labelled a/c/d (no 'b'), one column per state.
df = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [91]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [93]:
df2 = df.reindex(['a', 'b', 'c', 'd'])
df2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [96]:
## The columns can be reindexed using columns keyword

states = ['Texas', 'Utah', 'California']
df.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [98]:
# Reindex rows and columns in one call.
# `.loc` is an indexer, not a method: calling it as df.loc(rows, cols) raises TypeError,
# and `.loc[rows, cols]` with labels missing from the frame ('b', 'Utah') raises KeyError
# in current pandas. reindex instead inserts the missing labels as NaN rows/columns.
df.reindex(index=['a', 'b', 'c', 'd'], columns=states)

In [100]:
#Dropping entries from an axis

In [101]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [102]:
new_obj = obj.drop('c')

In [103]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [104]:
#Within Dataframes: Index values can be deleted from either axis

In [108]:
df = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Ohio', 'Colorado', 'Utah', 'New York'])

In [109]:
df

Unnamed: 0,0,1,2,3
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [110]:
df.drop(['Colorado', 'Ohio'])

Unnamed: 0,0,1,2,3
Utah,8,9,10,11
New York,12,13,14,15


In [111]:
# `data` is the plain dict built for the first DataFrame example and a dict has no
# .drop(), so this cell only ran thanks to leftover kernel state. Build the 4x4
# frame explicitly so the cell works on a fresh kernel.
frame_4x4 = pd.DataFrame(np.arange(16).reshape((4, 4)))
# axis=1 drops the column labelled 2 (not a row); drop returns a new frame.
frame_4x4.drop(2, axis=1)

Unnamed: 0,0,1,3
0,0,1,3
1,4,5,7
2,8,9,11
3,12,13,15


In [112]:
#Arithmetic and Data Alignment

One of the most important pandas features is the behavior of arithmetic between objects with
different indexes. When adding together objects, if any index pairs are not the same, the
respective index in the result will be the union of the index pairs.

In [113]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [114]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [121]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns= list('bde'))
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns= list('beg'))

In [122]:
df1 + df2

Unnamed: 0,b,d,e,g
0,0.0,,3.0,
1,6.0,,9.0,
2,12.0,,15.0,
3,,,,


In [126]:
#Function application and mapping: NumPy functions (element-wise array methods) work fine with pandas objects

In [127]:
# Draw the 4x3 standard-normal sample from a seeded generator so that
# Restart & Run All produces the same numbers on every run.
df = pd.DataFrame(np.random.default_rng(0).standard_normal((4, 3)), columns=list('bde'))

In [129]:
np.cumsum(df)

Unnamed: 0,b,d,e
0,-0.687356,-1.193171,1.680895
1,-1.667322,-2.284095,2.943076
2,-2.237643,-2.4622,3.535241
3,-2.947065,-1.556733,1.80226


In [130]:
np.abs(df)

Unnamed: 0,b,d,e
0,0.687356,1.193171,1.680895
1,0.979966,1.090924,1.262181
2,0.570321,0.178105,0.592165
3,0.709422,0.905467,1.732982


In [131]:
#Sorting and Ranking: Sorting a dataset by some criterion is another important built-in operation. To sort lexicographically by row or column index
#Use sort_index method

In [132]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])

In [133]:
#Sort by index
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [135]:
# Sort by value: the Series method is sort_values (plural); `sort_value` does not
# exist and raises AttributeError.
obj.sort_values()

In [137]:
# Seeded generator keeps this 4x3 standard-normal sample identical across re-runs;
# the columns are deliberately out of alphabetical order for the sorting examples.
df = pd.DataFrame(np.random.default_rng(1).standard_normal((4, 3)), columns=list('xbe'))

In [138]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,x,e,b
0,1.884188,-0.624127,-0.043683
1,-0.308851,0.360142,0.200448
2,0.036796,0.359965,-1.432557
3,1.670994,-1.413439,0.505566


In [139]:
df.sort_values(by=['x'])

Unnamed: 0,x,b,e
1,-0.308851,0.200448,0.360142
2,0.036796,-1.432557,0.359965
3,1.670994,0.505566,-1.413439
0,1.884188,-0.043683,-0.624127


In [140]:
#Data Loading, Storage, and File Formats

In [144]:
df = pd.read_csv('titanic.csv', sep=',')

In [145]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [146]:
#Read w/o header
pd.read_csv('titanic.csv', header= None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
...,...,...,...,...,...,...,...,...,...,...,...,...
887,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
888,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
889,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C


In [147]:
#Add headers

# NOTE(review): only 11 names are supplied and no `header` argument is given. Judging by
# the frame displayed after this cell, the file's own header row is then read as a data
# row and the first CSV column ends up as the index. If the goal is to replace the
# existing header, header=0 with one name per column is probably what is wanted — confirm.
df = pd.read_csv('titanic.csv', names=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k' ])

In [148]:
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C


In [149]:
#Grouping

In [150]:
# The assignment was left unfinished (a bare `parsed =` is a SyntaxError).
# NOTE(review): the intended source is not shown; the Titanic CSV loaded earlier in
# this notebook is assumed so that the later to_csv calls have a frame to write — confirm.
parsed = pd.read_csv('titanic.csv')

In [None]:
import sys  # sys.stdout is used below; sys is not imported in any of the cells shown

#Writing a csv file
parsed.to_csv('output.csv')

#Other delimiters: stream the same frame to stdout with '|' as the separator
parsed.to_csv(sys.stdout, sep='|')

In [2]:
import pandas as pd

In [3]:
#Iterating through DataFrame

df = {'Name': ['Ankit','Amit','Aishwarya','Priyanka'],'Age': [21, 19, 20, 18],'Stream': ['Math','Commerce','Arts','Biology'],'Percentage': [88, 92, 95, 70]}

In [4]:
#Convert the dictionary into DataFrame
df = pd.DataFrame(df, columns= ['Name', 'Age', 'Stream', 'Percentage'])

for index, row in df.iterrows():
    print(index, row['Name'])

0 Ankit
1 Amit
2 Aishwarya
3 Priyanka


In [6]:
df.head()

Unnamed: 0,Name,Age,Stream,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
2,Aishwarya,20,Arts,95
3,Priyanka,18,Biology,70
