In [None]:
#Getting Started with pandas  pp123 McKinney 
#from NumPy, the biggest difference is that pandas is designed for working with tabular or heterogeneous data. 
#NumPy, by contrast, is best suited for working with homogeneous numerical array data. 

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
#5.1 Introduction to pandas Data Structures: Series and DataFrame
#A Series is a one-dimensional array-like object containing a sequence of values 
#(of similar types to NumPy types) and an associated array of data labels, called its index. 
#The simplest Series is formed from only an array of data:

In [4]:
obj = pd.Series([-43,3,5,68])
obj

0   -43
1     3
2     5
3    68
dtype: int64

In [5]:
print(obj.values)

[-43   3   5  68]


In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
type(obj.values)

numpy.ndarray

In [8]:
type(obj.index)

pandas.core.indexes.range.RangeIndex

In [6]:
print(obj.index)

RangeIndex(start=0, stop=4, step=1)


In [11]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [12]:
#you can use labels in the index when selecting single values or a set of values:
obj2['a']  

-5

In [14]:
obj2['d'] = 6

In [15]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [16]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [14]:
obj2>0

d     True
b     True
a    False
c     True
dtype: bool

In [15]:
obj2[obj2>2] #Note

d    6
b    7
c    3
dtype: int64

In [16]:
obj2*2

d    12
b    14
a   -10
c     6
dtype: int64

In [19]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [20]:
#Another way to think about a Series is as a fixed-length, ordered dict, as it is a 
#mapping of index values to data values. 
'b' in obj2 

True

In [22]:
#From Python dict, you can create a Series from it by passing the dict: NOTE
sdata = {'Texas': 35000, 'Albama': 71000, 'Oregon': 16000, 'Utah': 5000}

In [23]:
sdata

{'Texas': 35000, 'Albama': 71000, 'Oregon': 16000, 'Utah': 5000}

In [19]:
type(sdata)

dict

In [25]:
obj3 = pd.Series(sdata)
obj3

Texas     35000
Albama    71000
Oregon    16000
Utah       5000
dtype: int64

In [20]:
type(obj3)

pandas.core.series.Series

In [26]:
#When you are only passing a dict, the index in the resulting Series will have 
#the dict’s keys in sorted order. You can override this
states = ['California', 'Delhi', 'Oregon', 'Texas', 'Utah']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Delhi             NaN
Oregon        16000.0
Texas         35000.0
Utah           5000.0
dtype: float64

In [27]:
#isnull and notnull functions in pandas should be used to detect missing data:
pd.isnull(obj4) 

California     True
Delhi          True
Oregon        False
Texas         False
Utah          False
dtype: bool

In [28]:
 pd.notnull(obj4)

California    False
Delhi         False
Oregon         True
Texas          True
Utah           True
dtype: bool

In [29]:
#Series also has these as instance methods:
obj4.isnull()

California     True
Delhi          True
Oregon        False
Texas         False
Utah          False
dtype: bool

In [27]:
#A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:
obj3 

Texas     35000
Albama    71000
Oregon    16000
Utah       5000
dtype: int64

In [28]:
obj4

California        NaN
Delhi             NaN
Oregon        16000.0
Texas         35000.0
Utah           5000.0
dtype: float64

In [32]:
obj3+obj4
#As with databases,is being similar to a join operation

California         NaN
Delhi              NaN
Ohio               NaN
Oregon         32000.0
Texas         142000.0
Utah           10000.0
dtype: float64

In [6]:
obj4

California        NaN
Delhi             NaN
Oregon        16000.0
Texas         71000.0
Utah           5000.0
dtype: float64

In [7]:
#give a name to index and object
obj4.name = 'population'
obj4.index.name = 'state'

In [8]:
obj4

state
California        NaN
Delhi             NaN
Oregon        16000.0
Texas         71000.0
Utah           5000.0
Name: population, dtype: float64

In [None]:
""" pp128- A DataFrame represents a rectangular table of data and contains 
--an ordered collection of columns, each of which can be a different value type 
(numeric, string, boolean, etc.). 
--The DataFrame has both a row and column index; it can be thought of as a dict 
of Series all sharing the same index. 
--Under the hood, the data is stored as one or more two-dimensional blocks rather 
than a list, dict, or some other collection of one-dimensional arrays. 
--While a DataFrame is physically two-dimensional, you can use it to represent 
higher dimensional data."""

In [3]:
#There are many ways to construct a DataFrame, though one of the most common is 
#from a dict of equal-length lists or NumPy arrays:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]} 
frame = pd.DataFrame(data)
#The DataFrame have index assigned auto as with Series, and columns sorted:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [10]:
#For large DataFrames, the head method selects only the first five rows:
frame.head() 

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [11]:
#If sequence of columns specified, the DataFrame’s columns arranged in that order:
pd.DataFrame(data, columns=['year', 'state', 'pop']) 

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [4]:
#If you pass a column that isn’t contained in the dict, it will appear with missing values in the result:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],index=['one', 'two', 'three', 'four','five', 'six'])

In [13]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [14]:
#A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [15]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [None]:
#Rows can also be retrieved by position or name with the special loc attribute

In [16]:
 frame2.loc['three'] 

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [17]:
 frame2['debt'] = 16.5
 frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [18]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [19]:
#When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. 
#If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2 

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [None]:
#pp131 creating a new column

In [6]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False


In [7]:
#deleting a column
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [9]:
#Another method for creating a DataFrame
#common form of data is a nested dict of dicts:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}} 
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [10]:
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [11]:
#transpose the DataFrame (swap rows and columns) with similar syntax to a NumPy array:
frame3.T 

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [13]:
#Index Objects- pandas’s Index objects are responsible for holding 
#the axis labels and other metadata (like the axis name or names). 
#Any array or other sequence of labels when constructing a Series or DataFrame 
#is internally converted to an Index:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [15]:
index[1:]

Index(['b', 'c'], dtype='object')

In [16]:
index[1] = 'd'  

TypeError: Index does not support mutable operations

In [13]:
#Chapter5-Pandas-5.2 Series & DataFrames-Essential Functionality - McKinney Part-2 (pp136)
# First importing a CSV file
#walk you through the fundamental mechanics of interacting with the data contained in a Series or DataFrame.
import os

In [None]:
#Reindexing
"""An important method on pandas objects is reindex, to create a new object with the data conformed to a new index"""

In [46]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [47]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [48]:
""" For ordered data like time series, may be desirable to do some interpolation or filling of values when reindexing. 
The method option allows us to do this, using a method such as ffill, which forward-fills the values:"""
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [49]:
 obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [50]:
#With DataFrame, reindex can alter either the (row) index, columns, or both. When passed only a sequence, 
#it reindexes the rows in the result:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [52]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [53]:
#The columns can be reindexed with the columns keyword:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [54]:
# Explore in detail later, you can reindex more succinctly by label-indexing with loc, and many users prefer to use this:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [None]:
#Dropping Entries from an Axis

In [5]:
import pandas as pd
import numpy as np
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [64]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('a')
new_obj

b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [6]:
new_obj = obj.drop(['c','d'])
new_obj

a    0.0
b    1.0
e    4.0
dtype: float64

In [69]:
""" With DataFrame, index values can be deleted from either axis. To illustrate this, create an example DataFrame:"""
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])
data 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [71]:
new_obj = data.drop(['one','three'], axis = 'columns')
new_obj

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [72]:
data #remains same

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [76]:
data.drop(['one','three'], axis = 'columns', inplace = True)


In [77]:
data # dropped altogether

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [None]:
# pp140  Indexing, Selection, and Filtering
#Series indexing (obj[...]) works analogously to NumPy array indexing, except Series’s index values can be used
# instead of only integers.

In [85]:
obj = pd.Series(np.arange(4.),index=['a','b','c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [87]:
obj[1]

1.0

In [81]:
obj['b']

1.0

In [88]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [89]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [90]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [91]:
obj['b':'c'] = 5

In [92]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [None]:
#Dataframes indexing

In [8]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
.....: index=['Ohio', 'Colorado', 'Utah', 'New York'],
.....: columns=['one', 'two', 'three', 'four'])

In [9]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [95]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [96]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [97]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [10]:
data[2:]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [98]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
#Another use case is in indexing with a boolean DataFrame, such as one produced by a scalar comparison:

In [99]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [102]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
""" Selection with loc and iloc
For DataFrame label-indexing on the rows, introduce the special indexing operators loc and iloc. 
They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation 
using either axis labels (loc) or integers (iloc). """

In [103]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [104]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [105]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [106]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [107]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [108]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [110]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [111]:
data.three > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [113]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [114]:
data.iloc[:, :3][data.three > 5]  # ohio is out

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [None]:
pp144-152 Skipping

In [23]:
#Sorting and Ranking Sorting a dataset by some criterion is another important built-in operation.
#sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index() 

a    1
b    2
c    3
d    0
dtype: int32

In [26]:
#With a DataFrame, you can sort by index on either axis:
frame = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['two','three', 'one'],columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
two,0,1,2,3
three,4,5,6,7
one,8,9,10,11


In [27]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,8,9,10,11
three,4,5,6,7
two,0,1,2,3


In [28]:
 frame.sort_index(axis=1) 

Unnamed: 0,a,b,c,d
two,1,2,3,0
three,5,6,7,4
one,9,10,11,8


In [29]:
frame.sort_index(axis=1, ascending=False) 

Unnamed: 0,d,c,b,a
two,0,3,2,1
three,4,7,6,5
one,8,11,10,9


In [30]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values() 

2   -3
3    2
0    4
1    7
dtype: int64

In [31]:
#Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values() 

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [32]:
#When sorting a DataFrame, you can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the by option of sort_values:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame 

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [33]:
 frame.sort_values(by='b') 

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [34]:
 frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [39]:
#Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for 
#Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj
obj.rank()  #Not clear

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [40]:
#DataFrame can compute ranks over the rows or the columns:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})


In [41]:
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [43]:
 frame.rank(axis='columns')   # not clear ....tie breaks method

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [None]:
#pp156

In [17]:
"""5.3 Summarizing and Computing Descriptive Statistics pandas objects are equipped with a set of common mathematical and 
statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a 
single value (like the sum or mean) from a Series or a Series of values from the rows or columns of a DataFrame. 
Compared with the similar methods found on NumPy arrays, they have built-in handling for missing data. Consider a small DataFrame:"""
import pandas as pd
import numpy as np
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],index=['a', 'b', 'c', 'd'],columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [18]:
 df.sum() 

one    9.25
two   -5.80
dtype: float64

In [19]:
 df.sum(axis ='columns') 

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [8]:
#NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled with the skipna option:
df.mean(axis='columns', skipna=False) 

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [None]:
#pp158...do till 165

In [None]:
#Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:
#Other methods are accumulations:
#Describe - multiple summary statistics in one shot:
In [237]: df.describe() 

In [20]:
df.idxmax()

one    b
two    d
dtype: object

In [22]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [21]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [23]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [25]:
#On non-numeric data, describe produces alternative summary statistics:
obj = pd.Series(['a', 'a', 'b', 'c','b'] * 4)
obj.describe() 

count     20
unique     3
top        b
freq       8
dtype: object

In [26]:
#Correlation and Covariance Some summary statistics, like correlation and covariance, are computed from pairs of arguments. 
#Consider some DataFrames of stock prices and volumes obtained from Yahoo! Finance using the add-on pandas-datareader package. 
#it can be obtained via conda or pip:conda install pandas-datareader.
# Or use the pandas_datareader module to download some data for a few stock tickers:
import pandas_datareader.data as web 
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}


ModuleNotFoundError: No module named 'pandas_datareader'

In [None]:
price = pd.DataFrame({ticker: data['Adj Close']                     for ticker, data in all_data.items()}) volume = pd.DataFrame({ticker: data['Volume']                      for ticker, data in all_data.items()})


In [43]:
marks = {'Ajay': [95,80,99,98,97], 'Aman': [67,45,39,45,35], 'Abi':[99,89,96,93,95], 'Joy': [33,26,41,37,32]}

In [44]:
scores = pd.DataFrame(marks)
scores

Unnamed: 0,Abi,Ajay,Aman,Joy
0,99,95,67,33
1,89,80,45,26
2,96,99,39,41
3,93,98,45,37
4,95,97,35,32


In [45]:
scores = pd.DataFrame(marks, index = ['Maths', 'Phy', 'Chem', 'Bio', 'Hist'])
scores

Unnamed: 0,Abi,Ajay,Aman,Joy
Maths,99,95,67,33
Phy,89,80,45,26
Chem,96,99,39,41
Bio,93,98,45,37
Hist,95,97,35,32


In [46]:
scores.corr()

Unnamed: 0,Abi,Ajay,Aman,Joy
Abi,1.0,0.723103,0.46542,0.530706
Ajay,0.723103,1.0,-0.094627,0.858103
Aman,0.46542,-0.094627,1.0,-0.153541
Joy,0.530706,0.858103,-0.153541,1.0


In [47]:
scores.cov()

Unnamed: 0,Abi,Ajay,Aman,Joy
Abi,13.8,21.1,21.4,11.1
Ajay,21.1,61.7,-9.2,37.95
Aman,21.4,-9.2,153.2,-10.7
Joy,11.1,37.95,-10.7,31.7


In [48]:
scores.T

Unnamed: 0,Maths,Phy,Chem,Bio,Hist
Abi,99,89,96,93,95
Ajay,95,80,99,98,97
Aman,67,45,39,45,35
Joy,33,26,41,37,32


In [49]:
scores.T.corr()

Unnamed: 0,Maths,Phy,Chem,Bio,Hist
Maths,1.0,0.976684,0.874993,0.925831,0.902484
Phy,0.976684,1.0,0.944962,0.968869,0.962222
Chem,0.874993,0.944962,1.0,0.99146,0.998188
Bio,0.925831,0.968869,0.99146,1.0,0.996717
Hist,0.902484,0.962222,0.998188,0.996717,1.0


In [50]:
scores.describe()

Unnamed: 0,Abi,Ajay,Aman,Joy
count,5.0,5.0,5.0,5.0
mean,94.4,93.8,46.2,33.8
std,3.714835,7.854935,12.377399,5.630275
min,89.0,80.0,35.0,26.0
25%,93.0,95.0,39.0,32.0
50%,95.0,97.0,45.0,33.0
75%,96.0,98.0,45.0,37.0
max,99.0,99.0,67.0,41.0


In [51]:
scores.idxmax()

Abi     Maths
Ajay     Chem
Aman    Maths
Joy      Chem
dtype: object

In [52]:
scores.idxmin()

Abi      Phy
Ajay     Phy
Aman    Hist
Joy      Phy
dtype: object