In [2]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web

In [9]:
# Small two-column frame with missing values, written column by column;
# used below to show how the reductions treat NaN.
frame_3 = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
frame_3

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [10]:
frame_3.sum(axis=0)  # one total per column; NaN entries are skipped

one    9.25
two   -5.80
dtype: float64

In [11]:
frame_3.sum(axis='columns')  # one total per row label (summing across the columns)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [12]:
frame_3.mean(axis='columns', skipna=False)  # NaN propagates: (1.4 + NaN) / 2 = NaN

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [13]:
frame_3.mean(axis='columns')  # NaN is skipped by default: 1.4 / 1 = 1.4

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [14]:
frame_3.cumsum(axis=0)  # running total down each column

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [15]:
frame_3.describe()  # per-column summary statistics (count, mean, std, min, quartiles, max); incredibly useful in ProbStat

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [16]:
# The four-letter pattern a, a, c, d repeated four times -> 16 string values
obj_4 = pd.Series(list('aacd' * 4))
obj_4

0     a
1     a
2     c
3     d
4     a
5     a
6     c
7     d
8     a
9     a
10    c
11    d
12    a
13    a
14    c
15    d
dtype: object

In [17]:
obj_4.describe()  # describe() on non-numeric data reports count, unique, top and freq

count     16
unique     3
top        a
freq       8
dtype: object

### Correlation and Covariance

In [4]:
# Download Yahoo! Finance quotes for a handful of tickers, then collect the
# adjusted closing prices and the traded volumes into two frames that each
# have one column per ticker.
# `web` is pandas_datareader.data, imported with the other imports in the
# notebook's first cell. This cell needs network access.
TICKERS = ['AAPL', 'IBM', 'MSFT', 'AMZN']
START_DATE = '2000-01-01'  # ISO format avoids the month/day ambiguity of '1/1/2000'
END_DATE = '2010-01-01'

all_data = {}
for ticker in TICKERS:
    all_data[ticker] = web.get_data_yahoo(ticker, START_DATE, END_DATE)

price = pd.DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = pd.DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

In [5]:
# Fractional change of every column from one row to the next
returns = price.pct_change(periods=1)
returns.tail(5)

Unnamed: 0_level_0,AAPL,AMZN,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.03434,-0.003383,0.004384,0.002587
2009-12-28,0.012294,0.006066,0.013326,0.005484
2009-12-29,-0.011861,0.000718,-0.003477,0.007058
2009-12-30,0.012147,-0.020945,0.005461,-0.013699
2009-12-31,-0.0043,-0.014433,-0.012597,-0.015504


In [6]:
returns['MSFT'].corr(returns['IBM'])  # correlation between two specific columns

0.49598006790672183

In [7]:
returns['MSFT'].cov(returns['IBM'])  # covariance between two specific columns

0.00021595768182070335

In [8]:
# TODO: demonstrate corrwith, e.g. returns.corrwith(returns.IBM) to correlate every column with IBM

### Unique Values, Value Counts, and Membership

In [18]:
# Ten single-letter values with repeats; unique() keeps first-seen order
obj = pd.Series(list('cdcabdcaad'))
uniques = obj.unique()
uniques

array(['c', 'd', 'a', 'b'], dtype=object)

In [20]:
uniques.sort()  # sorts the array in place, so `uniques` itself is reordered from here on
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [21]:
obj.value_counts()  # number of occurrences of each distinct value

c    3
d    3
a    3
b    1
dtype: int64

In [22]:
obj.isin(['b', 'c'])  # boolean mask: True where the value is 'b' or 'c'

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [23]:
obj.loc[obj.isin(['b', 'c'])]  # keep only the entries equal to 'b' or 'c'

0    c
2    c
4    b
6    c
dtype: object

In [25]:
# Three integer columns (Qu1, Qu2, Qu3), written row by row
data = pd.DataFrame(
    [
        [1, 2, 1],
        [3, 3, 5],
        [4, 1, 2],
        [3, 2, 4],
        [4, 3, 4],
    ],
    columns=['Qu1', 'Qu2', 'Qu3'],
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [27]:
# Count how often each value occurs in every column. The row labels are the
# union of all values seen; a value that never occurs in a column comes back
# as NaN and is replaced by 0 (which is why the counts are floats).
# pd.Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
