# Python -05
## Pandas (pd.Series, pd.DataFrame)

In [1]:
import numpy as np # for processing data
import pandas as pd # package that based on numpy

### pandas Series concept

In [2]:
# Build a Series from a tuple of values; no index is given, so pandas
# labels the entries with the default integers 0..4.
s1 = pd.Series(data=(1, 2, 3, 4, 5))
print(s1)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [3]:
s1.values # the data held by the Series, returned as a NumPy array

array([1, 2, 3, 4, 5])

In [4]:
s1.index # the labels of the Series; a RangeIndex here because none were supplied

RangeIndex(start=0, stop=5, step=1)

### indexing

In [5]:
s1[3] # [] looks up the value stored under index label 3

4

In [6]:
# Same values as s1, but with explicit string labels instead of 0..4.
s2 = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
print(s2)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [7]:
s2.index # the custom string labels passed when s2 was constructed

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

Difference between a list and a Series:
list        [1,2,3,4,5]           implicit positional index (not displayed): 0,1,2,3,4    displayed index: None
Series a-1 b-2 c-3 d-4 e-5        implicit positional index (not displayed): 0,1,2,3,4    displayed index: a,b,c,d,e

In [8]:
s1 # at this point s1 still carries the default integer labels

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [11]:
# Assigning to .index relabels the existing Series; the values are unchanged.
s1.index = ['a','b','c','d','e']
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

Arithmetic can be done between two Series:

In [12]:
s1 + s2 # element-wise sum; values are paired by index label

a     2
b     4
c     6
d     8
e    10
dtype: int64

In [13]:
# A shorter Series (four values) with the default integer labels 0..3.
s3 = pd.Series(data=[1, 2, 3, 4])
s3

0    1
1    2
2    3
3    4
dtype: int64

# s1 is labelled 'a'..'e' while s3 is labelled 0..3: no label is shared,
# so every entry of the result is NaN.
s1 + s3

They do not have the same index. Arithmetic between two pandas Series is aligned on index labels, so a value is only produced where both Series share a label.
NaN means the value could not be computed (it is missing).

In [15]:
# Relabel s3 so that it shares the labels 'a'..'d' with s1.
s3.index = ['a','b','c','d']
# 'e' exists only in s1, so that entry becomes NaN and the result is float64.
s4 = s1 + s3
print(s4)

a    2.0
b    4.0
c    6.0
d    8.0
e    NaN
dtype: float64


In [16]:
s4.isnull() # True where the value is missing (NaN), False elsewhere

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [17]:
s4.notnull() # the opposite of isnull(): True where a value is present

a     True
b     True
c     True
d     True
e    False
dtype: bool

## pd.DataFrame

In [21]:
# Each dictionary key becomes a column name and each list supplies
# that column's values, top to bottom.
dat1 = {
    'A': [1, 2, 3, 4],
    'B': [3, 4, 5, 6],
    'C': [4, 5, 6, 7],
    'D': [11, 12, 13, 14],
}
df1 = pd.DataFrame(data=dat1)
print(df1)  # plain-text rendering
df1  # last expression of the cell: shown with the notebook's rich display

   A  B  C   D
0  1  3  4  11
1  2  4  5  12
2  3  5  6  13
3  4  6  7  14


Unnamed: 0,A,B,C,D
0,1,3,4,11
1,2,4,5,12
2,3,5,6,13
3,4,6,7,14


In [23]:
# A dict of bare scalars gives pandas no way to know how many rows to build,
# so the constructor raises ValueError. The error is caught and printed so
# that this demonstration does not abort a "Restart & Run All".
dat2 = {'A':1,'B':2,'C':3,'D':4}
try:
    df2 = pd.DataFrame(dat2)
except ValueError as err:
    print("ValueError:", err)

ValueError: If using all scalar values, you must pass an index

In [24]:
# With scalar values an explicit index is required; a one-element index
# produces a single-row frame.
dat2 = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
df2 = pd.DataFrame(data=dat2, index=[0])
df2

Unnamed: 0,A,B,C,D
0,1,2,3,4


In [25]:
# Draw one 10x4 sample from the standard normal distribution and build the
# frame from that same array (previously dat4 was created but never used,
# and the frame was filled from a second, unrelated draw).
dat4 = np.random.randn(10,4)
df4 = pd.DataFrame(dat4,
                   columns=['A','B','C','D'],  # column names
                   index=['2018-01-01','2018-01-02','2018-01-03','2018-01-04','2018-01-05',
                          '2018-01-06','2018-01-07','2018-01-08','2018-01-09','2018-01-10'])  # row labels (plain strings)
df4

Unnamed: 0,A,B,C,D
2018-01-01,0.046047,-0.516866,0.040096,0.926392
2018-01-02,-0.1096,0.039434,0.028602,-0.31764
2018-01-03,-0.490317,-0.339115,-0.040352,-0.407119
2018-01-04,-0.624773,0.927213,0.797598,-0.013607
2018-01-05,-0.566581,0.870027,-0.46854,2.473787
2018-01-06,-0.248693,-0.310058,1.195899,0.281992
2018-01-07,0.776119,0.900421,2.485013,-0.253277
2018-01-08,-0.76106,0.432528,-0.044474,0.046174
2018-01-09,0.351313,-0.221889,0.542047,1.730276
2018-01-10,1.462668,-0.509779,-0.092478,1.386532


In [26]:
np.random.randn(3) # one argument: a 1-D array of 3 standard-normal samples

array([-1.56314395, -0.40522777, -0.04212024])

In [27]:
np.random.randn(3,4) # two arguments: a 3x4 array (rows, columns) of samples

array([[-0.17606783, -1.04649652, -0.39592082,  0.44659354],
       [ 2.26670865,  0.66678065, -0.11452929, -1.07113859],
       [-0.24732362,  1.44775291, -0.20648138,  0.30083088]])

In [28]:
df4['A'] # a single column label returns that column as a Series

2018-01-01    0.046047
2018-01-02   -0.109600
2018-01-03   -0.490317
2018-01-04   -0.624773
2018-01-05   -0.566581
2018-01-06   -0.248693
2018-01-07    0.776119
2018-01-08   -0.761060
2018-01-09    0.351313
2018-01-10    1.462668
Name: A, dtype: float64

In [29]:
df4[['A','B']] # a list of column labels returns a DataFrame with those columns

Unnamed: 0,A,B
2018-01-01,0.046047,-0.516866
2018-01-02,-0.1096,0.039434
2018-01-03,-0.490317,-0.339115
2018-01-04,-0.624773,0.927213
2018-01-05,-0.566581,0.870027
2018-01-06,-0.248693,-0.310058
2018-01-07,0.776119,0.900421
2018-01-08,-0.76106,0.432528
2018-01-09,0.351313,-0.221889
2018-01-10,1.462668,-0.509779


In [30]:
df4['A']['2018-01-01'] # select column 'A' first, then the row labelled '2018-01-01'

0.046047494971983696

In [31]:
df4[['A','B']]['2018-01-01':'2018-01-03'] # start:stop slices rows by label; both endpoints are included

Unnamed: 0,A,B
2018-01-01,0.046047,-0.516866
2018-01-02,-0.1096,0.039434
2018-01-03,-0.490317,-0.339115


In [33]:
df4[['A','B']]['2018-01-01':'2018-01-01'] # start == stop, so the slice contains exactly one row

Unnamed: 0,A,B
2018-01-01,0.046047,-0.516866


## loc & iloc

In [34]:
df4.loc[['2018-01-01'],['A','B']] # label-based selection: [[row labels],[column labels]]

Unnamed: 0,A,B
2018-01-01,0.046047,-0.516866


In [35]:
df4.iloc[[0],[0,1]] # position-based selection: [[row positions],[column positions]]

Unnamed: 0,A,B
2018-01-01,0.046047,-0.516866


In [37]:
# Assigning through iloc writes into df4 itself: row 0 of columns 0 and 1 becomes 0.
df4.iloc[[0],[0,1]] = 0
df4

Unnamed: 0,A,B,C,D
2018-01-01,0.0,0.0,0.040096,0.926392
2018-01-02,-0.1096,0.039434,0.028602,-0.31764
2018-01-03,-0.490317,-0.339115,-0.040352,-0.407119
2018-01-04,-0.624773,0.927213,0.797598,-0.013607
2018-01-05,-0.566581,0.870027,-0.46854,2.473787
2018-01-06,-0.248693,-0.310058,1.195899,0.281992
2018-01-07,0.776119,0.900421,2.485013,-0.253277
2018-01-08,-0.76106,0.432528,-0.044474,0.046174
2018-01-09,0.351313,-0.221889,0.542047,1.730276
2018-01-10,1.462668,-0.509779,-0.092478,1.386532


In [38]:
df4['E']=3.33 # assigning a scalar to a new column label adds a column filled with that value
df4

Unnamed: 0,A,B,C,D,E
2018-01-01,0.0,0.0,0.040096,0.926392,3.33
2018-01-02,-0.1096,0.039434,0.028602,-0.31764,3.33
2018-01-03,-0.490317,-0.339115,-0.040352,-0.407119,3.33
2018-01-04,-0.624773,0.927213,0.797598,-0.013607,3.33
2018-01-05,-0.566581,0.870027,-0.46854,2.473787,3.33
2018-01-06,-0.248693,-0.310058,1.195899,0.281992,3.33
2018-01-07,0.776119,0.900421,2.485013,-0.253277,3.33
2018-01-08,-0.76106,0.432528,-0.044474,0.046174,3.33
2018-01-09,0.351313,-0.221889,0.542047,1.730276,3.33
2018-01-10,1.462668,-0.509779,-0.092478,1.386532,3.33


In [39]:
# New column F: the row-by-row sum of columns A, B and C.
df4['F'] = df4['A'] + df4['B'] + df4['C']
df4

Unnamed: 0,A,B,C,D,E,F
2018-01-01,0.0,0.0,0.040096,0.926392,3.33,0.040096
2018-01-02,-0.1096,0.039434,0.028602,-0.31764,3.33,-0.041564
2018-01-03,-0.490317,-0.339115,-0.040352,-0.407119,3.33,-0.869784
2018-01-04,-0.624773,0.927213,0.797598,-0.013607,3.33,1.100039
2018-01-05,-0.566581,0.870027,-0.46854,2.473787,3.33,-0.165093
2018-01-06,-0.248693,-0.310058,1.195899,0.281992,3.33,0.637148
2018-01-07,0.776119,0.900421,2.485013,-0.253277,3.33,4.161553
2018-01-08,-0.76106,0.432528,-0.044474,0.046174,3.33,-0.373007
2018-01-09,0.351313,-0.221889,0.542047,1.730276,3.33,0.671471
2018-01-10,1.462668,-0.509779,-0.092478,1.386532,3.33,0.860412


## Exercise 1

In [41]:
# Start the exercise from a fresh integer Series with default labels 0..4.
s1 = pd.Series(data=(1, 2, 3, 4, 5))
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [53]:
# Relabel the Series, convert it to float64, then store a float and a missing value.
s1.index = ['h','i','j','k','l']
# astype returns a new Series and leaves s1 untouched, so the result must be
# assigned back for the dtype change to take effect.
s1 = s1.astype('float64')
s1['j'] = 8.0
# np.nan is the supported spelling (the np.NaN alias was removed in NumPy 2.0).
# Because s1 is already float64, storing NaN needs no implicit dtype change.
s1['l'] = np.nan
s1

h    1.0
i    2.0
j    8.0
k    4.0
l    NaN
dtype: float64

In [54]:
# Wrap the Series in a DataFrame: the labels stay as the row index and the
# values become a single column named 0. The name s1 now refers to a DataFrame.
s1 = pd.DataFrame(s1)
s1

Unnamed: 0,0
h,1.0
i,2.0
j,8.0
k,4.0
l,


## Exercise 2

In [90]:
# Seed NumPy's global generator so the same 4x4 sample is drawn on every run.
np.random.seed(15)
d1 = pd.DataFrame(data=np.random.randn(4, 4), columns=list('ABCD'))
d1

Unnamed: 0,A,B,C,D
0,-0.312328,0.339285,-0.155909,-0.50179
1,0.235569,-1.763605,-1.095862,-1.087766
2,-0.30517,-0.473748,-0.200595,0.355197
3,0.689518,0.41059,-0.564978,0.599391


In [63]:
# Fixed seed: the same 4x4 sample is produced on every run.
np.random.seed(108)
dat2 = np.random.randn(4, 4)
d2 = pd.DataFrame(data=dat2, columns=list('ABCD'))
d2

Unnamed: 0,A,B,C,D
0,-1.026905,0.221749,1.13039,1.146185
1,-0.592734,0.118784,-0.48443,-1.944913
2,0.092077,0.902169,1.314469,0.771102
3,-0.540147,-0.284115,-0.889331,0.404169


In [72]:
# Row labels are given as one flat list of strings. A doubly nested list
# ([[...]]) would be interpreted as a list of level arrays and build a
# one-level MultiIndex instead of a plain Index.
d2.index = ['18-01','18-02','18-03','18-04']
d2

Unnamed: 0,A,B,C,D
18-01,-1.026905,0.221749,1.13039,1.146185
18-02,-0.592734,0.118784,-0.48443,-1.944913
18-03,0.092077,0.902169,1.314469,0.771102
18-04,-0.540147,-0.284115,-0.889331,0.404169


In [77]:
# method 1: pick the columns with [], then slice the rows by label (both ends included)
d2[['B','C']]['18-02':'18-03']

Unnamed: 0,B,C
18-02,0.118784,-0.48443
18-03,0.902169,1.314469


In [82]:
# method 2: label-based selection with loc -> [[row labels],[column labels]]
d2.loc[['18-02','18-03'],['B','C']]

Unnamed: 0,B,C
18-02,0.118784,-0.48443
18-03,0.902169,1.314469


In [83]:
# method 3: position-based selection with iloc -> rows 1 and 2, columns 1 and 2
d2.iloc[[1,2],[1,2]]

Unnamed: 0,B,C
18-02,0.118784,-0.48443
18-03,0.902169,1.314469


## Exercise 3

In [85]:
# A 4x4 frame holding 1..16 row by row; each inner list is one row.
df = pd.DataFrame(
    data=[
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ],
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12
3,13,14,15,16


In [86]:
# Selecting rows with a condition: df['A']>3 gives one True/False per row,
# and indexing df with it keeps only the rows where column A is greater than 3.
df[df['A']>3] 

Unnamed: 0,A,B,C,D
1,5,6,7,8
2,9,10,11,12
3,13,14,15,16


In [87]:
df[df<10] # a whole-frame condition keeps the cells that match and shows NaN for the rest

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,,,
3,,,,


In [88]:
df[(df>3)&(df<10)] # & combines both conditions cell by cell: keep values with 3 < x < 10

Unnamed: 0,A,B,C,D
0,,,,4.0
1,5.0,6.0,7.0,8.0
2,9.0,,,
3,,,,


In [89]:
d1 # the seeded random frame created in Exercise 2

Unnamed: 0,A,B,C,D
0,-0.312328,0.339285,-0.155909,-0.50179
1,0.235569,-1.763605,-1.095862,-1.087766
2,-0.30517,-0.473748,-0.200595,0.355197
3,0.689518,0.41059,-0.564978,0.599391


In [98]:
d1[(d1['D']>0) & (d1['B']<0)] # rows where column D is positive AND column B is negative

Unnamed: 0,A,B,C,D
2,-0.30517,-0.473748,-0.200595,0.355197
