# Pandas Tutorial

In [88]:
import numpy as np
import pandas as pd

## Series

In [162]:
s_list_1 = ['a', 'b', 'c', 'd']
indexes = np.arange(1, 5)

In [90]:
#Creating series. It's something similar to hashmap in Java
s_ser_1 = pd.Series(data=s_list_1, index=indexes)
s_ser_1

1    a
2    b
3    c
4    d
dtype: object

In [91]:
#Creating array by np array without indexes. In this case indexes will be nums from 0 to arr.length()
arr = np.array([1, 2, 3, 4, 5])
s_ser_2 = pd.Series(arr)
s_ser_2

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [92]:
#Creating series from dictionary
dict_1 = {'f_name': "Vitalii", 'l_name': "Stinskii", 'age': 22}
s_ser_3 = pd.Series(dict_1)
s_ser_3

f_name     Vitalii
l_name    Stinskii
age             22
dtype: object

In [93]:
#Get data from series
s_ser_3['f_name']

'Vitalii'

In [94]:
#Basic math with series +-*/ . Math operation will be apply only on data, but not on indexes
s_ser_2 * s_ser_2

0     1
1     4
2     9
3    16
4    25
dtype: int64

In [95]:
#Using series with NumPy methods
np.exp(s_ser_2)

0      2.718282
1      7.389056
2     20.085537
3     54.598150
4    148.413159
dtype: float64

In [96]:
#The main difference between np arrays and pd series is that math operations apply by indexes. 
#In case when ser1 have index1 but ser2 doesn't have index1 the result of math operation will be NaN
s_ser_4 = pd.Series({4: 5, 5: 6, 6: 7, 7: 8})
print("s_ser_2:\n", s_ser_2)
s_ser_2 + s_ser_4

s_ser_2:
 0    1
1    2
2    3
3    4
4    5
dtype: int64


0     NaN
1     NaN
2     NaN
3     NaN
4    10.0
5     NaN
6     NaN
7     NaN
dtype: float64

In [97]:
#We can add a name to the series
s_ser_5 = pd.Series({4: 5, 5: 6, 6: 7, 7: 8}, name='rand_nums')
s_ser_5.name

'rand_nums'

## DataFrames

In [98]:
df_arr_1 = np.random.randint(10, 50, size=(2, 3))

In [99]:
#Creating dataFrame <array>, <row_names>, <column_names>
df_1 = pd.DataFrame(df_arr_1, ['A', 'B'], ['C', 'D', 'E'])
df_1

Unnamed: 0,C,D,E
A,15,47,45
B,36,19,43


In [100]:
#Creating df from dictionary. In case when we have disbalaced matrix in empty places will be NaN
df_dict_1 = {'one': pd.Series([1., 2., 3.,], index=['a','b','c']),
            'two': pd.Series([1., 2., 3., 4.], index=['a','b','c','d'])}
df_2 = pd.DataFrame(df_dict_1)
df_2

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [101]:
#Creating df from dict that created in the same method
pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [102]:
#Creating df from dict that created in the same method. Assign keys as row and column labels
pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]),
                      orient='index', columns=['one','two','three'])

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [103]:
#Get md length of df
np.shape(df_2)

(4, 2)

## Editing & Retrieving Data

In [104]:
#Get the column
print("df_1:\n", df_1)
df_1['C']

df_1:
     C   D   E
A  15  47  45
B  36  19  43


A    15
B    36
Name: C, dtype: int64

In [105]:
#Get the columns
print("df_1:\n", df_1)
df_1[['C','D']]

df_1:
     C   D   E
A  15  47  45
B  36  19  43


Unnamed: 0,C,D
A,15,47
B,36,19


In [106]:
#Get row as a series
print("df_1:\n", df_1)
df_1.loc['A']

df_1:
     C   D   E
A  15  47  45
B  36  19  43


C    15
D    47
E    45
Name: A, dtype: int64

In [107]:
#Grab row by index position
print("df_1:\n", df_1)
df_1.iloc[1]

df_1:
     C   D   E
A  15  47  45
B  36  19  43


C    36
D    19
E    43
Name: B, dtype: int64

In [108]:
#Grab element (cell) by a row and a column
print("df_1:\n", df_1)
df_1.loc['A','C']

df_1:
     C   D   E
A  15  47  45
B  36  19  43


15

In [109]:
#Get the spesific rows and columns
print("df_1:\n", df_1)
df_1.loc[['A','B'],['C','D']]

df_1:
     C   D   E
A  15  47  45
B  36  19  43


Unnamed: 0,C,D
A,15,47
B,36,19


In [110]:
#Add new column like sum of all the columns
df_1['Total'] = df_1['C'] + df_1['D'] + df_1['E']
df_1

Unnamed: 0,C,D,E,Total
A,15,47,45,107
B,36,19,43,98


In [111]:
#Add new column. Using multiplication to create new column
df_2['mult'] = df_2['one'] + df_2['two']
df_2

Unnamed: 0,one,two,mult
a,1.0,1.0,2.0
b,2.0,2.0,4.0
c,3.0,3.0,6.0
d,,4.0,


In [112]:
#Adding new row to df. Since 2.0 pandas version the append() method has been removed from DataFrame [?]
print("df_1:\n", df_1)
er_dict_1 = {'C': 44, 'D': 55, 'E': 46}
df_1 = pd.concat([df_1, pd.DataFrame(er_dict_1, index=['F'])])
df_1

df_1:
     C   D   E  Total
A  15  47  45    107
B  36  19  43     98


Unnamed: 0,C,D,E,Total
A,15,47,45,107.0
B,36,19,43,98.0
F,44,55,46,


In [113]:
#Adding new row to df using .loc
print("df_1:\n", df_1)
er_dict_1 = {'C': 1, 'D': 1, 'E': 1}
df_1.loc['J'] = er_dict_1
df_1

df_1:
     C   D   E  Total
A  15  47  45  107.0
B  36  19  43   98.0
F  44  55  46    NaN


Unnamed: 0,C,D,E,Total
A,15,47,45,107.0
B,36,19,43,98.0
F,44,55,46,
J,1,1,1,


In [114]:
#Deleting a column <name> <dm> (0-rows 1 - columns) <inplace> 
#If inplace=False the row/column won't be deleted
print("df_1:\n", df_1)
df_1.drop('Total', axis=1, inplace=True)
df_1

df_1:
     C   D   E  Total
A  15  47  45  107.0
B  36  19  43   98.0
F  44  55  46    NaN
J   1   1   1    NaN


Unnamed: 0,C,D,E
A,15,47,45
B,36,19,43
F,44,55,46
J,1,1,1


In [124]:
#Deleting a row 
print("df_1:\n", df_1)
df_1.drop('J', axis=0, inplace=True)
df_1.drop('F', axis=0, inplace=True)
df_1

df_1:
     C   D   E
A  15  47  45
B  36  19  43
F  44  55  46


Unnamed: 0,C,D,E
A,15,47,45
B,36,19,43


In [125]:
#Creating a column and setting it as index row
df_1['S'] = ['Men','Woman']
df_1.set_index('S', inplace=True)
df_1

Unnamed: 0_level_0,C,D,E
S,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Men,15,47,45
Woman,36,19,43


In [117]:
#Reset df indexe. Makes previous index column as common column
#df_1.reset_index(inplace=True)
df_1

Unnamed: 0,C,D,E
A,15,47,45
B,36,19,43
F,44,55,46


In [118]:
#Creating row by pathing any func. For example basic math
print('df_2', df_2) 
df_2.assign(sum_pow_2=(df_2['one'] + df_2['two'])**2)

df_2    one  two  mult
a  1.0  1.0   2.0
b  2.0  2.0   4.0
c  3.0  3.0   6.0
d  NaN  4.0   NaN


Unnamed: 0,one,two,mult,sum_pow_2
a,1.0,1.0,2.0,4.0
b,2.0,2.0,4.0,16.0
c,3.0,3.0,6.0,36.0
d,,4.0,,


In [119]:
#Creating row by pathing any func. For example basic math
print('df_2', df_2) 
df_2.assign(sum_pow_2=lambda x: (x['one'] * x['two']))

df_2    one  two  mult
a  1.0  1.0   2.0
b  2.0  2.0   4.0
c  3.0  3.0   6.0
d  NaN  4.0   NaN


Unnamed: 0,one,two,mult,sum_pow_2
a,1.0,1.0,2.0,1.0
b,2.0,2.0,4.0,4.0
c,3.0,3.0,6.0,9.0
d,,4.0,,


In [120]:
#Combining 2 df. If there are empy elements (nan) in df1 by column index A 
# and df2 has not empty elements by the same column and rows indexs
# elements from the df2 fill the empty elems in df1
er_df_1 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})
er_df_2 = pd.DataFrame({'A': [8., 9., 2., 4.]})
er_df_1.combine_first(er_df_2)

Unnamed: 0,A
0,1.0
1,9.0
2,3.0
3,4.0


## Conditional Selection

In [155]:
cs_arr_1 = np.random.randint(10, 50, size=(2,3))
cs_df_1 = pd.DataFrame(cs_arr_1, ['A','B'], ['C','D','E'])
cs_arr_2 = np.array([[1,2,3], [4,5,6], [7,8,9]])
cs_ds_2 = pd.DataFrame(cs_arr_2, ['A', 'B', 'C'], ['X', 'Y', 'Z'])
cs_df_1

Unnamed: 0,C,D,E
A,48,47,10
B,33,39,12


In [130]:
#Get the df of bool according to condition greater than 40
print("Greater than 40:\n", cs_df_1 > 40)

Greater than 40:
        C      D      E
A  False   True  False
B   True  False  False


In [136]:
#Get the df of bool according to condition greater than 40 in another way
# eq - ==, ne - !=, le - <=, lt - <, ge - >=, gt - >
print("Greater than 40:\n", cs_df_1.gt(40))

Greater than 40:
        C      D      E
A  False   True  False
B   True  False  False


In [142]:
#Get all the elements those applies to condition >= 30. In another case an element will be replaced with NaN
cs_bool_1 = cs_df_1 >= 30
cs_df_1[cs_bool_1]

Unnamed: 0,C,D,E
A,,48,31.0
B,49.0,32,


In [147]:
#The same thing but with columns
print("cs_df_1:\n", cs_df_1)
cs_df_1[cs_df_1['E'] > 30]

cs_df_1:
     C   D   E
A  27  48  31
B  49  32  29


Unnamed: 0,C,D,E
A,27,48,31


In [153]:
#Get elements by the condition and get the columns specific columns
print(cs_df_1[cs_df_1['E'] > 30][['D','E']])

    D   E
A  48  31


In [161]:
#Multiple conditions
print("cs_ds_2", cs_ds_2)
cs_ds_2[(cs_ds_2['X']>3) & (cs_ds_2['X']<7)]

cs_ds_2    X  Y  Z
A  1  2  3
B  4  5  6
C  7  8  9


Unnamed: 0,X,Y,Z
B,4,5,6
