### Pandas: Python Data Analysis Library

## Installing

Choose one of the following commands:

`conda install pandas`

`pip install pandas`

In [14]:
import numpy as np
import pandas as pd

## Series

In [18]:
grades = [8.5, 7, 9]
print(np.array(grades)[0])
print(pd.Series(grades)[0])

8.5
8.5


In [21]:
grades = [8.5, 7, 9]
students = ['Student A','Student B','Student C']
serie_X = pd.Series(grades, students)
print(serie_X['Student B'])

7.0


In [22]:
np_grades = np.array([6, 7, 8])
pd.Series(np_grades)

0    6
1    7
2    8
dtype: int64

#### Exercise

(a) Create a series from a Python dictionary;

In [24]:
#series_a = pd.Series([5 ,2, 2], ['Brazil','Argentina','France'])
series_a = pd.Series({ 'Brazil': 5, 'Argentina': 2, 'France': 2 })
print("Brazil has {} World Cups.".format(series_a['Brazil']))

Brazil has 5 World Cups.


In [25]:
series_a['Brazil', 'Argentina'] # Error

KeyError: ('Brazil', 'Argentina')

In [26]:
print(series_a[['Brazil', 'Argentina']])
print()
print(type(series_a[['Brazil', 'Argentina']]))
print()
print(type(series_a['Brazil']))
print()
print(type(series_a[['Brazil']]))
print()
print(series_a[['Brazil']])

Brazil       5
Argentina    2
dtype: int64

<class 'pandas.core.series.Series'>

<class 'numpy.int64'>

<class 'pandas.core.series.Series'>

Brazil    5
dtype: int64


In [27]:
series_b = pd.Series([2018, 2002, 1986], index=['France','Brazil','Argentina'])
print("Last time France won World Cup was in {}, while Brazil won in {} and Argentina in {}".format(series_b['France'],series_b['Brazil'],series_b['Argentina']))

Last time France won World Cup was in 2018, while Brazil won in 2002 and Argentina in 1986


### Operations

In [28]:
series_c = pd.Series(data=[5, 10, 9], index=['AL01', 'AL03', 'AL02'])
series_d = pd.Series(data=[4, 6, 4], index=['AL01', 'AL02', 'AL03'])
sum_c_d = series_c + series_d
sum_c_d

AL01     9
AL02    15
AL03    14
dtype: int64

In [29]:
diff_c_d = series_c - series_d
diff_c_d

AL01    1
AL02    3
AL03    6
dtype: int64

In [30]:
series_e = pd.Series(data=[5, 9, 10, 5], index=['AL01', 'AL02', 'AL03', 'AL04'])
series_f = pd.Series(data=[4, 6, 4], index=['AL01', 'AL02', 'AL03'])
sum_e_f = series_e + series_f
sum_e_f

AL01     9.0
AL02    15.0
AL03    14.0
AL04     NaN
dtype: float64

### Methods - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

In [31]:
series_e

AL01     5
AL02     9
AL03    10
AL04     5
dtype: int64

In [32]:
series_e.max()

10

In [33]:
series_e.min()

5

In [34]:
series_e.describe()

count     4.000000
mean      7.250000
std       2.629956
min       5.000000
25%       5.000000
50%       7.000000
75%       9.250000
max      10.000000
dtype: float64

In [35]:
series_e.cummax()

AL01     5
AL02     9
AL03    10
AL04    10
dtype: int64

In [36]:
series_e.cumprod()

AL01       5
AL02      45
AL03     450
AL04    2250
dtype: int64

In [37]:
series_e.diff()

AL01    NaN
AL02    4.0
AL03    1.0
AL04   -5.0
dtype: float64

## DataFrames

In [38]:
data = np.random.rand(5, 4) * 10
index = ['ST01','ST02','ST03','ST04','ST05']
col = ['T1', 'T2', 'T3', '4T']
df = pd.DataFrame(data, index, col)
df

Unnamed: 0,T1,T2,T3,4T
ST01,2.382316,1.666917,0.104856,1.768302
ST02,9.321659,5.175905,1.348825,6.107216
ST03,0.75435,9.335783,4.674561,8.470765
ST04,2.111785,1.058636,7.005753,5.413945
ST05,1.574047,5.522036,9.017342,7.845469


In [39]:
df['T1']

ST01    2.382316
ST02    9.321659
ST03    0.754350
ST04    2.111785
ST05    1.574047
Name: T1, dtype: float64

In [40]:
df.T1 # Not recommended

ST01    2.382316
ST02    9.321659
ST03    0.754350
ST04    2.111785
ST05    1.574047
Name: T1, dtype: float64

In [41]:
df['4T']

ST01    1.768302
ST02    6.107216
ST03    8.470765
ST04    5.413945
ST05    7.845469
Name: 4T, dtype: float64

In [42]:
df.4T

SyntaxError: invalid syntax (<ipython-input-42-db165df1bcaa>, line 1)

In [43]:
type(df['T1'])

pandas.core.series.Series

In [44]:
df[['T1', 'T2']]

Unnamed: 0,T1,T2
ST01,2.382316,1.666917
ST02,9.321659,5.175905
ST03,0.75435,9.335783
ST04,2.111785,1.058636
ST05,1.574047,5.522036


In [45]:
df['T4'] = np.random.rand(5) * 10
df

Unnamed: 0,T1,T2,T3,4T,T4
ST01,2.382316,1.666917,0.104856,1.768302,2.80882
ST02,9.321659,5.175905,1.348825,6.107216,5.508786
ST03,0.75435,9.335783,4.674561,8.470765,6.234865
ST04,2.111785,1.058636,7.005753,5.413945,7.238695
ST05,1.574047,5.522036,9.017342,7.845469,3.786302


In [46]:
type(df['T4'])

pandas.core.series.Series

In [47]:
df['avg'] = (df['T1'] + df['T2'] + df['T3'] + df['T4'])/4
df

Unnamed: 0,T1,T2,T3,4T,T4,avg
ST01,2.382316,1.666917,0.104856,1.768302,2.80882,1.740727
ST02,9.321659,5.175905,1.348825,6.107216,5.508786,5.338794
ST03,0.75435,9.335783,4.674561,8.470765,6.234865,5.24989
ST04,2.111785,1.058636,7.005753,5.413945,7.238695,4.353717
ST05,1.574047,5.522036,9.017342,7.845469,3.786302,4.974932


In [48]:
df['another_avg'] = df.mean(1)
df

Unnamed: 0,T1,T2,T3,4T,T4,avg,another_avg
ST01,2.382316,1.666917,0.104856,1.768302,2.80882,1.740727,1.745323
ST02,9.321659,5.175905,1.348825,6.107216,5.508786,5.338794,5.466864
ST03,0.75435,9.335783,4.674561,8.470765,6.234865,5.24989,5.786702
ST04,2.111785,1.058636,7.005753,5.413945,7.238695,4.353717,4.530422
ST05,1.574047,5.522036,9.017342,7.845469,3.786302,4.974932,5.453354


In [49]:
df = df.drop(['avg', 'another_avg', '4T'], axis=1)
df

Unnamed: 0,T1,T2,T3,T4
ST01,2.382316,1.666917,0.104856,2.80882
ST02,9.321659,5.175905,1.348825,5.508786
ST03,0.75435,9.335783,4.674561,6.234865
ST04,2.111785,1.058636,7.005753,7.238695
ST05,1.574047,5.522036,9.017342,3.786302


### Working with DataFrames

In [54]:
st02 = df.loc['ST02']
st02

T1    9.321659
T2    5.175905
T3    1.348825
T4    5.508786
Name: ST02, dtype: float64

In [55]:
type(st02)

pandas.core.series.Series

In [52]:
df.iloc[1]

T1    9.321659
T2    5.175905
T3    1.348825
T4    5.508786
Name: ST02, dtype: float64

In [56]:
df.loc['ST02']['T1']

9.32165856379772

In [57]:
df.iloc[1]['T1']

9.32165856379772

In [58]:
df.loc[['ST02', 'ST03']][['T1', 'T2']]

Unnamed: 0,T1,T2
ST02,9.321659,5.175905
ST03,0.75435,9.335783


In [59]:
df.loc[['ST02', 'ST03']]

Unnamed: 0,T1,T2,T3,T4
ST02,9.321659,5.175905,1.348825,5.508786
ST03,0.75435,9.335783,4.674561,6.234865


In [60]:
df.iloc[[1, 4]][['T1', 'T2']]

Unnamed: 0,T1,T2
ST02,9.321659,5.175905
ST05,1.574047,5.522036


In [61]:
df.iloc[[1, 4], [1, 2]]

Unnamed: 0,T2,T3
ST02,5.175905,1.348825
ST05,5.522036,9.017342


### Conditional selection

In [62]:
df

Unnamed: 0,T1,T2,T3,T4
ST01,2.382316,1.666917,0.104856,2.80882
ST02,9.321659,5.175905,1.348825,5.508786
ST03,0.75435,9.335783,4.674561,6.234865
ST04,2.111785,1.058636,7.005753,7.238695
ST05,1.574047,5.522036,9.017342,3.786302


In [63]:
df[df > 3]

Unnamed: 0,T1,T2,T3,T4
ST01,,,,
ST02,9.321659,5.175905,,5.508786
ST03,,9.335783,4.674561,6.234865
ST04,,,7.005753,7.238695
ST05,,5.522036,9.017342,3.786302


In [64]:
mean_T1 = df['T1'].mean()
mean_T1

3.228831314880872

In [65]:
df['T1'] > mean_T1

ST01    False
ST02     True
ST03    False
ST04    False
ST05    False
Name: T1, dtype: bool

In [66]:
df['T1']

ST01    2.382316
ST02    9.321659
ST03    0.754350
ST04    2.111785
ST05    1.574047
Name: T1, dtype: float64

In [67]:
df[df['T1'] > mean_T1]

Unnamed: 0,T1,T2,T3,T4
ST02,9.321659,5.175905,1.348825,5.508786


In [68]:
df[(df['T1'] > 5) and (df['T2'] > 5)]
df[(df['T1'] > 5) or (df['T2'] > 5)]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [70]:
df[(df['T1'] > 2) & (df['T2'] > 2)]

Unnamed: 0,T1,T2,T3,T4
ST02,9.321659,5.175905,1.348825,5.508786


In [71]:
df[(df['T1'] > 5) | (df['T2'] > 5)]

Unnamed: 0,T1,T2,T3,T4
ST02,9.321659,5.175905,1.348825,5.508786
ST03,0.75435,9.335783,4.674561,6.234865
ST05,1.574047,5.522036,9.017342,3.786302


###  Reset de Index

In [72]:
df

Unnamed: 0,T1,T2,T3,T4
ST01,2.382316,1.666917,0.104856,2.80882
ST02,9.321659,5.175905,1.348825,5.508786
ST03,0.75435,9.335783,4.674561,6.234865
ST04,2.111785,1.058636,7.005753,7.238695
ST05,1.574047,5.522036,9.017342,3.786302


In [73]:
df = df.reset_index(drop=False)
print(df)

  index        T1        T2        T3        T4
0  ST01  2.382316  1.666917  0.104856  2.808820
1  ST02  9.321659  5.175905  1.348825  5.508786
2  ST03  0.754350  9.335783  4.674561  6.234865
3  ST04  2.111785  1.058636  7.005753  7.238695
4  ST05  1.574047  5.522036  9.017342  3.786302


In [74]:
other_students = ['ST06','ST07','ST08','ST09','ST10']
df['STs'] = other_students
df = df.set_index('STs', drop=True)
df

Unnamed: 0_level_0,index,T1,T2,T3,T4
STs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ST06,ST01,2.382316,1.666917,0.104856,2.80882
ST07,ST02,9.321659,5.175905,1.348825,5.508786
ST08,ST03,0.75435,9.335783,4.674561,6.234865
ST09,ST04,2.111785,1.058636,7.005753,7.238695
ST10,ST05,1.574047,5.522036,9.017342,3.786302


In [76]:
#
students = ['ST01','ST01','ST01','ST02','ST02','ST02']
tests =['T1','T2','T3','T1','T2','T3']
advanced_index = list(zip(students, tests))
print(advanced_index)
advanced_index = pd.MultiIndex.from_tuples(advanced_index)

grades = np.random.rand(6, 3)*10

courses = ['C1','C2','C3']

advanced_df = pd.DataFrame(grades, advanced_index, courses)
advanced_df

[('ST01', 'T1'), ('ST01', 'T2'), ('ST01', 'T3'), ('ST02', 'T1'), ('ST02', 'T2'), ('ST02', 'T3')]


Unnamed: 0,Unnamed: 1,C1,C2,C3
ST01,T1,6.225124,8.750791,9.579447
ST01,T2,5.006003,4.756724,5.411326
ST01,T3,2.093493,9.44195,4.945643
ST02,T1,1.493259,2.150988,2.953548
ST02,T2,9.209003,8.97117,2.001319
ST02,T3,6.947077,4.205548,4.361107


In [77]:
advanced_df.index.names = ['Students','Tests']
advanced_df

Unnamed: 0_level_0,Unnamed: 1_level_0,C1,C2,C3
Students,Tests,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ST01,T1,6.225124,8.750791,9.579447
ST01,T2,5.006003,4.756724,5.411326
ST01,T3,2.093493,9.44195,4.945643
ST02,T1,1.493259,2.150988,2.953548
ST02,T2,9.209003,8.97117,2.001319
ST02,T3,6.947077,4.205548,4.361107


In [78]:
grades_st_1 = advanced_df.loc['ST01']
grades_st_1

Unnamed: 0_level_0,C1,C2,C3
Tests,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T1,6.225124,8.750791,9.579447
T2,5.006003,4.756724,5.411326
T3,2.093493,9.44195,4.945643


In [79]:
type(grades_st_1)

pandas.core.frame.DataFrame

In [81]:
print(grades_st_1.loc['T1'])
type(grades_st_1.loc['T1'])

C1    6.225124
C2    8.750791
C3    9.579447
Name: T1, dtype: float64


pandas.core.series.Series

In [82]:
grades_st_1_t_2 = advanced_df.loc['ST01'].loc['T2']
grades_st_1_t_2

C1    5.006003
C2    4.756724
C3    5.411326
Name: T2, dtype: float64

In [83]:
type(grades_st_1_t_2)

pandas.core.series.Series

In [84]:
advanced_df.xs('ST01')

Unnamed: 0_level_0,C1,C2,C3
Tests,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T1,6.225124,8.750791,9.579447
T2,5.006003,4.756724,5.411326
T3,2.093493,9.44195,4.945643


In [85]:
advanced_df.xs('T1', level='Tests')

Unnamed: 0_level_0,C1,C2,C3
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ST01,6.225124,8.750791,9.579447
ST02,1.493259,2.150988,2.953548


### Data cleaning

In [86]:
dict_a = {'X':[1, np.nan, np.nan],'Y':[2, 4, np.nan],'Z':[3, 4, 4],}
df3 = pd.DataFrame(dict_a)
df3

Unnamed: 0,X,Y,Z
0,1.0,2.0,3
1,,4.0,4
2,,,4


In [87]:
df3.dropna()

Unnamed: 0,X,Y,Z
0,1.0,2.0,3


In [88]:
df3.dropna(axis=1)

Unnamed: 0,Z
0,3
1,4
2,4


In [89]:
df3.dropna(thresh=2)

Unnamed: 0,X,Y,Z
0,1.0,2.0,3
1,,4.0,4


In [90]:
df3.fillna(3)

Unnamed: 0,X,Y,Z
0,1.0,2.0,3
1,3.0,4.0,4
2,3.0,3.0,4


In [91]:
df3.fillna(pd.Series({'X':1,'Y':2,'Z':3}))

Unnamed: 0,X,Y,Z
0,1.0,2.0,3
1,1.0,4.0,4
2,1.0,2.0,4


In [92]:
df3['Y'].fillna(df3['Y'].mean())

0    2.0
1    4.0
2    3.0
Name: Y, dtype: float64

In [93]:
df3.fillna(df3.mean())

Unnamed: 0,X,Y,Z
0,1.0,2.0,3
1,1.0,4.0,4
2,1.0,3.0,4


### Grouping

In [94]:
dict_b = {'Method':['KNN', 'SVR', 'ANN', 'KNN', 'SVR', 'ANN'],
        'Scenario':['S1', 'S2', 'S1', 'S2', 'S1', 'S2'],
        'Error':[2.1, 2.1, 1.5, 1.7, 1.8, 1.9]}
df4 = pd.DataFrame(dict_b)
df4

Unnamed: 0,Method,Scenario,Error
0,KNN,S1,2.1
1,SVR,S2,2.1
2,ANN,S1,1.5
3,KNN,S2,1.7
4,SVR,S1,1.8
5,ANN,S2,1.9


In [95]:
byScenario = df4.groupby('Scenario')
print(byScenario)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x114435780>


In [96]:
byScenario.mean()

Unnamed: 0_level_0,Error
Scenario,Unnamed: 1_level_1
S1,1.8
S2,1.9


In [97]:
byScenario.sum()

Unnamed: 0_level_0,Error
Scenario,Unnamed: 1_level_1
S1,5.4
S2,5.7


In [98]:
byScenario.min()

Unnamed: 0_level_0,Method,Error
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1
S1,ANN,1.5
S2,ANN,1.7


In [13]:
byScenario.max()

Unnamed: 0_level_0,Method,Error
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1
S1,SVR,2.1
S2,SVR,2.1


### Joins

In [99]:
df_j1 = pd.DataFrame({'A':['A0', 'A1', 'A2'],'B':['B0', 'B1', 'B2']}, index=[0, 1, 2])
df_j1

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2


In [100]:
df_j2 = pd.DataFrame({'A':['A3', 'A4', 'A5'],'B':['B3', 'B4', 'B5']},index=[3, 4, 5])
df_j2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4
5,A5,B5


In [101]:
pd.concat([df_j1, df_j2])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
5,A5,B5


In [102]:
pd.concat([df_j1, df_j2], axis=1)

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,,,A3,B3
4,,,A4,B4
5,,,A5,B5


In [105]:
left = pd.DataFrame({'country':['01','02','03'],'name':['Brazil','France','Japan']})
right =  pd.DataFrame({'country':['01','02','03'],'continent':['SA','EU','AS']})
print(left)
print(right)
df5 = pd.merge(left, right, how='inner', on='country')
df5

  country    name
0      01  Brazil
1      02  France
2      03   Japan
  country continent
0      01        SA
1      02        EU
2      03        AS


Unnamed: 0,country,name,continent
0,1,Brazil,SA
1,2,France,EU
2,3,Japan,AS


In [106]:
left_2 = pd.DataFrame({'key1':['K0','K1'],
                     'key2':['K0','K0'],
                     'A':['A00','A10'],
                     'B':['B00','B10']})
right_2 = pd.DataFrame({'key1':['K0','K1'],
                     'key2':['K0','K1'],
                     'C':['C00','C11'],
                     'D':['D00','D11']})
print(left_2)
print(right_2)
df6 = pd.merge(left_2, right_2, how='inner', on=['key1','key2'])
df6

  key1 key2    A    B
0   K0   K0  A00  B00
1   K1   K0  A10  B10
  key1 key2    C    D
0   K0   K0  C00  D00
1   K1   K1  C11  D11


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A00,B00,C00,D00


In [108]:
left_3 = pd.DataFrame({'key1':['K0','K1'],
                     'key2':['K0','K0'],
                     'A':['A00','A10'],
                     'B':['B00','B10']})
right_3 = pd.DataFrame({'key1':['K0','K1'],
                     'key2':['K0','K1'],
                     'C':['C00','C11'],
                     'D':['D00','D11']})
print(left_3)
print(right_3)
df7 = pd.merge(left_3, right_3, how='right', on=['key1','key2'])
df7

  key1 key2    A    B
0   K0   K0  A00  B00
1   K1   K0  A10  B10
  key1 key2    C    D
0   K0   K0  C00  D00
1   K1   K1  C11  D11


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A00,B00,C00,D00
1,K1,K1,,,C11,D11


In [109]:
df8 = pd.merge(left_3, right_3, how='outer', on=['key1','key2'])
df8

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A00,B00,C00,D00
1,K1,K0,A10,B10,,
2,K1,K1,,,C11,D11


In [113]:
left_4 = pd.DataFrame({'A':['A0', 'A1'], 'B':['B0', 'B1']}, index=['k0', 'k1'])
right_4 = pd.DataFrame({'C':['C0', 'C1'], 'D':['D0', 'D1']}, index=['k0', 'k2'])
print(left_4)
print(right_4)
left_4.join(right_4)

     A   B
k0  A0  B0
k1  A1  B1
     C   D
k0  C0  D0
k2  C1  D1


Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k1,A1,B1,,


In [114]:
left_4.join(right_4, how='inner')

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0


### Methods - https://pandas.pydata.org/pandas-docs/stable/reference/frame.html

In [115]:
final_df = pd.DataFrame({'A':[1, 2, 3, 4],'B':[222, 555, 244, 222],'C':['ab', 'cd', 'ef', 'gh']})
final_df

Unnamed: 0,A,B,C
0,1,222,ab
1,2,555,cd
2,3,244,ef
3,4,222,gh


In [116]:
final_df['B'].unique()

array([222, 555, 244])

In [117]:
print(len(final_df['B'].unique()), final_df['B'].nunique())

3 3


In [118]:
print(df7, '\n')
print(len(df7['B'].unique()), df7['B'].nunique())

  key1 key2    A    B    C    D
0   K0   K0  A00  B00  C00  D00
1   K1   K1  NaN  NaN  C11  D11 

2 1


In [119]:
final_df['B'].value_counts()

222    2
555    1
244    1
Name: B, dtype: int64

In [120]:
def pot(x,n=2):
    return x**n

In [121]:
final_df['A'].apply(pot)

0     1
1     4
2     9
3    16
Name: A, dtype: int64

In [122]:
final_df['C'].apply(len)

0    2
1    2
2    2
3    2
Name: C, dtype: int64

In [123]:
final_df['A'].apply(lambda x:x**2)

0     1
1     4
2     9
3    16
Name: A, dtype: int64

In [124]:
final_df.sort_values('B')

Unnamed: 0,A,B,C
0,1,222,ab
3,4,222,gh
2,3,244,ef
1,2,555,cd
