# Series

In [1]:
import numpy as np
import pandas as pd

## create series from array

In [2]:
# 10 random integers drawn from [1, 100) -- the upper bound is exclusive.
# No seed is set in this cell, so the values differ on every run.
random_int_arr = np.random.randint(low=1, high=100, size=10)

In [3]:
# wrap the array in a Series; with no index given, the labels are the default 0..n-1
pd.Series(random_int_arr)

0    66
1    22
2    35
3    17
4    75
5    26
6    44
7     4
8    95
9     3
dtype: int64

## create series from dictionary

In [5]:
# hero name -> score; the keys become the index labels of the Series built from it
score = dict(batman=100, superman=200, ironman=400, spiderman=500)

In [6]:
# dictionary keys become the index labels and the values become the data
pd.Series(score)

batman       100
superman     200
ironman      400
spiderman    500
dtype: int64

## create series from array of values and index

In [11]:
# the data and the matching index labels -- one label per value, in the same order
values = [10, 20, 30, 40]
index = list("abcd")

In [12]:
# name the arguments so it is clear which list is the data and which is the index
pd.Series(data=values, index=index)

a    10
b    20
c    30
d    40
dtype: int64

## add two series

In [81]:
# Arithmetic aligns the two Series on their index labels: labels present in both
# are added; a label found in only one of them ("d", "e") gives NaN, so the
# result is float64.
s1 = pd.Series([1, 2, 3, 8], index=["a", "b", "c", "e"])
s2 = pd.Series([4, 5, 6, 7], index=["a", "b", "c", "d"])
s1 + s2

a    5.0
b    7.0
c    9.0
d    NaN
e    NaN
dtype: float64

## multiply two series

In [85]:
# Multiplication aligns on index labels: labels present in both Series are
# multiplied; a label found in only one of them ("d", "e") gives NaN.
s1 = pd.Series([1, 2, 3, 8], index=["a", "b", "c", "e"])
s2 = pd.Series([4, 5, 6, 7], index=["a", "b", "c", "d"])
s1 * s2

a     4.0
b    10.0
c    18.0
d     NaN
e     NaN
dtype: float64

## subtract two series

In [86]:
# Subtraction aligns on index labels: for labels present in both Series the s2
# value is subtracted from the s1 value; a label found in only one gives NaN.
s1 = pd.Series([1, 2, 3, 8], index=["a", "b", "c", "e"])
s2 = pd.Series([4, 5, 6, 7], index=["a", "b", "c", "d"])
s1 - s2

a   -3.0
b   -3.0
c   -3.0
d    NaN
e    NaN
dtype: float64

## divide two series

In [84]:
# Division aligns on index labels: for labels present in both Series the s1
# value is divided by the s2 value; a label found in only one gives NaN.
s1 = pd.Series([1, 2, 3, 8], index=["a", "b", "c", "e"])
s2 = pd.Series([4, 5, 6, 7], index=["a", "b", "c", "d"])
s1 / s2

a    0.25
b    0.40
c    0.50
d     NaN
e     NaN
dtype: float64

# DataFrames

In [19]:
# 5x5 array of random integers drawn from [1, 100) -- the upper bound is exclusive.
# Asking for the 2-D shape directly is equivalent to drawing 25 values and reshaping.
data = np.random.randint(low=1, high=100, size=(5, 5))
data

array([[15, 23, 57, 83, 63],
       [74, 87, 22, 81, 50],
       [86, 54, 51, 10, 69],
       [14, 34, 60, 11, 67],
       [85, 84, 74, 21, 20]])

In [21]:
index = [f"player{n}" for n in range(1, 6)]  # row labels: player1 .. player5
columns = ["I", "II", "III", "IV", "V"]  # column labels

In [25]:
# build the frame from the 2-D array; name the arguments so rows and columns are not mixed up
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50
player3,86,54,51,10,69
player4,14,34,60,11,67
player5,85,84,74,21,20


In [26]:
# get a single column; selecting with one label returns a Series named after the column
df['I']

player1    15
player2    74
player3    86
player4    14
player5    85
Name: I, dtype: int64

In [40]:
# get multiple columns: pass a list of labels (note the double brackets); the result is a DataFrame
df[['I','II']]

Unnamed: 0,I,II
player1,15,23
player2,74,87
player3,86,54
player4,14,34
player5,85,84


In [41]:
# get a row by its index label; the row comes back as a Series keyed by column name
df.loc['player1']

I      15
II     23
III    57
IV     83
V      63
Name: player1, dtype: int64

In [43]:
# get a row by integer position (0-based) rather than by label; position 0 is 'player1' here
df.iloc[0]

I      15
II     23
III    57
IV     83
V      63
Name: player1, dtype: int64

In [44]:
# get multiple rows by passing a list of index labels to .loc
df.loc[['player1','player2']]

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50


In [45]:
# get a subset of rows and columns in one call: .loc[row_labels, column_labels]
df.loc[['player1','player2'],['I','II']]

Unnamed: 0,I,II
player1,15,23
player2,74,87


## add/delete columns

In [37]:
# add a new column 'VI' holding the element-wise sum of columns 'I' and 'II'
# (assigning to a new label modifies df itself)
df['VI'] = df['I'] + df['II']
df

Unnamed: 0,I,II,III,IV,V,VI
player1,15,23,57,83,63,38
player2,74,87,22,81,50,161
player3,86,54,51,10,69,140
player4,14,34,60,11,67,48
player5,85,84,74,21,20,169


In [38]:
# to remove a column, pass axis=1; the default axis=0 looks for a row label instead.
# drop() returns a new DataFrame -- df itself still contains 'VI' after this cell.
df.drop('VI', axis=1)

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50
player3,86,54,51,10,69
player4,14,34,60,11,67
player5,85,84,74,21,20


In [39]:
# drop() returns a new DataFrame and leaves the original untouched, so rebind df
# to the result to make the removal stick. (inplace=True would also remove the
# column, but reassignment is the recommended style and keeps calls chainable.)
df = df.drop('VI', axis=1)
df

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50
player3,86,54,51,10,69
player4,14,34,60,11,67
player5,85,84,74,21,20


## boolean dataframe

In [46]:
# element-wise comparison: returns a DataFrame of booleans, True where the value is greater than 50
df > 50

Unnamed: 0,I,II,III,IV,V
player1,False,False,True,True,True
player2,True,True,False,True,False
player3,True,True,True,False,True
player4,False,False,True,False,True
player5,True,True,True,False,False


In [48]:
# indexing with a boolean DataFrame keeps the original shape: every position where
# the mask is False becomes NaN (the remaining values are then shown as floats)
df[df > 50]

Unnamed: 0,I,II,III,IV,V
player1,,,57.0,83.0,63.0
player2,74.0,87.0,,81.0,
player3,86.0,54.0,51.0,,69.0
player4,,,60.0,,67.0
player5,85.0,84.0,74.0,,


## get all index labels (e.g. player1, player2) whose value in a given column (e.g. I) is greater than 50

In [67]:
# the data frame that the following cells filter
df

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50
player3,86,54,51,10,69
player4,14,34,60,11,67
player5,85,84,74,21,20


In [68]:
# boolean Series for column I: True for every row whose value in I is greater than 50
df['I'] > 50

player1    False
player2     True
player3     True
player4    False
player5     True
Name: I, dtype: bool

In [69]:
# use the boolean Series as a row filter: keeps only the rows (players) whose
# score in column I is greater than 50
df[df['I'] > 50]

Unnamed: 0,I,II,III,IV,V
player2,74,87,22,81,50
player3,86,54,51,10,69
player5,85,84,74,21,20


In [70]:
# select the matching rows and column I in a single .loc call; this avoids the
# chained indexing of df[mask]['I'], which pandas discourages
df.loc[df['I'] > 50, 'I']

player2    74
player3    86
player5    85
Name: I, dtype: int64

## get all index labels (e.g. player1, player2) whose values in several given columns (e.g. I and IV) are all greater than 50

In [71]:
# the data frame we are filtering, shown again for reference
df

Unnamed: 0,I,II,III,IV,V
player1,15,23,57,83,63
player2,74,87,22,81,50
player3,86,54,51,10,69
player4,14,34,60,11,67
player5,85,84,74,21,20


In [72]:
# boolean Series: True for players whose score in column I is greater than 50
bool_series_game_I = df['I'] > 50
bool_series_game_I

player1    False
player2     True
player3     True
player4    False
player5     True
Name: I, dtype: bool

In [74]:
# boolean Series: True for players whose score in column IV is greater than 50
bool_series_game_IV = df['IV'] > 50
bool_series_game_IV

player1     True
player2     True
player3    False
player4    False
player5    False
Name: IV, dtype: bool

In [75]:
# element-wise AND of the two masks: True only for rows (players) whose score is
# greater than 50 in both column I and column IV
bool_series_game_I & bool_series_game_IV

player1    False
player2     True
player3    False
player4    False
player5    False
dtype: bool

In [76]:
# filter the rows: keep only those where the combined mask is True
df[bool_series_game_I & bool_series_game_IV]

Unnamed: 0,I,II,III,IV,V
player2,74,87,22,81,50


In [79]:
# filter the rows with the combined mask and pick columns I and IV in a single
# .loc call (avoids chained indexing)
df.loc[bool_series_game_I & bool_series_game_IV, ['I', 'IV']]

Unnamed: 0,I,IV
player2,74,81


In [80]:
# rows where both column I and column IV exceed 50, restricted to those two columns,
# in a single .loc call (avoids chained indexing). Each comparison needs its own
# parentheses because & binds more tightly than >.
df.loc[(df['I'] > 50) & (df['IV'] > 50), ['I', 'IV']]

Unnamed: 0,I,IV
player2,74,81


## Missing values

In [94]:
# 5x3 frame with some missing entries (NaN) for the dropna/fillna examples.
# np.nan is a float, so the NumPy array -- and therefore every column -- is float64.
index = [f"player{n}" for n in range(1, 6)]  # row labels
columns = ["I", "II", "III"]  # column labels
data = [
    [60, 70, 90],
    [np.nan, 20, np.nan],
    [40, 20, 30],
    [50, 20, 90],
    [60, 40, np.nan],
]
df = pd.DataFrame(data=np.array(data), index=index, columns=columns)
df

Unnamed: 0,I,II,III
player1,60.0,70.0,90.0
player2,,20.0,
player3,40.0,20.0,30.0
player4,50.0,20.0,90.0
player5,60.0,40.0,


In [95]:
# drop every row that has at least one NaN value (returns a new DataFrame; df is unchanged)
df.dropna()

Unnamed: 0,I,II,III
player1,60.0,70.0,90.0
player3,40.0,20.0,30.0
player4,50.0,20.0,90.0


In [96]:
# drop every column that has at least one NaN value (axis=1 targets columns)
df.dropna(axis=1)

Unnamed: 0,II
player1,70.0
player2,20.0
player3,20.0
player4,20.0
player5,40.0


In [99]:
# replace every missing value (NaN) with 0
df.fillna(value=0)

Unnamed: 0,I,II,III
player1,60.0,70.0,90.0
player2,0.0,20.0,0.0
player3,40.0,20.0,30.0
player4,50.0,20.0,90.0
player5,60.0,40.0,0.0


## GroupBy

In [101]:
# five rows, two columns: a team name and a score (teams repeat so they can be grouped)
data = dict(
    team=["apple", "mango", "apple", "mango", "banana"],
    score=[10, 50, 20, 90, 20],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,team,score
0,apple,10
1,mango,50
2,apple,20
3,mango,90
4,banana,20


In [102]:
# group the rows by the 'team' column; this only builds a DataFrameGroupBy object --
# the per-team numbers come from calling an aggregation such as sum() or mean() on it
groupby_team = df.groupby('team')
groupby_team

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd082918bb0>

In [103]:
# total score per team
groupby_team.sum()

Unnamed: 0_level_0,score
team,Unnamed: 1_level_1
apple,30
banana,20
mango,140


In [107]:
# average score per team
groupby_team.mean()

Unnamed: 0_level_0,score
team,Unnamed: 1_level_1
apple,15.0
banana,20.0
mango,70.0


In [104]:
# lowest score per team
groupby_team.min()

Unnamed: 0_level_0,score
team,Unnamed: 1_level_1
apple,10
banana,20
mango,50


In [105]:
# highest score per team
groupby_team.max()

Unnamed: 0_level_0,score
team,Unnamed: 1_level_1
apple,20
banana,20
mango,90


In [106]:
# average score per team
# NOTE(review): this repeats the mean() cell shown earlier -- one of the two can likely be removed
groupby_team.mean()

Unnamed: 0_level_0,score
team,Unnamed: 1_level_1
apple,15.0
banana,20.0
mango,70.0
