In [218]:
import pandas as pd
import numpy as np

# import warnings
# warnings.filterwarnings("ignore")  # to suppress all warnings

In [219]:
data = [1, 3, 5, 7, 9, 18]
data

[1, 3, 5, 7, 9, 18]

In [220]:
pd.DataFrame(data)

# Creating a DataFrame from a list (As no index and column names are specified, it gave numerical indexes by default)

Unnamed: 0,0
0,1
1,3
2,5
3,7
4,9
5,18


In [221]:
pd.DataFrame(data, columns=['column1'])

# Creating a DataFrame from the list and giving its column a name
# (The number of column names must equal the number of columns.)

Unnamed: 0,column1
0,1
1,3
2,5
3,7
4,9
5,18


In [222]:
# Let us remember how we define the name of a Series

pd.Series(data=data, name="column_1")

0     1
1     3
2     5
3     7
4     9
5    18
Name: column_1, dtype: int64

In [223]:
pd.DataFrame(data=data, index=["A", "B", "C", "D", "E", "F"], columns=["Colmn1"]) 

# Creating a DataFrame by giving index and column names from the list 
# (We should enter as many index names as we have rows in dataframe)

Unnamed: 0,Colmn1
A,1
B,3
C,5
D,7
E,9
F,18


In [224]:
data = np.arange(1, 24, 2).reshape(3, 4)
data

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [225]:
df = pd.DataFrame(data=data, columns=['var1','var2','var3','var4'])
df

# Creating a DataFrame from an array by defining column names. 
# Numeric indexes are assigned by default when no index name is given.
# The number of column names must equal the number of columns; if it differs, pandas raises an error.

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [226]:
s1 = np.random.randint(2, 10, size=4)
s2 = np.random.randint(3, 10, size=4)
s3 = np.random.randint(4, 15, size=4)

In [227]:
s1

array([7, 3, 6, 7])

In [228]:
s2

array([3, 6, 9, 3])

In [229]:
s3

array([11,  9, 11,  9])

In [230]:
# Creating a DataFrame from a dictionary: each key becomes a column name and each value (here an equal-length NumPy array) becomes that column's data.

myDict= {'var1':s1, 'var2':s2, 'var3':s3}

In [231]:
df = pd.DataFrame(myDict)
df

Unnamed: 0,var1,var2,var3
0,7,3,11
1,3,6,9
2,6,9,11
3,7,3,9


In [232]:
# DataFrame.head(n) returns the first n rows of a DataFrame or Series.

df.head(2)

Unnamed: 0,var1,var2,var3
0,7,3,11
1,3,6,9


In [233]:
df.head(3)

Unnamed: 0,var1,var2,var3
0,7,3,11
1,3,6,9
2,6,9,11


In [234]:
# DataFrame.tail(n) returns the last n rows of the object, based on position.

df.tail(2)

Unnamed: 0,var1,var2,var3
2,6,9,11
3,7,3,9


In [235]:
df.tail(1)

Unnamed: 0,var1,var2,var3
3,7,3,9


In [236]:
# DataFrame.sample(n) returns n randomly selected rows (or columns, with axis=1) of the DataFrame it is called on, so the result changes from run to run.

df.sample(2)

Unnamed: 0,var1,var2,var3
2,6,9,11
1,3,6,9


In [237]:
df.sample(2)

Unnamed: 0,var1,var2,var3
0,7,3,11
1,3,6,9


In [238]:
# The DataFrame.columns attribute returns the column labels of the DataFrame.

df.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [239]:
for i in df.columns:
    print(i)

var1
var2
var3


In [240]:
for i in df.columns:
    print(df[i].mean())  # We calculated the mean for each column using the column names.

5.75
5.25
10.0


In [241]:
df.mean()  # mean() method does the same job as above

var1     5.75
var2     5.25
var3    10.00
dtype: float64

In [242]:
# The DataFrame.index attribute returns the index (row labels) of the DataFrame.

df.index

RangeIndex(start=0, stop=4, step=1)

In [243]:
[i for i in df.index]

[0, 1, 2, 3]

In [244]:
df.columns = ['new1', 'new2', 'new3']
df

Unnamed: 0,new1,new2,new3
0,7,3,11
1,3,6,9
2,6,9,11
3,7,3,9


In [245]:
df.index = ["a", "b", "c", "d"]
df

Unnamed: 0,new1,new2,new3
a,7,3,11
b,3,6,9
c,6,9,11
d,7,3,9


In [246]:
# You can use the Pandas dataframe.rename() function to modify specific column names. 

df.rename(columns={"new1": "a", "new2": "b"})

Unnamed: 0,a,b,new3
a,7,3,11
b,3,6,9
c,6,9,11
d,7,3,9


In [247]:
# You can also modify index/columns names with df.rename() method. However, this change is not permanent. 
# The inplace parameter must be set to True for it to be permanent.

df.rename(index={"a": 1, "b": 2})

Unnamed: 0,new1,new2,new3
1,7,3,11
2,3,6,9
c,6,9,11
d,7,3,9


In [248]:
# So, be careful when using the rename method: because inplace=False is the default, it returned a renamed copy and left df itself unchanged.

df

Unnamed: 0,new1,new2,new3
a,7,3,11
b,3,6,9
c,6,9,11
d,7,3,9


In [249]:
df.shape

(4, 3)

In [250]:
df.shape[0]

4

In [251]:
df.shape[1]

3

In [252]:
# You can use the Python len() function to look at the number of rows,
# but using the shape attribute is a more logical and professional approach.

len(df)

4

In [253]:
df.ndim  # We used the ndim attribute to find the number of dimensions.

2

In [254]:
df.size  # We used the size attribute to find the total number of elements in the dataframe.

12

In [255]:
# Only the values in the DataFrame will be returned, the axes labels will be removed.

df.values

array([[ 7,  3, 11],
       [ 3,  6,  9],
       [ 6,  9, 11],
       [ 7,  3,  9]])

In [256]:
type(df)  # We checked the type of dataframe we created above

pandas.core.frame.DataFrame

In [257]:
type(df.values)  # We checked the type of values in the dataframe we created above

numpy.ndarray

In [258]:
type(df["new1"])  # We checked the type of a column/feature in the dataframe we created above

pandas.core.series.Series

In [259]:
# You can check any column name whether it belongs to the DataFrame or not

"new2" in df

True

In [260]:
'new5' in df

False

In [261]:
from numpy.random import randn

In [262]:
# Creating a DataFrame by "keyword arguments": the row labels and column labels
# are passed through the `index` and `columns` keywords.
#
# np.random.seed(101) puts NumPy's global pseudo-random generator into a fixed
# state, so the draws below are identical every time this cell is run.
# Without the seed, each run would produce different numbers.

np.random.seed(101)

# The fully qualified np.random.randn is used so the cell depends only on the
# `import numpy as np` at the top of the notebook.
df = pd.DataFrame(np.random.randn(5, 4), index='A B C D E'.split(), columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [263]:
df['Y']

# In this way, you can access the values and indexes of the column you want.

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [264]:
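# Attribute (dot) access (NOT RECOMMENDED!): it fails when a column name is not a valid Python identifier or clashes with a DataFrame attribute or method.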

df.Y

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [265]:
type(df['Y'])

pandas.core.series.Series

In [266]:
# If you want to see the output of any column/feature as a DataFrame, you can use double brackets.

df[['Y']]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [267]:
type(df[['Y']])

pandas.core.frame.DataFrame

In [268]:
# df['Z','X'] # gives an error of KeyError: ('Z', 'X')

df[['Z','X']]

# To select more than one column, column names should be given in square brackets.

Unnamed: 0,Z,X
A,0.503826,0.628133
B,0.605965,-0.319318
C,-0.589001,0.740122
D,0.955057,-0.758872
E,0.683509,1.978757


In [269]:
# Slicing with [] works on rows, so it searches the index for the range "X":"Z"; no row label falls in that range, which gives an empty DataFrame.

df["X":"Z"] 

Unnamed: 0,W,X,Y,Z


In [270]:
df['B':'D']

# It slices the rows from "B" to "D". The last row (D) is included because a label-based slice includes its end label, unlike a positional slice.

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [271]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [272]:
# df['C'] # gives error
# df['C','D'] # gives error

In [273]:
df["W"]["B"]

# df["W"] returns a pandas Series, which is why you can then look up a value by its index label ("B"). 
# If you select column "W" as a DataFrame (df[["W"]]), the second [] looks for a column named "B" and throws an error.

0.6511179479432686

In [274]:
df["X"]["C"]

0.7401220570561068

The indexing operator alone, as in df[ ], selects columns when given a label or a list of labels (like df["W"]) and selects rows only when given a slice (like df[:2]) or a boolean expression (like df[df["W"] > 0]). If you want to index by position, you need the **``.iloc``** attribute, like in df.iloc[0, :], which gives you the first row only, but all the columns.

In [275]:
df["A":"C"][["Y", "Z"]]

# df[["Y", "Z"]]["A":"C"]  # This gives the same result as well; the order of the row and column selections does not affect the result

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001


In [276]:
# Let's remember our df

df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [277]:
# feature engineering OR feature extraction

df['new1'] = df['X'] * df['Y']
df

Unnamed: 0,W,X,Y,Z,new1
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


To create a new column, the column name is assigned. While assigning, some operations are performed to the values on the existing columns/features and the new values are used to fill the new column created.

In [278]:
df["new2"] = np.arange(5)
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


## <p style="background-color:#9d4f8c; font-family:newtimeroman; color:#FFF9ED; font-size:175%; text-align:center; border-radius:10px 10px;">Removing Columns</p>

<a id="5"></a>
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" 
style="color:blue; background-color:#dfa8e4" data-toggle="popover">Content</a>

**[DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html)**

The **``drop()``** method removes the specified row or column. By specifying the column axis ( axis='columns' ), the drop() method removes the specified column. By specifying the row axis ( axis='index' ), the drop() method removes the specified row.

In [279]:
df.drop('new2', axis=1)

# We deleted a column ("new2") from DataFrame. The axis parameter must be set to 1; otherwise, the column will not be found. 
# The change is not permanent. To make it permanent, the inplace parameter must be set to True.

Unnamed: 0,W,X,Y,Z,new1
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [280]:
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


In [281]:
df.drop(["new1", "new2"], axis=1)

# To delete more than one column from DataFrame, column names must be written as a list.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [282]:
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


In [283]:
df.drop(columns=["new1", "new2"])

# We do not need to specify an axis when we give the column names as keyword arg.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [284]:
# It will NOT be permanent, unless inplace=True specified!

df.drop(["new1", "new2"], axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [285]:
df.drop('C', axis=0)

# We deleted a row ("C") from DataFrame. 
# Even if the axis parameter is not set to 0, the row(s) will be found and deleted since the default parameter is axis=0. 

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [286]:
df.drop(index=['B'])

# No need to specify axis when index parameter is defined

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [287]:
# the default value of axis is 0 (axis=0)

df_temp = df.drop('C', axis=0)
df_temp

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [288]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [289]:
data = np.random.randint(1, 40, size=(8, 4))

df = pd.DataFrame(data, columns=["var1", "var2", "var3", 'var4'])
df

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [290]:
data2 = np.random.randint(1,30, size=(4,3))
df1 = pd.DataFrame(data2, columns = ["col1", "col2", "col3"])
df1

Unnamed: 0,col1,col2,col3
0,12,28,11
1,18,4,29
2,4,20,21
3,16,15,27


In [291]:
df.loc[4]  # Returns the row whose index label is 4 as a Series --> the general form is loc[row_label, column_label]

var1    20
var2    36
var3    31
var4    11
Name: 4, dtype: int32

In [292]:
df.loc[[4]]  # df view is obtained with two square brackets

Unnamed: 0,var1,var2,var3,var4
4,20,36,31,11


In [293]:
# This slice returns the observations whose index labels run from 2 (inclusive) to 5 (inclusive). 
# The last index, 5, is INCLUSIVE at loc[]

df.loc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23


In [294]:
# This slice returns the observations at positions 2 (inclusive) to 5 (exclusive). 
# The last index, 5, is EXCLUSIVE at iloc[]

df.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11


In [295]:
# We changed index labels. The length of the assigned values should be the same as the df's length.

df.index = 'a b c d e f g h'.split()
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [296]:
# This slice returns the observations at positions 1 (inclusive) to 4 (exclusive). 
# The last index, 4, is EXCLUSIVE at iloc[]

df.iloc[1:4]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37


In [297]:
# df.loc[1:4]  # gives error

In [298]:
# If the index values are str (labeled), you can reach the rows you want by using loc[] and slicing in this way.

df.loc['c':'g']

Unnamed: 0,var1,var2,var3,var4
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23


In [299]:
df.loc['d','var3']  # Returns the value from the intersection of the column of "var3" and row "d"

25

In [300]:
df.iloc[3, 2]  # Returns the value at row position 3 and column position 2 (zero-based), i.e. the 4th row and the 3rd column

25

In [301]:
df.loc['d':'g', 'var2']  # Returns the values in the "var2" column of rows "d" to "g"

d    30
e    36
f    28
g    24
Name: var2, dtype: int32

In [302]:
df.loc['d':'g']['var3']  # Returns the values in the "var3" column of rows "d" to "g"

d    25
e    31
f     9
g    38
Name: var3, dtype: int32

In [303]:
# How can we select these data as a DataFrame not a Series?
# First way

df.loc['d':'g'][['var3']]

Unnamed: 0,var3
d,25
e,31
f,9
g,38


In [304]:
# Second way

df.loc['d':'g', ["var3"]]

Unnamed: 0,var3
d,25
e,31
f,9
g,38


In [305]:
df.loc['d':'g'][["var2", "var3"]]  # Returns the values in both "var2" and "var3" columns of rows "d" to "g"

Unnamed: 0,var2,var3
d,30,25
e,36,31
f,28,9
g,24,38


In [306]:
df.iloc[2:5, 2]  # Returns the values at row positions 2, 3 and 4 (5 is excluded) of the column at position 2

c    12
d    25
e    31
Name: var3, dtype: int32

In [307]:
df.iloc[2:5, [2]]  # Returns the values at row positions 2, 3 and 4 (5 is excluded) of the column at position 2, as a DataFrame

Unnamed: 0,var3
c,12
d,25
e,31


In [308]:
# df.iloc[2:5][[2]]  # gives error

# It doesn't work that way. Because after selecting the rows, we exited iloc[]. We called the column later. 
# Instead, the following (we must now call the column by its name (label)) is used.

df.iloc[2:5][['var3']]

Unnamed: 0,var3
c,12
d,25
e,31


In [309]:
df.iloc[2:5][['var3']]

Unnamed: 0,var3
c,12
d,25
e,31


In [310]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [311]:
df.loc['a', 'var1']

8

In [312]:
# let's select the same data as a DataFrame

df.loc[['a'], ['var1']]

Unnamed: 0,var1
a,8


In [313]:
# To print the output as a DataFrame, we must write the row and column labels in square brackets.

df.loc[['a', 'c'], ['var1', 'var3']]

Unnamed: 0,var1,var3
a,8,39
c,13,12


In [314]:
df.iloc[[0, 2], [0, 2]]

Unnamed: 0,var1,var3
a,8,39
c,13,12


In [315]:
# Let us remember our df

df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [316]:
# Returns the result of the conditional statement as a DataFrame of bool values; however, the output is hard to read

df > 10

Unnamed: 0,var1,var2,var3,var4
a,False,True,True,False
b,True,False,True,False
c,True,True,True,True
d,True,True,True,True
e,True,True,True,True
f,True,True,False,True
g,True,True,True,True
h,False,False,True,True


In [317]:
df[df > 10]

# Returns the values in DataFrame for those that meet the condition, and NaN for those that don't. 
# Since NaN is a float value, every column that contains NaN was converted to float.

Unnamed: 0,var1,var2,var3,var4
a,,11.0,39.0,
b,19.0,,16.0,
c,13.0,18.0,12.0,16.0
d,34.0,30.0,25.0,37.0
e,20.0,36.0,31.0,11.0
f,21.0,28.0,,23.0
g,27.0,24.0,38.0,23.0
h,,,19.0,29.0


In [318]:
# It returns the observations (rows) that meets the condition for "var1" column

df[df['var1'] > 10]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23


In [319]:
# It returns the observations (rows) in "var2" column in accordance with meeting the condition for "var1" column

df[df['var1'] > 10]['var2']

b     8
c    18
d    30
e    36
f    28
g    24
Name: var2, dtype: int32

In [320]:
# It returns the observations (rows) in "var2" column as a DataFrame in accordance with meeting the condition for "var1" column

df[df['var1'] > 10][['var2']]

Unnamed: 0,var2
b,8
c,18
d,30
e,36
f,28
g,24


In [321]:
# It returns the observations (rows) in both "var2" and "var3" columns as a DataFrame in accordance with meeting the condition for "var1" column

df[df['var1'] > 10][['var2', "var3"]]

Unnamed: 0,var2,var3
b,8,16
c,18,12
d,30,25
e,36,31
f,28,9
g,24,38


In [322]:
# Let us remember our df

df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [323]:
# When there is more than one condition, each condition should be used in separate parentheses.

df[(df['var1'] > 10) & (df['var1'] < 20)]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16


In [324]:
df.loc[(df["var1"] > 10), ['var2', 'var3']]

Unnamed: 0,var2,var3
b,8,16
c,18,12
d,30,25
e,36,31
f,28,9
g,24,38


In [325]:
# Using conditionals with loc[]: Returns "var2" and "var3" columns that match the condition created for "var1" column. 
# Multiple conditions can also be used.

df.loc[((df["var1"] < 10) | (df["var1"] > 30)), ['var2','var3']]

Unnamed: 0,var2,var3
a,11,39
d,30,25


In [326]:
df.loc[((df["var2"] < 15) | (df["var2"] > 25)), ['var2','var3']]

Unnamed: 0,var2,var3
a,11,39
b,8,16
d,30,25
e,36,31
f,28,9
h,3,19


In [327]:
# Let's remember our df

df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [328]:
# reset_index() method resets the index of the DataFrame, and use the default one instead. 
# If the DataFrame has a MultiIndex, this method can remove one or more levels.

df.reset_index()

# Notice that it makes the values in the old index a new column

Unnamed: 0,index,var1,var2,var3,var4
0,a,8,11,39,10
1,b,19,8,16,1
2,c,13,18,12,16
3,d,34,30,25,37
4,e,20,36,31,11
5,f,21,28,9,23
6,g,27,24,38,23
7,h,10,3,19,29


In [329]:
# If we do not want to see the old index as a new column, we can set the drop parameter to True

df.reset_index(drop=True)

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [330]:
# If we want the change to be permanent, we must set the "inplace" parameter to True

df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [331]:
# set_index() method set the DataFrame index (row labels) using one or more existing columns or arrays (of the correct length). 
# The index can replace the existing index or expand on it.

df.set_index('var4')

Unnamed: 0_level_0,var1,var2,var3
var4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,8,11,39
1,19,8,16
16,13,18,12
37,34,30,25
11,20,36,31
23,21,28,9
23,27,24,38
29,10,3,19


In [332]:
# Makes "var4" column a new index permanently

df.set_index('var4', inplace=True)
df

Unnamed: 0_level_0,var1,var2,var3
var4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,8,11,39
1,19,8,16
16,13,18,12
37,34,30,25
11,20,36,31
23,21,28,9
23,27,24,38
29,10,3,19


In [333]:
# Index Levels
# `outside` holds the outer-level labels and `inside` the inner-level labels;
# pairing them position by position yields one (outer, inner) tuple per row.

outside = ['M1'] * 3 + ['M2'] * 3 + ['M3'] * 3
inside = [1, 2, 3] * 2 + [5, 6, 7]
multi_index = [(outer, inner) for outer, inner in zip(outside, inside)]
multi_index

[('M1', 1),
 ('M1', 2),
 ('M1', 3),
 ('M2', 1),
 ('M2', 2),
 ('M2', 3),
 ('M3', 5),
 ('M3', 6),
 ('M3', 7)]

In [334]:
hier_index = pd.MultiIndex.from_tuples(multi_index)
hier_index

# We created a MultiIndex from a list of tuples with the MultiIndex.from_tuples() method

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           )

In [335]:
np.random.seed(101)

df = pd.DataFrame(np.random.randn(9, 4), index=hier_index, columns=['A', 'B', 'C', 'D'])
df

# We created a multi-index df

Unnamed: 0,Unnamed: 1,A,B,C,D
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502
M3,6,0.807706,0.07296,0.638787,0.329646
M3,7,-0.497104,-0.75407,-0.943406,0.484752


In [336]:
df.index.names

# We wanted to see the names of the index columns, but it gave None because there is no name at the moment.

FrozenList([None, None])

In [337]:
df.index.names = ['Group', 'Num']
df.index.names

# We have assigned index names. Respectively, "Group" for the outer level and "Num" for the inner level were given as the index name.

FrozenList(['Group', 'Num'])

In [338]:
# After this assignment, let's check the names of each Multiindex level in the DataFrame

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502
M3,6,0.807706,0.07296,0.638787,0.329646
M3,7,-0.497104,-0.75407,-0.943406,0.484752


In [339]:
# Since there are two level indexes, index attribute returns index values as pairs in a tuple.

df.index

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           names=['Group', 'Num'])

In [340]:
df.index.levels

# We saw the unique list of index values in each level

FrozenList([['M1', 'M2', 'M3'], [1, 2, 3, 5, 6, 7]])

In [341]:
df.index.get_level_values(0)

# We've seen all the names at index level 0

Index(['M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M3', 'M3', 'M3'], dtype='object', name='Group')

In [342]:
df.index.get_level_values("Group")

# We have seen all the values in the index named "Group"

Index(['M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M3', 'M3', 'M3'], dtype='object', name='Group')

Now let's show how to index this! For an index hierarchy on the rows we use **``df.loc[]``**; if the hierarchy were on the columns axis, you would just use normal bracket notation **``df[]``**. Calling one level of the index returns the sub-dataframe:

In [343]:
df[["A"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A
Group,Num,Unnamed: 2_level_1
M1,1,2.70685
M1,2,0.651118
M1,3,-2.018168
M2,1,0.188695
M2,2,0.190794
M2,3,0.302665
M3,5,-0.134841
M3,6,0.807706
M3,7,-0.497104


In [344]:
df[["A", "B"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
M1,1,2.70685,0.628133
M1,2,0.651118,-0.319318
M1,3,-2.018168,0.740122
M2,1,0.188695,-0.758872
M2,2,0.190794,1.978757
M2,3,0.302665,1.693723
M3,5,-0.134841,0.390528
M3,6,0.807706,0.07296
M3,7,-0.497104,-0.75407


In [345]:
df.loc['M1']

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001


In [346]:
df.loc[("M1", 2)]

A    0.651118
B   -0.319318
C   -0.848077
D    0.605965
Name: (M1, 2), dtype: float64

In [347]:
df.loc[[("M1", 2)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965


In [348]:
df.loc["M1", "A":"C"]

Unnamed: 0_level_0,A,B,C
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.70685,0.628133,0.907969
2,0.651118,-0.319318,-0.848077
3,-2.018168,0.740122,0.528813


In [349]:
df.loc[[("M1", 2)], "A":"C"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,2,0.651118,-0.319318,-0.848077


In [350]:
df.loc["M1":"M2"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [351]:
df.loc[("M1", 2):("M2", 3)]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [352]:
df.loc[("M1", 2): "M2"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [353]:
df.loc[[("M2", 3), ("M3", 5)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502


In [354]:
import seaborn as sns

In [355]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [131]:
df = sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [358]:
df1 = sns.load_dataset('titanic')

df1

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [132]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [133]:
df.shape

(150, 5)

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [135]:
df.sample(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
30,4.8,3.1,1.6,0.2,setosa
66,5.6,3.0,4.5,1.5,versicolor
117,7.7,3.8,6.7,2.2,virginica
96,5.7,2.9,4.2,1.3,versicolor


In [136]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [137]:
# df.describe().T

df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [138]:
df.describe(include="all")

# "number" and "object" can be used as include/exclude parameter

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [139]:
# Pairwise correlation between the numeric columns.
# The string column "species" is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises when corr() meets string data.
df.select_dtypes(include="number").corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [140]:
# Correlation of every numeric column with sepal_length, kept as a one-column DataFrame.
# The string column "species" is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises when corr() meets string data.
df.select_dtypes(include="number").corr()[["sepal_length"]]

Unnamed: 0,sepal_length
sepal_length,1.0
sepal_width,-0.11757
petal_length,0.871754
petal_width,0.817941


In [141]:
df['petal_length'].corr(df["petal_width"])

0.962865431402796

In [142]:
# Number of rows per species; dropna=False also counts missing values as their own group.
# Bracket access is used instead of df.species because attribute access breaks when a
# column name clashes with a DataFrame attribute or is not a valid identifier.
df["species"].value_counts(dropna=False)

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [143]:
df['species'].value_counts(dropna=False, normalize=True)

setosa        0.333333
versicolor    0.333333
virginica     0.333333
Name: species, dtype: float64

In [144]:
# Distinct values of the species column (bracket access avoids name clashes with DataFrame attributes).
df["species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [145]:
# Number of distinct values in the species column (bracket access avoids name clashes with DataFrame attributes).
df["species"].nunique()

3

In [146]:
df.loc[df["species"] == "setosa", "sepal_length"]

0     5.1
1     4.9
2     4.7
3     4.6
4     5.0
5     5.4
6     4.6
7     5.0
8     4.4
9     4.9
10    5.4
11    4.8
12    4.8
13    4.3
14    5.8
15    5.7
16    5.4
17    5.1
18    5.7
19    5.1
20    5.4
21    5.1
22    4.6
23    5.1
24    4.8
25    5.0
26    5.0
27    5.2
28    5.2
29    4.7
30    4.8
31    5.4
32    5.2
33    5.5
34    4.9
35    5.0
36    5.5
37    4.9
38    4.4
39    5.1
40    5.0
41    4.5
42    4.4
43    5.0
44    5.1
45    4.8
46    5.1
47    4.6
48    5.3
49    5.0
Name: sepal_length, dtype: float64

In [147]:
# Rows whose sepal_length lies strictly between 4 and 5.
# Each condition sits in its own parentheses because & binds tighter than the comparisons.
df[(df["sepal_length"] > 4) & (df["sepal_length"] < 5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
11,4.8,3.4,1.6,0.2,setosa
12,4.8,3.0,1.4,0.1,setosa
13,4.3,3.0,1.1,0.1,setosa
22,4.6,3.6,1.0,0.2,setosa


In [148]:
# Virginica rows whose sepal_length lies strictly between 4 and 5.
# Each condition sits in its own parentheses because & binds tighter than the comparisons.
df[(df["species"] == "virginica") & (df["sepal_length"] > 4) & (df["sepal_length"] < 5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
106,4.9,2.5,4.5,1.7,virginica


In [149]:
df.sort_values(by='sepal_length', ascending=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
42,4.4,3.2,1.3,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,virginica
118,7.7,2.6,6.9,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica
135,7.7,3.0,6.1,2.3,virginica
