In [19]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset('iris')
# more datasets available at seaborn github
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
df.describe() # generate descriptive statistics

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
# only mean
df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].mean()

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [11]:
# count of unique values in the 'species' column
df['species'].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [12]:
# aggregrate values

df.groupby(by = 'species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [13]:
df.groupby(by='species').agg(func = {'sepal_length':['mean', 'median'], 'sepal_width':'sum', 'petal_length':['count','max']})

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_width,petal_length,petal_length
Unnamed: 0_level_1,mean,median,sum,count,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
setosa,5.006,5.0,171.4,50,1.9
versicolor,5.936,5.9,138.5,50,5.1
virginica,6.588,6.5,148.7,50,6.9


In [21]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [20]:
df.loc[::5, 'sepal_length']

0      5.1
5      5.4
10     5.4
15     5.7
20     5.4
25     5.0
30     4.8
35     5.0
40     5.0
45     4.8
50     7.0
55     5.7
60     5.0
65     6.7
70     5.9
75     6.6
80     5.5
85     6.0
90     5.5
95     5.7
100    6.3
105    7.6
110    6.5
115    6.4
120    6.9
125    7.2
130    7.4
135    7.7
140    6.7
145    6.7
Name: sepal_length, dtype: float64

In [22]:
df.loc[::5, 'sepal_length'] = None

In [23]:
df.loc[::5, 'sepal_length']

0     NaN
5     NaN
10    NaN
15    NaN
20    NaN
25    NaN
30    NaN
35    NaN
40    NaN
45    NaN
50    NaN
55    NaN
60    NaN
65    NaN
70    NaN
75    NaN
80    NaN
85    NaN
90    NaN
95    NaN
100   NaN
105   NaN
110   NaN
115   NaN
120   NaN
125   NaN
130   NaN
135   NaN
140   NaN
145   NaN
Name: sepal_length, dtype: float64

In [24]:
df.isnull().sum()

sepal_length    30
sepal_width      0
petal_length     0
petal_width      0
species          0
dtype: int64

### Impute missing value

In [25]:
## Best practice

df['sepal_length'].describe()

count    120.000000
mean       5.799167
std        0.812041
min        4.300000
25%        5.100000
50%        5.800000
75%        6.325000
max        7.900000
Name: sepal_length, dtype: float64

In [27]:
df['sepal_length'].fillna(df['sepal_length'].median(), inplace = True) # choose median over mean

In [28]:
df['sepal_length'].describe()

count    150.000000
mean       5.799333
std        0.725702
min        4.300000
25%        5.225000
50%        5.800000
75%        6.275000
max        7.900000
Name: sepal_length, dtype: float64

### Covariance and corelation

In [30]:
# Exclude non-numeric columns
numeric_df = df.select_dtypes(include='number')

numeric_df.cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,0.526644,-0.041774,0.995341,0.39651
sepal_width,-0.041774,0.189979,-0.329656,-0.121639
petal_length,0.995341,-0.329656,3.116278,1.295609
petal_width,0.39651,-0.121639,1.295609,0.581006


In [32]:
numeric_df.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.132066,0.776954,0.716811
sepal_width,-0.132066,1.0,-0.42844,-0.366126
petal_length,0.776954,-0.42844,1.0,0.962865
petal_width,0.716811,-0.366126,0.962865,1.0


### outlier detection mathematically
-  using std deviation method

In [42]:
mean_value = numeric_df.mean()
std_value = numeric_df.std()

threshold = 3

outliers = numeric_df[(numeric_df-mean_value).abs() > threshold + std_value]

outliers

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
145,,,,
146,,,,
147,,,,
148,,,,


- IQR

In [45]:
q1 = numeric_df.quantile(0.25)
q3 = numeric_df.quantile(0.75)

iqr = q3 - q1

ll = q1 - 1.5 * iqr
ul = q3 + 1.5 * iqr

outliers = numeric_df[(numeric_df < ll) | (numeric_df > ul)]
outliers.dropna(thresh = 1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
15,,4.4,,
32,,4.1,,
33,,4.2,,
60,,2.0,,
131,7.9,,,


### Treating outliers

In [50]:
df = df.clip(lower = ll, upper = ul, axis=0) # axis = 0 => update rows

In [51]:
numeric_df = df.select_dtypes(include='number')

q1 = numeric_df.quantile(0.25)
q3 = numeric_df.quantile(0.75)

iqr = q3 - q1

ll = q1 - 1.5 * iqr
ul = q3 + 1.5 * iqr

outliers = numeric_df[(numeric_df < ll) | (numeric_df > ul)]
outliers.dropna(thresh = 1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
