In [1]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# EDA for the first time

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

# Iris dataset - Read the dataset from a file using Pandas

In [3]:
filename = "data/iris-data.csv"
df = pd.read_csv(filename, sep='\t')
df.head()

Unnamed: 0,"sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class"
0,"5.1,3.5,1.4,0.2,Iris-setosa"
1,"4.9,3,1.4,0.2,Iris-setosa"
2,"4.7,3.2,1.3,0.2,Iris-setosa"
3,"4.6,3.1,1.5,0.2,Iris-setosa"
4,"5,3.6,1.4,0.2,Iris-setosa"


**Some problem?** Yes, data are not in colums as we expected.

Here some magic Jupyter commands https://ipython.readthedocs.io/en/stable/interactive/magics.html#

In [4]:
# For linux
%%bash head data/iris-data.csv

UsageError: Line magic function `%%bash` not found.


In [5]:
# For Windows (Powershell)
pwd = %pwd
!Powershell.exe -Command "Get-Content {pwd}\data\iris-data.csv -Head 5"

sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa


In [6]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
df = pd.read_csv(filename, sep=',')
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Identify problems in data

**The word `class` as a column name ... hm ... Is it one of the reserved Keywords in Python?**

In [7]:
df.class.unique()

SyntaxError: invalid syntax (1430100745.py, line 1)

**How can I write a such code correctly?**

In [8]:
df['class'].unique()

array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
       'Iris-virginica'], dtype=object)

**Rename the `class` column?**

In [9]:
df.rename(columns = {'class':'species'}, inplace = True)
df.species.unique()

array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
       'Iris-virginica'], dtype=object)

**Strange values, which look like human mistakes? Rename them? This operation *rename* can be dangerous**

In [10]:
df['species'] = df['species'].str.replace('Iris-setossa','setosa')
df['species'] = df['species'].str.replace('Iris-setosa','setosa')
df['species'] = df['species'].str.replace('Iris-versicolor','versicolor')
df['species'] = df['species'].str.replace('Iris-virginica','virginica')

In [11]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

**Shorter column names?**

In [None]:
df = df.rename({'sepal_length_cm': 'sepal_length', 'sepal_width_cm': 'sepal_width', 'petal_length_cm': 'petal_length', 'petal_width_cm': 'petal_width'}, axis='columns')
df

In [None]:
df.groupby('species').size()

**Missing values? NaN values?**

In [None]:
df.shape[0]

In [None]:
df.dropna().shape[0]

In [None]:
df.shape[0] - df.dropna().shape[0]

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
df[df.isnull().any(axis=1)]

## Save the dataframe to a file ...

In [None]:
df.to_csv('data/iris-data-output.tsv', sep='\t', index=False, encoding='utf-8')

# Visualization: Iris dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [None]:
iris = sns.load_dataset("iris")

### Describe the data together with their characteristics = Descriptive statistics = Deskriptívna štatistika

In [None]:
iris.shape

In [None]:
print(iris.head(10))

In [None]:
iris.info()

In [None]:
iris.describe()

In [None]:
iris.species.unique()

In [None]:
iris.groupby('species').size()

**Univariate analysis** (Mean, Median, Modus, Variance, Standard Deviation) = **Analýza jednotlivých atribútov**

In [None]:
iris['petal_length'].mean()

In [None]:
iris['petal_length'].median()

In [None]:
# Most common value in the given distribution (array)
stats.mode(iris['petal_length'])

In [None]:
# Compute the variance along the specified axis
np.var(iris['petal_length'])

In [None]:
# Compute the standard deviation along the specified axis.
np.std(iris['petal_length'])

### Variance vs Standard deviation

__Variance is the average squared deviations from the mean__ 

__Standard deviation is the square root of this number.__ 

Both measures reflect variability in a distribution, but their units differ:

 - Standard deviation is expressed in the same units as the original values (e.g., minutes or meters).
 - Variance is expressed in much larger units (e.g., meters squared).
 
Although the units of variance are harder to intuitively understand, variance is important in __statistical tests__.

### Formulate and verify data hypotheses = Data visualization + inference statistics

__How to read BoxPlot__: 
 - https://www.wellbeingatschool.org.nz/information-sheet/understanding-and-interpreting-box-plots
 - https://builtin.com/data-science/boxplot

In [None]:
sns.boxplot(data=iris, x="sepal_length", y="species")

In [None]:
sns.boxplot(data=iris, x="petal_length", y="species")

In [None]:
iris.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)

In [None]:
iris.hist()

In [None]:
# sns.distplot(iris['petal_length'], bins=10)
sns.displot(iris['petal_length'], bins=10, kde=True)

In [None]:
# sns.distplot(iris['petal_width'], bins=10)
sns.histplot(iris['petal_width'], bins=10)

### Identify relationships between attributes = Dependencies e.g. correlations = Závislosti napr. korelácie 
**Bivariate analysis = Párová analýza**

In [None]:
sns.scatterplot(data=iris, x='petal_length', y='petal_width')

In [None]:
# Plot data and a linear regression model fit
sns.regplot(x="petal_length", y="petal_width", data=iris)

In [None]:
print("Pearson correlation: %.3f" % iris.petal_length.corr(iris.petal_width))

In [None]:
iris.corr()

In [None]:
sns.pairplot(iris, hue="species")

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(iris.corr(), ax=ax, annot=True, fmt=".3f")

In [None]:
sns.set(rc={'figure.figsize':(36,8)})
sns.violinplot(data=iris, x='sepal_length', y='sepal_width', hue="species")

### Identify problems in data = Data preprocessing
**Remove missing values?**

In [None]:
iris.shape[0]

In [None]:
iris.dropna().shape[0]

In [None]:
iris.shape[0] - iris.dropna().shape[0]

**Empty rows?**

In [None]:
iris.isnull()

In [None]:
iris[iris.isnull().any(axis=1)]