# Python for Statistical Analysis

In [None]:
import pandas as pd
import seaborn as sns
import hvplot.pandas
from matplotlib import pyplot as plt

## The Iris Dataset

In [None]:
# Load the dataset seaborn exposes under the name 'iris'. Every later cell
# reads it through `data` and relies on its 'species' and 'sepal_width' columns.
data = sns.load_dataset('iris')

In [None]:
# Bare expression as the cell's only line, so the notebook displays `data`.
data

In [None]:
# Show which class `data` is an instance of.
type(data)

## Exploratory Analysis

In [None]:
# Preview the first rows of `data` (head() is called with its default row count).
data.head()

In [None]:
# Distinct labels found in the 'species' column; the grouped cells below
# split the data on this column.
data['species'].unique()

### Summary Statistics

In [None]:
# Summary statistics over the whole dataset, not split by species.
data.describe()

### Summary Statistics By Group

In [None]:
# Per-species count for each of the other columns.
data.groupby('species').count()

In [None]:
# Per-species mean of each of the other columns.
data.groupby('species').mean()

In [None]:
# Per-species minimum of each of the other columns.
data.groupby('species').min()

In [None]:
# Per-species maximum of each of the other columns.
data.groupby('species').max()

## Data Visualization

### Scatter Matrices

In [None]:
# Pairwise scatter-plot grid of `data`, drawn by pandas with figsize=(10, 10).
pd.plotting.scatter_matrix(data, figsize=(10, 10))
# Display the matplotlib figure produced above.
plt.show()

In [None]:
# Seaborn pair plot of `data`, with hue taken from the 'species' column.
sns.pairplot(data, hue='species')
# Display the matplotlib figure produced above.
plt.show()

In [None]:
# hvPlot scatter matrix of `data`: colour taken from 'species', diagonal set
# to 'kde'. As the cell's last expression, the returned plot object is what
# the notebook displays (no plt.show() is used here).
hvplot.scatter_matrix(data, c='species', diagonal='kde')

### Grouped Box Plots

In [None]:
# Reshape to long form: 'species' is kept as the identifier column and every
# other column is unpivoted. The plotting cells below read the result's
# 'variable' and 'value' columns.
data_long = data.melt(id_vars='species')
data_long

In [None]:
# One box-plot panel per measurement (the 'variable' column); inside each
# panel the values are split by 'species'. sharey=False is passed so the
# panels are not forced onto a common y-axis.
per_variable = data_long.groupby('variable')
per_variable.boxplot(by='species', sharey=False, figsize=(10, 10))
plt.show()

In [None]:
# Seaborn version of the grouped box plots: one facet per measurement
# ('variable'), wrapped after two columns, with 'value' plotted against
# 'species' in each facet and independent y-axes (sharey=False).
sns.catplot(
    data=data_long,
    kind='box',
    x='species',
    y='value',
    col='variable',
    col_wrap=2,
    sharey=False,
)
plt.show()

In [None]:
# hvPlot box plot of 'value' by 'species', additionally grouped on the
# 'variable' column, with height=400. The returned plot object is the cell's
# displayed output.
data_long.hvplot.box(y='value', by='species', groupby='variable', height=400)

## ANOVA

### ANOVA using SciPy

In [None]:
import scipy.stats as stats

In [None]:
# Group the 'sepal_width' column by 'species'. The SciPy cells below iterate
# over `grouped` and call grouped.get_group(...) on it.
grouped = data.groupby('species')['sepal_width']

In [None]:
def _shapiro_row(species, widths):
    """Return one table row holding stats.shapiro's result for a species."""
    statistic, pvalue = stats.shapiro(widths)
    return {'species': species, 'statistic': statistic, 'pvalue': pvalue}


# One row per species group, shown as a table indexed by species.
rows = [_shapiro_row(species, widths) for species, widths in grouped]
pd.DataFrame(rows).set_index('species')

In [None]:
# Probability plot (stats.probplot with its default distribution) of the
# sepal_width values of each species. Each species gets its own figure,
# titled with the species name and shown before the next one is drawn.
for species_name, widths in grouped:
    stats.probplot(widths, plot=plt)
    plt.title(species_name)
    plt.show()

In [None]:
# Bartlett's test across the per-species sepal_width samples.
# The samples are unpacked directly from `grouped` rather than fetched by
# hard-coded species label, so every group present in the data takes part and
# the call keeps working if the labels change. Iterating the groupby yields
# the groups in sorted key order (setosa, versicolor, virginica here).
stats.bartlett(*(widths for _, widths in grouped))

In [None]:
# One-way ANOVA (stats.f_oneway) across the per-species sepal_width samples.
# The samples are unpacked directly from `grouped` rather than fetched by
# hard-coded species label, so every group present in the data takes part and
# the call keeps working if the labels change. Iterating the groupby yields
# the groups in sorted key order (setosa, versicolor, virginica here).
stats.f_oneway(*(widths for _, widths in grouped))

### ANOVA using Statsmodels

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import statsmodels.stats.multicomp as mc

In [None]:
# Fit an OLS model of sepal_width on species using the formula interface;
# C(species) marks species as a categorical term. `model` is reused by the
# residual Q-Q plot and the ANOVA table cells below.
model = ols("sepal_width ~ C(species)", data=data).fit()
# Last expression of the cell: the fitted model's summary is displayed.
model.summary()

In [None]:
# Q-Q plot of the fitted model's residuals (sm.qqplot with its defaults).
sm.qqplot(model.resid)
# Display the matplotlib figure produced above.
plt.show()

In [None]:
# ANOVA table computed from the fitted OLS model; anova_lm is called with
# its default options.
anova = anova_lm(model)
# Last expression of the cell: display the table.
anova

In [None]:
# Post-hoc pairwise comparison: sepal_width values with species as the
# group labels.
comp = mc.MultiComparison(data['sepal_width'], data['species'])
# Tukey HSD test, run with the method's default settings.
post_hoc_res = comp.tukeyhsd()
# Last expression of the cell: display the results summary.
post_hoc_res.summary()

### ANOVA using Pingouin

In [None]:
import pingouin as pg

In [None]:
# Pingouin Q-Q plot of sepal_width for each species: one figure per species,
# titled with the species name and shown before the next one is drawn.
widths_by_species = data.groupby('species')['sepal_width']
for species_name, widths in widths_by_species:
    pg.qqplot(widths)
    plt.title(species_name)
    plt.show()

In [None]:
# Pingouin normality test on sepal_width, evaluated per 'species' group.
pg.normality(data, dv='sepal_width', group='species')

In [None]:
# Pingouin equal-variance (homoscedasticity) test on sepal_width across the
# 'species' groups, using the function's default method.
pg.homoscedasticity(data, dv='sepal_width', group='species')

In [None]:
# Pingouin ANOVA of sepal_width with 'species' as the between-subject factor;
# detailed=True asks for the detailed output table.
pg.anova(data, dv='sepal_width', between='species', detailed=True)