# Descriptive Statistics With Python

## Load packages

In [None]:
pip install pandas numpy seaborn

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

## Load data

In [2]:
# Load seaborn's "mpg" example dataset into a DataFrame
df = sns.load_dataset("mpg")

In [3]:
# Display the loaded frame; for a long frame the notebook shows only the first and last rows
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


## Categorical variables

### One-dimensional frequency table

In [4]:
# Absolute frequency: number of rows per distinct car name
df["name"].value_counts()

ford pinto             6
toyota corolla         5
amc matador            5
ford maverick          5
chevrolet chevette     4
                      ..
chevrolet monza 2+2    1
ford mustang ii        1
pontiac astro          1
amc pacer              1
chevy s-10             1
Name: name, Length: 305, dtype: int64

In [5]:
# Relative frequency: share of all rows per distinct car name
# (a proportion between 0 and 1, not multiplied by 100)
df["name"].value_counts(normalize=True)

ford pinto             0.015075
toyota corolla         0.012563
amc matador            0.012563
ford maverick          0.012563
chevrolet chevette     0.010050
                         ...   
chevrolet monza 2+2    0.002513
ford mustang ii        0.002513
pontiac astro          0.002513
amc pacer              0.002513
chevy s-10             0.002513
Name: name, Length: 305, dtype: float64

### Multi-dimensional frequency table

In [6]:
# Count the distinct car names within each origin
df.pivot_table(index="origin", values=["name"], aggfunc="nunique")

Unnamed: 0_level_0,name
origin,Unnamed: 1_level_1
europe,58
japan,60
usa,187


In [7]:
# Count the distinct car names for every (model_year, origin) combination
df.pivot_table(
    index=["model_year", "origin"],
    values=["name"],
    aggfunc="nunique",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,name
model_year,origin,Unnamed: 2_level_1
70,europe,5
70,japan,2
70,usa,22
71,europe,4
71,japan,4
71,usa,20
72,europe,5
72,japan,5
72,usa,18
73,europe,7


In [8]:
# Cross-check against the raw data: list the European cars of model year 82
df[(df["origin"] == "europe") & (df["model_year"] == 82)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
375,36.0,4,105.0,74.0,1980,15.3,82,europe,volkswagen rabbit l
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup


## Numerical variables

### df.describe
Most common descriptive statistics for all numerical variables

In [9]:
# Summary table (count, mean, std, min, quartiles, max) for each numeric column.
# Note the count row: horsepower has 392 values while the other columns have 398.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


### Mean

In [10]:
# Arithmetic mean via the pandas Series method
df["mpg"].mean()

23.514572864321607

In [11]:
# Same mean, computed by passing the Series to the NumPy function
np.mean(df["mpg"])

23.514572864321607

### Median

In [12]:
# Median (50 % quantile) via pandas
df["mpg"].median()

23.0

In [13]:
# Median via NumPy
np.median(df["mpg"])

23.0

### Mode

In [14]:
# Most frequent value; mode() returns a Series (here with a single entry)
df["mpg"].mode()

0    13.0
dtype: float64

In [15]:
# mode() is not limited to numbers: it also works on categorical (string) columns
df["origin"].mode()

0    usa
dtype: object

### Range
(largest - smallest value)

In [16]:
# Range: distance between the largest and the smallest observed value
df["mpg"].max() - df["mpg"].min()

37.6

### Variance

In [17]:
# pandas default (ddof=1): sample variance
df["mpg"].var()

61.089610774274405

In [18]:
# NumPy default (ddof=0): population variance, hence slightly smaller than the pandas result
np.var(df["mpg"])

60.93611928991693

In [19]:
# pandas' var() defaults to ddof=1 (sample variance, divides by n - 1),
# while NumPy's var() defaults to ddof=0 (population variance, divides by n).
# Passing ddof=0 makes pandas return the population variance as well.
df["mpg"].var(ddof=0)

60.93611928991693

### Standard deviation
(square root of the variance; expressed in the original unit of the data)

In [20]:
# Standard deviation computed by hand: square root of the (population) variance
np.sqrt(np.var(df["mpg"]))

7.806159061274433

In [21]:
# NumPy's std() uses ddof=0 by default (population standard deviation)
np.std(df["mpg"])

7.806159061274433

In [22]:
# pandas' std() uses ddof=1 by default (sample standard deviation)
df["mpg"].std()

7.815984312565782

In [23]:
# ddof=0 provides a maximum likelihood estimate of the variance for
# normally distributed variables
df["mpg"].std(ddof=0)

7.806159061274433

### Quantiles (Percentiles)

In [24]:
# First quartile (25 % quantile)
df["mpg"].quantile(q=0.25)

17.5

In [25]:
# Third quartile (75 % quantile)
df["mpg"].quantile(q=0.75)

29.0

In [26]:
# Several quantiles in one call: pass a list, get a Series indexed by quantile
df["mpg"].quantile(q=[0.25, 0.75])

0.25    17.5
0.75    29.0
Name: mpg, dtype: float64

In [27]:
# Unpack the two quantile values into the first (q1) and third (q3) quartile
q1, q3 = df["mpg"].quantile(q=[0.25, 0.75])

In [28]:
# Interquartile range: third quartile minus first quartile
iqr = q3 - q1
iqr

11.5

In [29]:
# Same interquartile range with NumPy; np.percentile takes percentages (0-100)
q1, q3 = np.percentile(df["mpg"], [25, 75])
iqr = q3 - q1
iqr

11.5

## Automated exploratory data analysis

### Load package

Use [sweetviz](https://pypi.org/project/sweetviz/) to calculate descriptive statistics for all your data.

See the [documentation here](https://colab.research.google.com/drive/1-md6YEwcVGWVnQWTBirQSYQYgdNoeSWg?usp=sharing).

In [30]:
pip install sweetviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sweetviz
  Downloading sweetviz-2.1.4-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.1.4


In [31]:
import sweetviz as sv

### Analyze

In [32]:
# Build a sweetviz report covering the whole DataFrame
sv_report = sv.analyze(source=df)

                                             |          | [  0%]   00:00 -> (? left)

In [33]:
# Render the report inline in the notebook
# (w/h/scale are presumably width, height and zoom of the embedded view - see the sweetviz docs)
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)

#### Analyze with a target variable

In [34]:
# Same analysis, with "mpg" passed as the target feature
sv_report = sv.analyze(df, target_feat="mpg")

                                             |          | [  0%]   00:00 -> (? left)

In [35]:
# Render the target-aware report inline in the notebook
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)

### Compare two datasets

In [36]:
# Split the data at model year 76: df1 holds the earlier cars, df2 the later ones
df1 = df[df["model_year"] < 76]
df2 = df[df["model_year"] >= 76]

In [37]:
# Compare the two model-year subsets; each [frame, label] pair names one side of the report
sv_report = sv.compare(
    source=[df1, "< 76"],
    compare=[df2, ">= 76"],
)

                                             |          | [  0%]   00:00 -> (? left)

In [38]:
# Render the comparison report inline in the notebook
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)

#### Compare two datasets with target

In [39]:
# Compare the two model-year subsets again, this time with "mpg" as the target feature
sv_report = sv.compare(
    source=[df1, "< 76"],
    compare=[df2, ">= 76"],
    target_feat="mpg",
)

                                             |          | [  0%]   00:00 -> (? left)

In [40]:
# Render the target-aware comparison report inline in the notebook
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)

### Compare intra
(two subsets of one dataset, split by a boolean condition)

In [41]:
# Split df by the boolean condition origin == 'usa' and compare the two
# resulting groups, labelled "US" and "Non-US"
is_usa = df["origin"] == "usa"
sv_report = sv.compare_intra(df, is_usa, ["US", "Non-US"])

                                             |          | [  0%]   00:00 -> (? left)

In [42]:
# Render the US vs. Non-US report inline in the notebook
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)

#### Compare subsets inside dataset with target

In [43]:
# US vs. Non-US comparison with "mpg" passed as the target feature
sv_report = sv.compare_intra(
    df,
    df["origin"] == "usa",
    ["US", "Non-US"],
    target_feat="mpg",
)

                                             |          | [  0%]   00:00 -> (? left)

In [44]:
# Render the target-aware US vs. Non-US report inline in the notebook
sv_report.show_notebook(
    w=900,
    h=450,
    scale=0.8,
)