In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

## Import data from a CSV file

### Import data from a CSV file with a header row

In [5]:
# Load the height/weight measurements from the CSV file into a pandas DataFrame
height_weight_df = pd.read_csv(filepath_or_buffer='../data/Weight_Height.csv')


# Data From CDC
# National Health and Nutrition Examination Survey
# 2017-March 2020 Data Documentation, Codebook, and Frequencies
# Body Measures (P_BMX)
# Data File: P_BMX.xpt
# First Published: May 2021
# Last Revised: NA
# https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_BMX.htm

Want more information on the read_csv function? 

[Check out the Pandas read_csv documentation](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)


## Basic Pandas Operations

In [6]:
# Preview the top of the table: head(n=5) returns the first 5 rows of the data
height_weight_df.head(n=5)

Unnamed: 0,index,Weight (kg),Standing Height (cm),BMI(kg/m**2)
0,0,97.1,160.2,37.8
1,1,98.8,182.3,29.7
2,2,74.3,184.2,21.9
3,3,103.7,185.3,30.2
4,4,83.3,177.1,26.6


In [7]:
# Preview the bottom of the table: tail(n=5) returns the last 5 rows of the data
height_weight_df.tail(n=5)

Unnamed: 0,index,Weight (kg),Standing Height (cm),BMI(kg/m**2)
8383,8383,94.3,178.8,29.5
8384,8384,82.8,147.8,37.9
8385,8385,108.8,168.7,38.2
8386,8386,79.5,176.4,25.5
8387,8387,59.7,167.5,21.3


In [8]:
# Use the info() method to print a summary of the DataFrame:
# the column names, the non-null count and data type of each column, and the memory usage
height_weight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8388 entries, 0 to 8387
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 8388 non-null   int64  
 1   Weight (kg)           8388 non-null   float64
 2   Standing Height (cm)  8388 non-null   float64
 3   BMI(kg/m**2)          8388 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 262.2 KB


In [9]:
# Remove columns we do not need for the analysis.
# drop() returns a new DataFrame without the 'index' column; the original
# height_weight_df is left unchanged.
height_weight_clean_df = height_weight_df.drop(columns='index')

## Basic Statistical Functions for Use in Pandas

### Describe function

In [10]:
# Use the describe() method to display basic statistical details of every numeric column:
# count, mean, std, min, the 25%/50%/75% percentiles, and max
height_weight_clean_df.describe()

Unnamed: 0,Weight (kg),Standing Height (cm),BMI(kg/m**2)
count,8388.0,8388.0,8388.0
mean,83.669218,166.64119,30.034859
std,23.171638,10.079013,7.565376
min,32.6,131.1,14.2
25%,67.6,159.1,24.9
50%,79.9,166.2,28.8
75%,95.8,173.9,33.8
max,254.3,199.6,92.3


In [11]:
# Use the describe() method to display the same basic statistical details for a single column
# (selecting one column by name gives a Series, so the result is a Series too)
height_weight_clean_df['Standing Height (cm)'].describe()

count    8388.000000
mean      166.641190
std        10.079013
min       131.100000
25%       159.100000
50%       166.200000
75%       173.900000
max       199.600000
Name: Standing Height (cm), dtype: float64

In [12]:
# Use the describe() method to display basic statistical details for two or more columns
# (pass a list of column names to select several columns at once)
height_weight_clean_df[['Standing Height (cm)', 'Weight (kg)']].describe()

Unnamed: 0,Standing Height (cm),Weight (kg)
count,8388.0,8388.0
mean,166.64119,83.669218
std,10.079013,23.171638
min,131.1,32.6
25%,159.1,67.6
50%,166.2,79.9
75%,173.9,95.8
max,199.6,254.3


### Other Built-in Statistical Functions

Individual statistical functions that are built into the pandas DataFrame:
- `count()`: Counts the number of non-NA/null observations.
- `sum()`: Sums the values for the requested axis.
- `mean()`: Computes the mean of the values.
- `median()`: Computes the median of the values.
- `min()`: Finds the minimum value.
- `max()`: Finds the maximum value.
- `mode()`: Computes the mode of the values.
- `std()`: Computes the standard deviation of the values.
- `var()`: Computes the variance of the values.
- `sem()`: Computes the standard error of the mean of the values.
- `describe()`: Generates descriptive statistics that summarize the central tendency, dispersion, and shape of a dataset’s distribution.
- `quantile()`: Computes the quantile of the values.
- `cumsum()`: Computes the cumulative sum of the values.
- `cumprod()`: Computes the cumulative product of the values.
- `cummax()`: Computes the cumulative maximum of the values.
- `cummin()`: Computes the cumulative minimum of the values.
- `skew()`: Computes the skewness of the values, a measure of the asymmetry of the probability distribution.
- `kurt()`: Computes the kurtosis of the values, a measure of the "tailedness" of the probability distribution.
- `corr()`: Computes the correlation between columns in a DataFrame.
- `cov()`: Computes the covariance between columns in a DataFrame.


### Mean and Median Examples 

#### Mean

Average of the set:  ( sum of all items )  / ( total number of items )

In [13]:
# Use the mean() method to display the mean of all columns (one value per column)
height_weight_clean_df.mean()

Weight (kg)              83.669218
Standing Height (cm)    166.641190
BMI(kg/m**2)             30.034859
dtype: float64

In [14]:
# Use the mean() method to display the mean of a single column (returns one number)
height_weight_clean_df['Standing Height (cm)'].mean()

166.64118979494518

In [15]:
# Use the mean() method to display the mean of two or more columns
# (pass a list of column names to select several columns at once)
height_weight_clean_df[['Standing Height (cm)','Weight (kg)']].mean()

Standing Height (cm)    166.641190
Weight (kg)              83.669218
dtype: float64

#### Median or 50th percentile

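Median: the middle value of the data set once it is sorted (the 50th percentile). Half of the values fall below it and half fall above it; with an even number of values, it is the average of the two middle values.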

In [16]:
# Use the median() method to display the median of all columns (one value per column);
# these match the 50% row of describe()
height_weight_clean_df.median()

Weight (kg)              79.9
Standing Height (cm)    166.2
BMI(kg/m**2)             28.8
dtype: float64

#### Mode

The mode is the value that occurs most often in a dataset.  Count how many times each value occurs; the value with the highest count is the mode.
It is generally used for categorical or discrete data.

In [17]:
# Use the mode() method to display the mode (most frequent value) of all columns
# The result is a DataFrame; row 0 holds the mode of each column
height_weight_clean_df.mode()

Unnamed: 0,Weight (kg),Standing Height (cm),BMI(kg/m**2)
0,75.8,164.6,29.1


#### Range

The range is the difference between the maximum value and the minimum value in a data set.  This function is not part of the core pandas functions, so we have to write our own. 

In [18]:
# Pandas has no built-in range statistic, so we define our own:
# the spread between the largest and the smallest value.
def range_f(column):
    """Return the range (maximum minus minimum) of ``column``.

    Parameters
    ----------
    column
        Any object exposing ``max()`` and ``min()``, e.g. a pandas Series
        (one DataFrame column).

    Returns
    -------
    The value of ``column.max() - column.min()``.
    """
    highest = column.max()
    lowest = column.min()
    return highest - lowest



In [19]:
# Run our range_f() function on every column with apply();
# axis=0 hands each column to the function in turn
height_weight_clean_df.apply(range_f, axis=0)

Weight (kg)             221.7
Standing Height (cm)     68.5
BMI(kg/m**2)             78.1
dtype: float64

### Creating Your Own Describe Function

If you need additional or different functions to describe your data, you can write new functions and use them with the `.agg` function.  This also allows you to use functions from other modules. 

In [20]:
# Build our own describe(): agg() applies each listed aggregation to every column.
# Built-in aggregations are given by name; our own range_f function is passed directly.
height_weight_clean_df.agg(
    [
        'count',
        'mean',
        'median',
        'min',
        'max',
        range_f,
    ]
)

Unnamed: 0,Weight (kg),Standing Height (cm),BMI(kg/m**2)
count,8388.0,8388.0,8388.0
mean,83.669218,166.64119,30.034859
median,79.9,166.2,28.8
min,32.6,131.1,14.2
max,254.3,199.6,92.3
range_f,221.7,68.5,78.1


In [21]:
# Use the agg() method with mode and see what happens!!!  Explain!
# The call is expected to fail, so it is wrapped in try/except: the error
# message is still displayed, but it no longer stops a "Restart & Run All".
mode_agg_result = None
try:
    mode_agg_result = height_weight_clean_df.agg(['count','mean','median','min', 'max', range_f, 'mode'])
except ValueError as err:
    # Show the error type and message instead of a full traceback
    print(f"{type(err).__name__}: {err}")

# Displays the table if the call succeeded; shows nothing when it failed
mode_agg_result

ValueError: cannot combine transform and aggregation operations

In [None]:
## Practical Question:  Is it better to know a company's median or mean salary when estimating your salary for the position? 