<a href="https://colab.research.google.com/github/vectrlab/apex-stats-snippets/blob/main/snippet_library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# APEX STATS Code Snippets

# Data Setup

In [None]:
#@title Setup Blank Data

# Import pandas, which provides the DataFrame type
import pandas as pd

# Create an empty data frame named `df` (no rows, no columns);
# the population snippet later adds a column to it with df.assign(...)
df = pd.DataFrame()

In [None]:
#@title Setup Example Data: Health Nutrition

# Import library
import pandas as pd

# Read the Health Nutrition example CSV from its GitHub URL into `data`;
# this replaces whatever `data` held before
data = pd.read_csv('https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/healthnutritionandpopulation/example.csv')

# Preview data (a cell displays the value of its last expression)
data

In [None]:
#@title Setup Example Data: Red Wine Quality

# Import library
import pandas as pd

# Read the Red Wine Quality example CSV from its GitHub URL into `data`;
# this replaces whatever `data` held before
data = pd.read_csv('https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/red-wine-quality/example.csv')

# Preview data (a cell displays the value of its last expression)
data

In [None]:
#@title Setup Example Data: Connecticut Housing

# Import library
import pandas as pd

# Read the Connecticut Housing example CSV from its GitHub URL into `data`;
# this replaces whatever `data` held before
data = pd.read_csv('https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/connecticut-housing/example.csv')

# Preview data (a cell displays the value of its last expression)
data

In [None]:
#@title Setup Example Data: FIFA19

# Import library
import pandas as pd

# Read the FIFA19 example CSV from its GitHub URL into `data`;
# this replaces whatever `data` held before
data = pd.read_csv('https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/fifa19/example.csv')

# Preview data (a cell displays the value of its last expression)
data

In [None]:
#@title Setup Example Data with Error Handling

# Import library
import pandas as pd

# Read data file: FIFA19.
# The read itself happens inside the try block: if the download or the
# parsing fails, read_csv raises before `data` is ever assigned, so the
# failure can only be caught around the read_csv call.
try:
    data = pd.read_csv('https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/fifa19/example.csv')
    print('The data were loaded.')
# OSError covers network and file errors (including HTTP/URL errors);
# the pandas errors cover an empty or malformed CSV file.
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as error:
    print('There was a problem loading the data.')
    # Show the reason so that the problem can be diagnosed
    print(error)

# Population vs Sample

In [None]:
#@title Generate a Population Distribution

# Import libraries
import random
import pandas as pd

# Set the seed so that everyone has the same values, or
# comment out the line below to generate a unique population each time.
random.seed(21715)

# Set population size
n = 1000

# Specify maximum value
max_value = 100.0

# Generate a uniform distribution with n values,
# from 0 to the maximum value, chosen at random:
# random.random() is scaled by max_value once per value.
population = [random.random() * max_value for x in range(n)]

# Add the population distribution to the data frame as column 'x'.
# This requires `df` to exist already (see the blank data setup snippet).
# NOTE(review): the snippets that follow read data['x'], not df['x'] -
# confirm which data frame this population is meant to go into.
df = df.assign(x=population)

In [None]:
#@title Population or Sample Size

# Population or sample size: the number of entries in column 'x'
# (len counts every row, including rows with missing values)
len(data['x'])

In [None]:
#@title Population or Sample Mean

# Import library
import numpy as np

# Arithmetic mean of column 'x'; the same call is used whether
# 'x' holds a whole population or a sample
np.mean(data['x'])

In [None]:
#@title Population Standard Deviation

# Import library
import numpy as np

# Population standard deviation:
# ddof=0 means the variance divides by N (see the sample version,
# which uses ddof=1)
np.std(data['x'], ddof=0)

In [None]:
#@title Sample Standard Deviation

# Import library
import numpy as np

# Calculate the sample standard deviation;
# ddof is delta degrees of freedom and
# N - ddof is used as the divisor in the variance calculation,
# so ddof=1 divides by N - 1.
np.std(data['x'], ddof=1)

In [None]:
#@title Generate a Random Sample

# Import libraries
import random
import numpy as np

# Sample size
n = 5

# Generate a single sample of n values drawn at random from column 'x'.
# NOTE(review): np.random.choice draws with replacement unless
# replace=False is passed, and no seed is set in this snippet, so the
# sample changes on every run - confirm both are intended.
one_sample = np.random.choice(data['x'], n)

# Display sample data
one_sample

In [None]:
#@title Generate a Sampling Distribution

# Import libraries
import random
import numpy as np

# Specify number of samples in the sampling distribution
num_samples = 2000

# Sample size
n = 5

# Define a function
def drange():
    """Return n distinct row positions of data['x'], chosen at random.

    The positions are picked independently instead of as n consecutive
    rows, so the samples are not biased when the data are stored in a
    nonrandom order, and every row (including the last ones) can be
    selected. Raises ValueError if n is larger than the number of rows.
    """
    return random.sample(range(len(data['x'])), n)

# Assemble the sampling distribution by finding means of repeated samples;
# .iloc selects rows by position, so this works with any index labels
sampling_dist = [np.mean(data['x'].iloc[drange()]) for _ in range(num_samples)]

# Display the sampling distribution
sampling_dist

In [None]:
#@title Calculate Standard Error

# Import library
import numpy as np

# Sample size
# NOTE(review): n is hard-coded, while the standard deviation below is
# taken over all of data['x'] - confirm n matches the intended sample.
n = 5

# Standard error of the mean: sample standard deviation (ddof=1)
# divided by the square root of the sample size
se = np.std(data['x'], ddof=1) / np.sqrt(n)

# Display result
se

# Confidence Interval

In [None]:
#@title Calculate 95% Confidence Interval, Extended Version

# Import libraries
import numpy as np
from scipy.stats import norm

# Sample size
n = 5

# Standard error: sample standard deviation over the root of the sample size
se = np.std(data['x'], ddof=1) / np.sqrt(n)

# Cumulative area below the upper bound of the interval:
# the central 95% plus the 2.5% in the lower tail (= 0.975)
tail = 1 - (1 - .95) / 2

# Z-score corresponding to 97.5% area below it
z = norm.ppf(tail)

# Find and save lower and upper bounds
lower_bound = np.mean(data['x']) - z * se
upper_bound = np.mean(data['x']) + z * se

# Display the confidence interval
(lower_bound, upper_bound)

In [None]:
#@title Calculate 95% Confidence Interval, Simple Version

# Import libraries
from scipy import stats
import numpy as np

# Calculate mean
mu = np.mean(data['x'])

# Calculate standard deviation
sigma = np.std(data['x'], ddof=1)

# Sample size (same value as in the extended version;
# set it to the size of your own sample)
n = 5

# Standard error of the mean
se = sigma / np.sqrt(n)

# Specify confidence level
conf_level = 0.95

# Find confidence interval of the mean.
# The scale must be the standard error, not the standard deviation:
# with scale=sigma the interval would describe where 95% of individual
# scores fall rather than the uncertainty about the mean, and it would
# not match the extended version.
stats.norm.interval(conf_level, loc=mu, scale=se)

# Z-score

In [None]:
#@title Generate Z-scores

# Import library
from scipy import stats

# Create a new data column 'z' containing the z-score of every value
# in column 'x' (this adds the column to `data` in place).
# NOTE(review): stats.zscore is called with its default ddof - per the
# scipy docs that is the population standard deviation; confirm.
data['z'] = stats.zscore(data['x'])

# Display data column
data['z']

In [None]:
#@title Apply the Z-score Formula to User Inputs

# Formula: z = (score - mean) / pop_sd

# Ask the user to enter a raw score
raw = float(input('Score = '))

# Ask the user to enter the mean
mean = float(input('Mean = '))

# Ask the user to enter the standard deviation
# (entering 0 causes a division-by-zero error in the formula below)
sd = float(input('Input the standard deviation: '))

# Find z-score with formula, round to 2 decimal places
z = round((raw - mean) / sd, 2)

# Print result
print(f'z = {z}')

In [None]:
#@title Generate Raw Scores

# Import libraries
import numpy as np
from scipy import stats

# Find population standard deviation (np.std is called with its default ddof)
sd = np.std(data['x'])

# Find mean from data column x
mean = data['x'].mean()

# Convert each z-score in column 'z' back to a raw score with the formula
# raw score = z * sd + mean, and collect the results in a list.
# A list comprehension keeps its loop variable local, so it does not
# overwrite a variable named `z` that other snippets define.
raw_scores = [z_score * sd + mean for z_score in data['z']]

# Display result
raw_scores

In [None]:
#@title Apply Raw Score Formula to User Inputs

# Formula: raw score = z * sd + mean

# Input z-score
z = float(input('z = '))

# Input mean
mean = float(input('Mean: '))

# Input standard deviation
sd = float(input('Input the standard deviation: '))

# Find raw score using formula, round to 2 decimal places
raw = round(z * sd + mean, 2)

# Print result
print(f'Score = {raw}')

# CDF and Inverse CDF

In [None]:
#@title Cumulative Distribution Function

# Import norm from scipy.stats
from scipy.stats import norm

# CDF of a standard normal distribution (mean 0, standard deviation 1)
norm.cdf(1.45, loc=0, scale=1)

# When we specify 1.45, we are asking for the area
# under the curve to the left of the point 1.45;
# loc is where you specify your mean, and
# scale is where you specify your standard deviation.

In [None]:
#@title Find Area Under the Curve

# Import norm from scipy.stats
from scipy.stats import norm

# Input mean
mean = float(input('Specify your mean: '))

# Input standard deviation
stdev = float(input('Specify your standard deviation: '))

# Input lower bound
lower_bound = float(input('Specify your lower bound: '))

# Input upper bound
upper_bound = float(input('Specify your upper bound: '))

# Calculate the area under the curve between the two bounds
# by subtracting the smaller area (to the left of the lower bound)
# from the larger area (to the left of the upper bound).
larger_area = norm.cdf(upper_bound, loc=mean, scale=stdev)
smaller_area = norm.cdf(lower_bound, loc=mean, scale=stdev)
area_under_curve = larger_area - smaller_area

# Print result; round answer to two decimal places
print(
    f'Area under the curve from {lower_bound} to {upper_bound} '
    f'is {round(area_under_curve, 2)}'
)

In [None]:
#@title Inverse Cumulative Distribution Function

# Import norm from scipy.stats
from scipy.stats import norm

# Input mean
mean = float(input('Specify your mean: '))

# Input standard deviation
stdev = float(input('Specify your standard deviation: '))

# Input percentile
percentile = float(input('Specify the percentile (from 0-1 range): '))

# Find inverse norm result with norm.ppf method
inv_norm_ans = norm.ppf(percentile, loc=mean, scale=stdev)

# Print result; round answer to two decimal places.
# The percentile label uses the ':g' format instead of int(), because
# int() truncates: 0.29 * 100 is 28.999999999999996 in floating point
# and would be printed as 28, and 0.975 would be printed as 97
# instead of 97.5.
print(
    f'With mean {mean} and standard deviation {stdev}, '
    f'the {percentile * 100:g}th percentile is {round(inv_norm_ans, 2)}'
)

# norm.ppf is the inverse of norm.cdf: given the area under the curve
# to the left of a point, it returns that point.
# For example, when we specify 0.95, we ask for the value that has
# an area of 0.95 to its left, with loc as our mean and scale as our
# standard deviation.
# In R, it is qnorm(0.95, 0, 1) or qnorm(0.05,0,1,lower.tail=FALSE)
# with mean 0 and standard deviation 1.

# Understand Data

In [None]:
#@title Sampling Cases from Data Frame

# Random sampling: draw 10 rows from `df`;
# random_state=1 fixes the seed so the same rows are drawn on every run
# NOTE(review): this snippet reads `df`, while the neighbouring
# snippets read `data` - confirm which data frame is intended.
new_df = df.sample(n=10, random_state=1)

# Display sample
new_df

In [None]:
#@title Find Missing Values in Data Frame Columns

# Display status of each cell in column 'x'
# (False: filled with data; True: missing data)
data['x'].isna()

In [None]:
#@title Find Missing Values in Data Frame

# Display status of each cell in every column
# (False: filled with data; True: missing data)
data.isna()

In [None]:
#@title Find Records Where NaN Exists

# Entries of column 'x' that are missing: the boolean mask from isna()
# picks the rows, and 'x' picks the column
data.loc[data['x'].isna(), 'x']

In [None]:
#@title Discover Data Types

# Import libraries
import pandas as pd
import numpy as np

# Create dataframe with 'x' variable; column has integer data:
# 10 random integers that are each 0 or 1 (the upper limit 2 is excluded).
# This replaces any existing `df`, and no seed is set,
# so the values differ from run to run.
df = pd.DataFrame(np.random.randint(0, 2, size=10), columns=['x'])

# Find datatype of data in df['x'] (data are 0 and 1)
df.dtypes

In [None]:
#@title Convert into Category Data Type

# Convert integers into categories (factors in R);
# the converted column replaces df['x']
df['x'] = df['x'].astype('category')

# Display data types
df.dtypes

In [None]:
#@title Convert Datatype and Insert a New Column into Data Frame

# Convert integers into categories (factors in R) and store the result
# as a new column 'x6', leaving column 'x' unchanged
data['x6'] = data['x'].astype('category')

# Display data types
data.dtypes

# Hypothesis Testing

In [None]:
#@title One Sample t-test

# Import library
from scipy import stats

# One sample t-test (arg1: sample observation, arg2: expected H0 value);
# 39 is the example mean under the null hypothesis - replace as needed
result = stats.ttest_1samp(data['x'], 39)

# The result is saved but not displayed.
# To extract details from result, use the attributes below:
# result.statistic
# result.pvalue

In [None]:
#@title Independent Samples t-test

# Import library
from scipy import stats

# Independent samples t-test (arg1: sample 1, arg2: sample 2);
# here the two samples are the columns 'x' and 'x1'
result = stats.ttest_ind(data['x'], data['x1'])

# The result is saved but not displayed.
# To extract details from result, use the attributes below:
# result.statistic
# result.pvalue

In [None]:
#@title Paired Samples t-test

# Import library
from scipy import stats

# Paired samples t-test (arg1: sample 1, arg2: sample 2);
# here the paired measurements are the columns 'x' and 'x1'
result = stats.ttest_rel(data['x'], data['x1'])

# The result is saved but not displayed.
# To extract details from result, use the attributes below:
# result.statistic
# result.pvalue

In [None]:
#@title Correlation Analysis

# Import library
from scipy import stats

# Pearson's r between columns 'x' and 'x1' (returns a tuple)
corr_test = stats.pearsonr(data['x'], data['x1'])

# r: Pearson's correlation coefficient (first element)
print(corr_test[0])

# p-value: two-tailed p-value (second element)
print(corr_test[1])

In [None]:
#@title Simple Regression

# Import library
from scipy import stats

# Simple regression of 'x1' on 'x'
# (arg1: predictor values, arg2: outcome values)
result = stats.linregress(data['x'], data['x1'])

# The result is saved but not displayed.
# To extract details from result, use the attributes below:
# result.slope
# result.intercept
# result.rvalue
# result.pvalue
# result.stderr

In [None]:
#@title One-Way Chi-Square Test

# Import library
from scipy.stats import chisquare

# One-way Chi-square
# NOTE(review): chisquare treats its argument as observed frequencies
# per category (per the scipy docs); this assumes data['x'] already
# holds counts rather than raw observations - confirm.
result = chisquare(data['x'])

# Print statistic (first element) and p-value (second element)
print(result[0])
print(result[1])

In [None]:
#@title One-Way ANOVA

# Import library
from scipy import stats

# One-way ANOVA; each argument is the data of one group,
# here the columns 'x' and 'x1'
result = stats.f_oneway(data['x'], data['x1'])

# Print statistic and p-value
print(result.statistic)
print(result.pvalue)

# Data Visualization

In [None]:
#@title Kernel Density Estimate Plot

# Import library
import seaborn as sns

# KDE plot of column 'x', drawn in green with the area under the
# curve filled; bw_adjust=3 adjusts the smoothing bandwidth
sns.kdeplot(data = data['x'], fill=True, bw_adjust=3, color='green')

In [None]:
#@title Dot Plot

# Import library
import seaborn as sns

# Dot plot - test with Red Wine Quality data;
# column 'y' goes on the horizontal axis and column 'x10' on the vertical
sns.stripplot(x=data['y'], y=data['x10'], color='purple')

In [None]:
#@title Box Plot (No Parameter Adjustment)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Figure size (width, height)
plt.figure(figsize=(15, 8))

# Box plot - test with Connecticut Housing data;
# one box of 'x' values for each value of column 'y'
sns.boxplot(x='y', y='x', data=data)

In [None]:
#@title Box Plot (Adjust Color and Plot One Candlestick)

# Import libraries (so this snippet also runs on its own,
# like the other plotting snippets)
import seaborn as sns
import matplotlib.pyplot as plt

# Filter data for one county: keep rows where column 'y' is 'Fairfield'
fairfield = data[data['y'] == 'Fairfield']

# Figure size (narrow, because only one box is drawn)
plt.figure(figsize=(3, 6))

# Box plot - test with Connecticut Housing data
sns.boxplot(x='y', y='x', data=fairfield, color='green')

In [None]:
#@title Regression Line

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of 'x1' against 'x' with a linear regression line
sns.lmplot(x='x', y='x1', data=data)

# Save plot as a PNG file in the current working directory
plt.savefig('linear_regression.png')

In [None]:
#@title Histogram (No Parameter Adjustment)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust the figure size
plt.figure(figsize=(15, 8))

# Histogram - test with Drinks data; the whole data frame is passed in
# NOTE(review): the setup snippets shown in this notebook do not load a
# Drinks data set - confirm which data this is meant for.
sns.histplot(data=data)

In [None]:
#@title Histogram (Binwidth and KDE)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust figure size
plt.figure(figsize=(15, 8))

# Histogram - test with Drinks data;
# each bin is 10 units wide, and kde=True overlays a density curve
sns.histplot(data=data['x'], binwidth=10, kde=True)

In [None]:
#@title Histogram (Color)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust figure size
plt.figure(figsize=(15, 8))

# Histogram - test with Health Nutrition and Population Data
# We have x as a continuous variable;
# each bin is 4 units wide and the bars are drawn in brown
sns.histplot(data=data['x'], binwidth=4, color='brown')

In [None]:
#@title Histogram with automatic binning and color

# Import library
import seaborn as sns

# Histogram of column 'x'; no bin or color arguments are passed,
# so seaborn's defaults are used for both
sns.histplot(data['x'])

In [None]:
#@title Histogram with automatic binning and custom color

# For a list of color names:
# https://matplotlib.org/stable/gallery/color/named_colors.html

# Import library
import seaborn as sns

# Get user input on histogram color
custom_color = input('Type the name of a color : ')

# Histogram; no binwidth is passed, so the bins are chosen automatically
# as the title says (a fixed binwidth of 1 would create a very large
# number of bins for data with a wide range)
sns.histplot(data['x'], color=custom_color)

In [None]:
#@title Histogram with Custom Binning and Custom Color

# For a list of color names:
# https://matplotlib.org/stable/gallery/color/named_colors.html

# Import library
import seaborn as sns

# Get user input for color
custom_color = input('Type the name of a color : ')

# Get user input for bins; the width must be typed as a whole number,
# because the text is converted with int()
custom_binwidth = int(input('Enter the width of the bins : '))

# Histogram with the color and bin width chosen by the user
sns.histplot(data['x'], color=custom_color, binwidth=custom_binwidth)

In [None]:
#@title Count Plot (Horizontal)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Bar plot - test with FIFA19 data;
# passing the column as y makes the bars horizontal
plt.figure(figsize=(8, 15))
sns.set_theme(style='darkgrid')
ax = sns.countplot(y='x5', data=data, palette='Set1')

# The style parameter of the set_theme method can take
# darkgrid, whitegrid, dark, white, and ticks as arguments.
# The palette can be Set1, Set2, or Set3.
# Adding a hue parameter will take the data column name
# from the data frame to display legends.

In [None]:
#@title Bar Plot aka Count Plot (Vertical)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Bar plot; passing the column as x makes the bars vertical
plt.figure(figsize=(15, 8))
sns.set_theme(style='dark')
ax = sns.countplot(x='x', data=data, palette='Set2')

# The style parameter of the set_theme method can take
# darkgrid, whitegrid, dark, white, and ticks as arguments.
# The palette can be Set1, Set2, or Set3.
# Adding a hue parameter will take the data column name
# from the data frame to display legends.

In [None]:
#@title Count Plot (Customization)

# Import library
import seaborn as sns

# Bar plot - test with Red Wine Quality data;
# each bar gets an outline 2 units wide, with the outline colors
# taken from the first 6 colors of seaborn's 'dark' palette
sns.set_theme(style='darkgrid')
ax = sns.countplot(y='y', data=data, linewidth=2,
                  edgecolor=sns.color_palette('dark', 6))

In [None]:
#@title Bar Plot (No Parameter Adjustment)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust figure size
plt.figure(figsize=(15, 8))

# Bar plot - test with Connecticut Housing data;
# one bar summarizing 'x1' for each value of column 'y'
sns.barplot(x='y', y='x1', data=data)

In [None]:
#@title Bar Plot (Adjust Color)

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust figure size
plt.figure(figsize=(15, 8))

# Bar plot (adjust color) - test with Connecticut Housing data;
# every bar is drawn in orange
sns.barplot(x='y', y='x1', data=data, color='orange')

In [None]:
#@title Bar plot with One Bar

# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Filter data for one county: keep rows where column 'y' is 'Fairfield'
fairfield = data[data['y'] == 'Fairfield']

# Adjust figure size (narrow, because only one bar is drawn)
plt.figure(figsize=(3, 6))

# Bar plot - test with Connecticut Housing data;
# brown bar with a red error bar of width 5
# NOTE(review): errcolor/errwidth may be deprecated in newer seaborn
# releases in favour of err_kws - confirm against the installed version.
sns.barplot(x='y', y='x1', data=fairfield,
            color='brown', errcolor='red', errwidth=5)

# Data Generation: Discrete and Continuous

In [None]:
#@title Generation of Random Data: Discrete RV (Array Edition)

# Import library
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=100)

# Specify parameters for generator to produce integers:
# Lower bound (inclusive)
lower_bound = 1

# Upper bound (exclusive)
upper_bound = 7

# Adjust array size (number of random integers to generate)
array_size = 1

# Generate a random integer (example: rolling a die gives 1 to 6)
rand_num = rng.integers(low=lower_bound, high=upper_bound, size=array_size)

# Display result
rand_num

In [None]:
#@title Generation of Random Data: Discrete RV (Matrix Edition)

# Import library
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=100)

# Specify parameters for generator to produce integers:
# Lower bound (inclusive)
lower_bound = 1

# Upper bound (inclusive in this snippet, because endpoint=True
# is passed to rng.integers below)
upper_bound = 7

# Two dimensional array of values (2 rows, 4 columns)
matrix_size = (2, 4)

# Include upper_bound in list of random numbers
list_num = rng.integers(low=lower_bound, high=upper_bound,
                        size=matrix_size, endpoint=True)

# Display result
list_num

In [None]:
#@title Generation of Random Data: Discrete RV (Binomial Distribution)
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=100)

# Number of trials in each experiment
num_trials = 10

# Probability of success on each trial
p_success = 0.5

# Number of experiments (how many values to generate)
num_exp = 1000

# Generate sample from the binomial distribution:
# one number of successes per experiment
sample = rng.binomial(num_trials, p_success, size=num_exp)

# Display sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Array Edition)

# Import library
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=42)

# Specify parameters
mean = 3
standard_deviation = 2
sample_size = 20

# Generate sample from a normal distribution with the parameters above
# (when they are left out, the defaults are loc=0, scale=1, and size=None)
sample = rng.normal(loc=mean, scale=standard_deviation, size=sample_size)

# Display sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Matrix Edition)

# Import library
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=42)

# Specify parameters here, so that the snippet runs on its own and does
# not pick up a `mean` left behind by another snippet
mean = 3
standard_deviation = 2

# Specify n-dimensional array of samples (2 rows, 4 columns)
matrix_size = (2,4)

# Generate sample
sample = rng.normal(loc=mean, scale=standard_deviation, size=matrix_size)

# Display sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Multiple Means)

# Import library
import numpy as np

# Create a generator object; the fixed seed makes the result repeatable
rng = np.random.default_rng(seed=42)

# loc parameter can take an array of values: e.g. [mean1, mean2]
loc_list = [0,5]

# scale parameter can take an array of values: e.g. [std_dev1, std_dev2]
deviation_list = [1,2]

# The size parameter has to match the array size in the previous
# parameters; in this example the column size (2) matches the
# array size (2)
row = 5
column = 2

# Generate sample
sample = rng.normal(loc=loc_list, scale=deviation_list, size=(row, column))

# Display sample
sample

# Arithmetic with Data Frames

In [None]:
#@title Adding Two Columns

# Element-wise sum of columns 'x' and 'x1'; show the first 5 results
(data['x'] + data['x1']).head(5)

In [None]:
#@title Subtracting One Column from Another

# Element-wise difference 'x' minus 'x1'; show the first 5 results
(data['x'] - data['x1']).head(5)

In [None]:
#@title Multiplying Two Columns

# Element-wise product of columns 'x' and 'x1'; show the first 5 results
(data['x'] * data['x1']).head(5)

In [None]:
#@title Dividing One Column from Another

# Element-wise quotient 'x' divided by 'x1'; show the first 5 results
(data['x'] / data['x1']).head(5)

In [None]:
#@title Summing All Values in Columns with Numeric Values

# Sum all numeric column values: one total per column;
# numeric_only=True leaves non-numeric columns out
data.sum(numeric_only=True)

In [None]:
#@title Summing All Values for a Specified Column

# Sum all values of one specific column, 'x'
data['x'].sum()

In [None]:
#@title Exponentiate a Specified Column

# Raise every value of column 'x' to the power of 2
data['x'] ** 2

In [None]:
#@title Exponentiate Multiple Numeric Columns in Data Set

# Exponentiate multiple numeric columns.
# select_dtypes keeps only the numeric columns, which excludes column y
# (string data type) while keeping every row; a positional slice such as
# iloc[1:, 1:] would also drop the first row of data.
data.select_dtypes(include='number').pow(2)

In [None]:
#@title Calculate Roots

# Perform square root (power of 1/2) on every numeric column.
# select_dtypes keeps only the numeric columns and keeps every row;
# a positional slice such as iloc[1:, 1:] would also drop the first row.
data.select_dtypes(include='number').pow(1/2)

In [None]:
#@title Create New Column in Data Frame

# Store the squared values of column 'x' as column 'x6'
data['x6'] = data['x'] ** 2

In [None]:
#@title Rounding for One Data Frame Column

# Round one column, 'x6', to 1 decimal place;
# the rounded values are displayed, not saved
data['x6'].round(1)

In [None]:
#@title Rounding for the Whole Dataframe

# Round the whole dataframe to 1 decimal place;
# the rounded values are displayed, not saved
data.round(1)

In [None]:
#@title Find Missing Values

# Flag each row of column 'x': True where the value is missing
bool_series = data['x'].isnull()

# Show the rows whose 'x' is missing (empty if nothing is missing)
data.loc[bool_series]

In [None]:
#@title Sorting the Data Frame

# Sort data frame by column 'y' in ascending order
# (this is not an in-place sort: `data` itself keeps its row order)
data.sort_values(by=['y'], ascending=True)

# We can assign the result dataframe to the
# same variable to reflect the change using:
# data = data.sort_values(by=['y'], ascending=True)

In [None]:
#@title Selecting Cases in Data Frame

# Select cases: keep only the rows where column 'x' is greater than 70
data[data['x'] > 70]

In [None]:
#@title Exclude a Participant Number from the Cases

# Keep rows where 'x' is greater than 70, excluding rows where 'x'
# equals 75; each condition needs its own parentheses around it
data[(data['x'] > 70) & (data['x'] != 75)]

# Descriptive Statistics

In [None]:
#@title Generate Descriptive Statistics

# Descriptive stats: a summary table with one column per variable
# (the quartile snippets read its '25%', '50%', and '75%' rows)
data.describe()

In [None]:
#@title Measures of Central Tendency with Missing Data

# Test with Cancer Cases data ('x4' has missing data);
# print the mean, median, and mode of column 'x4'
print('Mean: ', data['x4'].mean())
print('Median: ', data['x4'].median())
print('Mode: ', data['x4'].mode())

# For the cancer cases dataset, no mode exists in column 'x4'.
# No warnings or errors appear when performing central tendency
# measurements on a column with missing data.

In [None]:
#@title Mean

# Mean of each numeric column.
# numeric_only=True skips text columns such as 'y'; without it,
# pandas 2.0 and later raise a TypeError when a text column is present.
data.mean(numeric_only=True)

In [None]:
#@title Median

# Median of each numeric column.
# numeric_only=True skips text columns such as 'y'; without it,
# pandas 2.0 and later raise a TypeError when a text column is present.
data.median(numeric_only=True)

In [None]:
#@title Mode

# Mode of every column
data.mode()

# The result below may show the whole data frame when
# this particular data frame has no mode.

In [None]:
#@title Minimum

# Minimum value of every column
data.min()

# Minimum in column 'x'
# data['x'].min()

In [None]:
#@title Maximum

# Maximum value of every column
data.max()

# Maximum in column 'x'
# data['x'].max()

In [None]:
#@title Range

# Range: largest value of column 'x' minus its smallest value
data['x'].max() - data['x'].min()

In [None]:
#@title First Quartile

# First quartile: the 25th percentile of column 'x'
first_quartile = data['x'].quantile(0.25)

In [None]:
#@title Second Quartile

# Second quartile: the 50th percentile of column 'x'
second_quartile = data['x'].quantile(0.5)

In [None]:
#@title Third quartile

# Third quartile: the 75th percentile of column 'x'
third_quartile = data['x'].quantile(0.75)

In [None]:
#@title Interquartile Range

# Interquartile range: third quartile minus first quartile of column 'x'.
# Both quartiles are computed here, so this snippet does not depend on
# the first and third quartile snippets having been run beforehand.
data['x'].quantile(0.75) - data['x'].quantile(0.25)

In [None]:
#@title Variance

# Variance of column 'x', using the method's default settings
# (the population variance snippet rescales this value by (N - 1) / N,
# i.e. this is the sample variance)
data['x'].var()

In [None]:
#@title Standard Deviation

# Standard deviation of column 'x', using the method's default settings
# NOTE(review): pandas' default is the sample standard deviation
# (ddof=1), unlike np.std, whose default is ddof=0 - confirm.
data['x'].std()

In [None]:
#@title Skewness in One Column

# Skewness for one specific column, 'x'
data['x'].skew()

In [None]:
#@title Measurement of Skewness for All Columns

# Skewness for all numeric columns: axis=0 gives one value per column,
# and numeric_only=True leaves non-numeric columns out
data.skew(axis=0, numeric_only=True)

In [None]:
#@title Measurement of Kurtosis

# Kurtosis for all numeric columns: axis=0 gives one value per column,
# and numeric_only=True leaves non-numeric columns out
data.kurtosis(axis=0, numeric_only=True)

In [None]:
#@title Generate Frequency Table

# Import libraries (pandas is needed for pd.Series and pd.DataFrame below)
import numpy as np
import pandas as pd

# Test data: Red Wine Quality
# Create two arrays: one with the set of distinct values in column 'y',
# and another with the number of occurrences of each value.
unique_vals, occurrences = np.unique(data['y'], return_counts=True)

# Create a Python dictionary object with
# column names as keys and
# Pandas series (built from the arrays above) as values.
freq_dist_dict = {
    'Value': pd.Series(unique_vals),
    'Frequency': pd.Series(occurrences),
}

# Frequency distribution table generation
freq_table = pd.DataFrame(freq_dist_dict)

# Display frequency table
freq_table

In [None]:
#@title Generate Frequency Table with Relative Frequency

# Import library
import numpy as np

# Helper that turns frequency counts into percentages of a total:
# each count is divided by the total and multiplied by 100.
def calculate_percentage(counts, total):
    """Return a list with each count expressed as a percentage of total.

    counts: iterable of numeric counts.
    total: the number that represents 100%.
    """
    return [(count / total) * 100 for count in counts]

# Test data: Wine Quality
# Here, we have 2 ndarrays: one is the set of distinct values in
# column 'y', and the other holds the occurrences of each value.
unique_vals, occurrences = np.unique(data['y'], return_counts=True)

# Find relative frequency using customized function, calculate_percentage;
# the total is the number of rows in column 'y'.
rel_freq = calculate_percentage(occurrences, len(data['y']))

# Pandas series objects for each column in data frame to be created later.
s1 = pd.Series(unique_vals)
s2 = pd.Series(occurrences)
s3 = pd.Series(rel_freq)

# Create a Python dictionary object with
# column names as keys and the
# Pandas series (above) as values.
freq_dist_dict = {
    'Value': s1,
    'Frequency': s2,
    'Percent': s3,
}

# Frequency distribution table
freq_table_w_rel_freq = pd.DataFrame(freq_dist_dict)

# Display frequency table (without this line the table is built
# but nothing is shown)
freq_table_w_rel_freq

In [None]:
#@title Calculate Intervals Around Mean

# Mean
mean = data['x'].mean()

# Standard deviation
sdev = data['x'].std()

# Get user input on number of standard deviations;
# float() accepts whole numbers as well as values such as 1.96
num_sd = float(input('Number of standard deviation away from mean: '))

# Display interval around mean as (lower limit, upper limit)
(mean - num_sd * sdev, mean + num_sd * sdev)

In [None]:
#@title Calculate Intervals Around Mean By Using Numpy to Find Standard Deviation

# Import library (so this snippet also runs on its own)
import numpy as np

# Mean
mean = data['x'].mean()

# Sample standard deviation: ddof=1 divides by N - 1
# (use ddof=0 for the population standard deviation)
sdev = np.std(data['x'], ddof=1)

# Set to one standard deviation; displayed as (lower limit, upper limit)
(mean - 1 * sdev, mean + 1 * sdev)

In [None]:
#@title Calculate Intervals Around Mean (Numpy and User Input)

# Import library (so this snippet also runs on its own)
import numpy as np

# Mean
mean = data['x'].mean()

# Sample standard deviation: ddof=1 divides by N - 1
# (use ddof=0 for the population standard deviation)
sdev = np.std(data['x'], ddof=1)

# Get user input on number of standard deviations;
# float() accepts whole numbers as well as values such as 1.96
num_sd = float(input('Number of standard deviations away from mean: '))

# Set to the number of standard deviations set by user;
# displayed as (lower limit, upper limit)
(mean - num_sd * sdev, mean + num_sd * sdev)

In [None]:
#@title Determine Population Variance

# Population variance: ddof=0 divides by N instead of N - 1.
# pandas leaves missing values out of both the sum and N, so the result
# stays correct when data['x'] contains NA (rescaling the sample variance
# with len() would count the missing rows in N).
data['x'].var(ddof=0)

In [None]:
#@title Determine Population Standard Deviation

# Import library
from math import sqrt

# Population standard deviation: square root of the population variance.
# ddof=0 divides by N instead of N - 1, and pandas leaves missing values
# out of both the sum and N, so the result stays correct when data['x']
# contains NA.
sqrt(data['x'].var(ddof=0))

In [None]:
#@title Add an Outlier Score (Ver. 2)

# Create a new single score equal to three times the mean
new_score = data['x'].mean() * 3

# Build a new series from column 'x' plus the outlier.
# pd.concat returns a new object, so data['x'] itself is not changed;
# it is used because Series.append was removed in pandas 2.0.
with_outlier = pd.concat([data['x'], pd.Series([new_score])])

# Show the mean of the new variable
with_outlier.mean()