<a href="https://colab.research.google.com/github/vectrlab/apex-stats-snippets/blob/main/snippet_library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# APEX STATS Code Snippets

# Data Setup

In [None]:
#@title Setup Blank Data

# pandas supplies the DataFrame type
import pandas as pd
# start with an empty dataframe (no rows, no columns) to be filled in later
df = pd.DataFrame(data=None)

In [None]:
#@title Setup Example Data

# import library
import pandas as pd 
# read data file: Health Nutrition and Population
# (the file is downloaded from GitHub, so network access is required)
data = pd.read_csv("https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/healthnutritionandpopulation/example.csv") 
# display the data
data

In [None]:
#@title Setup Example Data

# import library
import pandas as pd 
# read data file: Red Wine Quality
# (the file is downloaded from GitHub, so network access is required)
data = pd.read_csv("https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/red-wine-quality/example.csv") 
# display the data
data 

In [None]:
#@title Setup Example Data

# import library
import pandas as pd 
# read data file: Connecticut Housing
# (the file is downloaded from GitHub, so network access is required)
data = pd.read_csv("https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/connecticut-housing/example.csv") 
# display the data
data 

In [None]:
#@title Setup Example Data

# import library
import pandas as pd 
# read data file: FIFA19 data
# (the file is downloaded from GitHub, so network access is required)
data = pd.read_csv("https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/fifa19/example.csv") 
# display the data
data 

In [None]:
#@title Setup Example Data

# import library
import pandas as pd
# read the data file inside the try block, so that a failed download or an
# unreadable file is reported with a message instead of stopping the notebook
try:
    data = pd.read_csv("https://raw.githubusercontent.com/vectrlab/apex-stats-datasets/main/fifa19/example.csv")
    print("The data were loaded.")
# OSError covers network and file problems; ValueError covers
# pandas parsing errors (empty or malformed file)
except (OSError, ValueError) as error:
    print(f"There was a problem loading the data: {error}")

# Population vs Sample

In [None]:
#@title Generate a population distribution

# import libraries
import random
import pandas as pd
# fix the seed so that everyone gets the same values;
# comment out the line below to generate
# a unique population each time
random.seed(21715)
# population size
N = 1000
# specify the maximum value
max_value = 100.0
# draw N values at random from a uniform distribution between 0 and max_value
population = [max_value * random.random() for i in range(N)]
# store the population as column X of the dataframe
df = df.assign(X=population)

In [None]:
#@title Population or sample size

# number of entries in column x
data["x"].size

In [None]:
#@title Population or sample mean

# import library
import numpy as np 
# display the arithmetic mean of column x
# (the calculation is the same whether x holds a population or a sample)
np.mean(data["x"]) 

In [None]:
#@title Population standard deviation

# import library
import numpy as np 
# display the population standard deviation;
# ddof=0 means the variance is calculated with N in the denominator
np.std(data["x"], ddof=0) 

In [None]:
#@title Sample standard deviation

# import library
import numpy as np 
# display the sample standard deviation; 
# ddof is delta degrees of freedom and 
# N - ddof is used in the variance calculation,
# so ddof=1 gives the N - 1 denominator
np.std(data["x"], ddof=1) 

In [None]:
#@title Generate a random sample

# import libraries
import random
import numpy as np
# number of scores to draw
n = 5
# draw a single sample of n scores from column x
# (np.random.choice samples with replacement unless replace=False is given)
one_sample = np.random.choice(data["x"], size=n)
# show the sample data
one_sample

In [None]:
#@title Generate a sampling distribution

# import libraries
import random
import numpy as np
# how many samples to include in the sampling distribution
num_samples = 2000
# sample size
n = 5
# define a function
def drange():
    """Return a slice covering n consecutive positions at a random offset."""
    # valid starting spots run from 0 through len - n; randrange excludes
    # its upper limit, so the limit must be len - n + 1 for the last
    # positions in the data to have a chance of being selected
    start = random.randrange(0, len(data["x"]) - n + 1)
    # select n values starting from that spot -
    # Dave note: We may need to fix this, as the selection
    # will be biased if data are in a nonrandom order
    return slice(start, start + n)

# assemble the sampling distribution by finding means of repeated samples;
# .iloc applies the slice by position, whatever the index labels are
sampling_dist = [np.mean(data["x"].iloc[drange()]) for _ in range(num_samples)]
# show the sampling distribution
sampling_dist

In [None]:
#@title Calculate standard error

# import library
import numpy as np 
# sample size
# NOTE(review): hard-coded to 5, the same value the sampling snippets use;
# presumably it should be edited to match the actual sample size — confirm
n = 5 
# find standard error: standard deviation (N - 1 denominator) divided by sqrt(n)
se = np.std(data["x"], ddof=1) / np.sqrt(n)
# display result
se

# Confidence Interval

In [None]:
#@title Calculate 95% Confidence Interval, extended version
import numpy as np # import library
from scipy.stats import norm
# sample size: the number of non-missing observations in the column,
# so that it always matches the data used for the mean and standard deviation
n = data["X"].count()
se = np.std(data["X"], ddof=1) / np.sqrt(n) # standard error of the mean
# cumulative area below the upper bound of the interval:
# the 95% inside the interval plus the 2.5% tail below it
tail = 1 - (1-.95)/2
z = norm.ppf(tail) # z-score with 97.5% of the area below it
lower_bound = np.mean(data["X"]) - z * se
upper_bound = np.mean(data["X"]) + z * se
# display the confidence interval
(lower_bound, upper_bound)

In [None]:
#@title Calculate 95% Confidence Interval, simple version
from scipy import stats # import library
import numpy as np 
mu = np.mean(data["X"]) # sample mean
sigma = np.std(data["X"], ddof=1) # sample standard deviation
n = data["X"].count() # number of non-missing observations
conf_level = 0.95
# a confidence interval for the mean is built from the standard error
# (sigma / sqrt(n)); using sigma itself as the scale would instead give
# the range expected to contain 95% of individual scores
stats.norm.interval(conf_level, loc=mu, scale=sigma / np.sqrt(n))

# Z-score

In [None]:
#@title Generate Z-scores

from scipy import stats
# create a new data column z containing the z-scores of column x
# (this adds the column to the data dataframe itself)
data['z'] = stats.zscore(data["x"]) 
# show data column
data['z']

In [None]:
#@title Apply The Z-score Formula to User Inputs
# z = (score - mean)/pop_sd

# collect the three ingredients of the formula from the user:
# the score, the mean, and the standard deviation
raw = float(input("Score = "))
mean = float(input("Mean = "))
sd = float(input("Input the standard deviation: "))
# apply the formula, keeping 2 decimal places
z = (raw - mean)/sd
z = round(z, 2)
# report the z-score
print(f"z = {z}")

In [None]:
#@title Generate Raw Scores

import numpy as np
from scipy import stats

# population standard deviation of data column x
sd = np.std(data['x'])
# mean of data column x
mean = data['x'].mean()
# convert each z-score back into a raw score with the formula
# raw = z * sd + mean, collecting the results in a list
raw_scores = [z_score * sd + mean for z_score in data['z']]

# display result
raw_scores

In [None]:
#@title Apply Raw Score Formula to User Inputs
# score = z * sd + mean

# collect the three ingredients of the formula from the user:
# the z-score, the mean, and the standard deviation
z = float(input("z = "))
mean = float(input("Mean: "))
sd = float(input("Input the standard deviation: "))
# find the raw score with the formula, keeping 2 decimal places
raw = z * sd + mean
raw = round(raw, 2)
# report the raw score
print(f"Score = {raw}")

# CDF and Inverse CDF

In [None]:
#@title Cumulative Distribution Function

# import norm from scipy.stats
from scipy.stats import norm
# cdf of a standard normal distribution (mean 0, standard deviation 1)
norm.cdf(1.45, loc=0, scale=1) 

# when we specify 1.45, we are interested in the area 
# under the curve to the left of the point 1.45
# loc is where you specify your mean
# scale is where you specify your standard deviation

In [None]:
#@title Find Area Under the Curve

from scipy.stats import norm
# parameters of the normal distribution
mean = float(input("Specify your mean: "))
stdev = float(input("Specify your standard deviation: "))
# interval whose area is wanted
lower_bound = float(input("Specify your lower bound: "))
upper_bound = float(input("Specify your upper bound: "))
# area to the left of each bound
smaller_area = norm.cdf(lower_bound, loc=mean, scale=stdev)
larger_area = norm.cdf(upper_bound, loc=mean, scale=stdev)
# the area between the bounds is the larger area (up to the upper bound)
# minus the smaller area (up to the lower bound)
area_under_curve = larger_area - smaller_area
# print result; round answer to two decimal places
print(f"Area under the curve from {lower_bound} to {upper_bound} is {round(area_under_curve, 2)}")

In [None]:
#@title Inverse Cumulative Distribution Function

from scipy.stats import norm
# insert the mean
mean = float(input("Specify your mean: "))
# insert the standard deviation
stdev = float(input("Specify your standard deviation: "))
# insert percentile as a proportion
percentile = float(input("Specify the percentile (from 0-1 range): "))
# find inverse norm result with norm.ppf method
inv_norm_ans = norm.ppf(percentile, loc=mean, scale=stdev)
# print result; round answer to two decimal places.
# The message is split into two adjacent f-strings: a backslash continuation
# inside a single string would copy the next line's indentation into the output.
# The :g format avoids truncation errors such as int(0.29 * 100) == 28
# and keeps fractional percentiles such as 97.5.
print(f"With mean {mean} and standard deviation {stdev}, "
      f"the {percentile * 100:g}th percentile is {round(inv_norm_ans, 2)}")

# norm.ppf is the inverse of norm.cdf: given the area under the curve to the
# left of a score, it returns that score
# for example, when we specify 0.95, we indicate the area under the curve to be 0.95
# with loc to be our mean and scale to be our standard deviation
# In R, it is qnorm(0.95, 0, 1) or qnorm(0.05,0,1,lower.tail=FALSE) 
# with mean 0 and standard deviation 1.

# Understand Data

In [None]:
#@title Sampling Cases From DataFrame
# draw 10 rows at random from df; random_state=1 fixes the seed
# so that the same rows are selected on every run
new_df = df.sample(n=10, random_state=1)
# display the sampled rows
new_df

In [None]:
#@title Find Missing Values in Dataframe Columns
# display the status of each cell in column X
# (False: filled with data; True: missing data)
data["X"].isna()

In [None]:
#@title Find Missing Values in DataFrame
# display the status of each cell in every column
# (False: filled with data; True: missing data)
data.isna()

In [None]:
#@title Find Records Where NaN Exists
# entries of column X that are missing (NaN)
data.loc[data["X"].isna(), "X"]

In [None]:
#@title Discover Data Types
import pandas as pd
import numpy as np

# ten random integers, each either 0 or 1
zeros_and_ones = np.random.randint(0, 2, size=10)
# create a dataframe with a single variable X holding the integer data
df = pd.DataFrame({"X": zeros_and_ones})
# find out the datatype of the data in df["X"]
df.dtypes

In [None]:
#@title Convert into Category Datatype
# turn integers into categories (factors in R); this overwrites column X of df
df["X"] = df["X"].astype("category")
# confirm the datatype of each column
df.dtypes

In [None]:
#@title Convert Datatype and Insert a New Column into Dataframe
# turn integers into categories (factors in R) and store the result
# as a new column X6; column X itself is left as it was
data["X6"] = data["X"].astype("category")
# confirm the datatype of each column
data.dtypes

# Hypothesis Testing

In [None]:
#@title One Sample T-test
from scipy import stats
# arg1: sample observations, arg2: expected value in the null hypothesis
# (39 is an example value; replace it with the mean being tested against)
result = stats.ttest_1samp(data["X"], 39)

# To extract details from result, see comments below
# result.statistic
# result.pvalue

In [None]:
#@title Independent Samples T-test
from scipy import stats
# arg1: scores of sample 1, arg2: scores of sample 2
result = stats.ttest_ind(data["X"], data["X1"])

# To extract details from result, see comments below
# result.statistic
# result.pvalue

In [None]:
#@title Paired Samples T-test
from scipy import stats
# arg1: sample 1, arg2: sample 2; the two columns are compared row by row
result = stats.ttest_rel(data["X"], data["X1"])

# To extract details from result, see comments below
# result.statistic
# result.pvalue

In [None]:
#@title Correlation Analysis
from scipy import stats
# pearsonr returns a pair of values: (r, p-value)
corr_test = stats.pearsonr(data["X"], data["X1"])
r_value, p_value = corr_test
print(r_value) # r: Pearson's correlation coefficient
print(p_value) # p-value: two-tailed p-val

In [None]:
#@title Simple Regression
from scipy import stats
# arg1: predictor (x) values, arg2: outcome (y) values
result = stats.linregress(data["X"], data["X1"])

# More about result
# result.slope
# result.intercept
# result.rvalue
# result.pvalue
# result.stderr

In [None]:
#@title One-Way Chi-square Test
from scipy.stats import chisquare
# NOTE(review): chisquare treats its first argument as the observed frequency
# of each category, so this assumes column X already holds counts rather than
# raw scores — confirm. No expected frequencies are passed here.
result = chisquare(data["X"])
print(result[0]) # statistic
print(result[1]) # pvalue

In [None]:
#@title One-way ANOVA
from scipy import stats
# each argument holds the scores of one group; more groups can be added
result = stats.f_oneway(data["X"], data["X1"])
print(result.statistic) # F statistic
print(result.pvalue) # p-value

# Data Visualization

In [None]:
#@title Kernel Density Estimate Plot
import seaborn as sns
# fill shades the area under the curve; bw_adjust scales the smoothing
# bandwidth (larger values give a smoother curve)
sns.kdeplot(data=data["x"], fill=True, bw_adjust=3, color="green")

In [None]:
#@title Dot Plot
import seaborn as sns

# Test code with Red wine quality data
# Create and display dot plot.
# x and y are passed by keyword: current seaborn releases no longer
# accept them as positional arguments.
sns.stripplot(x=data["y"], y=data["x10"], color="purple")

In [None]:
#@title Box Plot (No Parameter Adjustment)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Connecticut housing data
# adjust figure size (width, height in inches)
plt.figure(figsize=(15,8))
# draw one box of x values for each category in y
sns.boxplot(x="y", y="x", data=data) 

In [None]:
#@title Box Plot (Adjust Color and Plot One Candlestick)
# import libraries here as well, so the snippet runs on its own
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Connecticut housing data
# keep only the rows for one county
fairfield = data[data['y'] == 'Fairfield']
# adjust figure size
plt.figure(figsize=(3,6))
# draw box plot
sns.boxplot(x="y", y="x", data=fairfield, color="green")

In [None]:
#@title Regression Line
import seaborn as sns
import matplotlib.pyplot as plt

# Plot x against x1 together with a fitted linear regression line
sns.lmplot(x="x", y="x1", data=data) 
# Save the plot to the file linear_regression.png
plt.savefig('linear_regression.png') 

In [None]:
#@title Histogram (No Parameter Adjustment)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Drinks Data
# Adjust the figure size (width, height in inches)
plt.figure(figsize=(15,8)) 
# Plot histogram; the whole dataframe is passed, not a single column
sns.histplot(data=data) 

In [None]:
#@title Histogram (Binwidth and KDE)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Drinks Data
# Adjust figure size (width, height in inches)
plt.figure(figsize=(15,8)) 
# Plot histogram of column x with bins 10 units wide;
# kde=True overlays a kernel density estimate curve
sns.histplot(data=data['x'], binwidth=10, kde=True) 

In [None]:
#@title Histogram (Color)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Health Nutrition and Population Data
# We have x as a continuous variable

# Adjust figure size (width, height in inches)
plt.figure(figsize=(15,8)) 
# Plot histogram of column x with bins 4 units wide, drawn in brown
sns.histplot(data=data['x'], binwidth=4, color="brown") 

In [None]:
#@title Histogram with automatic binning and color

# import library
import seaborn as sns 
# Display the histogram of column x; no bin or color settings are given,
# so seaborn chooses them
sns.histplot(data["x"]) 

In [None]:
#@title Histogram with automatic binning and custom color
# color names that work should include 
# https://matplotlib.org/stable/gallery/color/named_colors.html

# import library
import seaborn as sns
# Get user input on histogram color
custom_color = input("Type the name of a color : ")
# Display the histogram; no bin settings are passed, so that the
# binning really is chosen automatically
sns.histplot(data["x"], color = custom_color)

In [None]:
#@title Histogram with custom binning and custom color
# color names that work should include 
# https://matplotlib.org/stable/gallery/color/named_colors.html

# import library
import seaborn as sns
# Get user input for color
custom_color = input("Type the name of a color : ")
# Get user input for bins; float accepts whole numbers as well as
# fractional widths such as 0.5
custom_binwidth = float(input("Enter the width of the bins : "))
# Display the histogram
sns.histplot(data["x"], color = custom_color, binwidth = custom_binwidth)

In [None]:
#@title Count Plot (Horizontal)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code below with FIFA19 data
# tall figure, because the bars run horizontally (one per category of x5)
plt.figure(figsize=(8,15))
sns.set_theme(style="darkgrid")
# passing the column as y puts the categories on the vertical axis
# NOTE(review): newer seaborn releases may warn when palette is given
# without hue — confirm against the installed version
ax = sns.countplot(y="x5", data=data,
                  palette="Set1")

# style parameter of set_theme method can take 
# darkgrid, whitegrid, dark, white, and ticks as args
# palette's possible arguments are Set1, Set2, Set3
# hue parameter will take data column name from 
# dataframe to display legends

In [None]:
#@title Count Plot (Vertical)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code below with FIFA19 data
# wide figure, because the bars run vertically (one per category of x)
plt.figure(figsize=(15,8))
sns.set_theme(style="dark")
# passing the column as x puts the categories on the horizontal axis
# NOTE(review): newer seaborn releases may warn when palette is given
# without hue — confirm against the installed version
ax = sns.countplot(x="x", data=data,
                   palette="Set2") 

# style parameter of set_theme method can take 
# darkgrid, whitegrid, dark, white, and ticks as args
# palette's possible arguments are Set1, Set2, Set3
# hue parameter will take data column name from 
# dataframe to display legends

In [None]:
#@title Count Plot (Customization)
import seaborn as sns

# Test code with Wine Quality data
sns.set_theme(style="darkgrid")
# linewidth sets the thickness of the bar outlines;
# edgecolor takes six colors from the "dark" palette for those outlines
ax = sns.countplot(y="y", data=data, linewidth=2, 
                  edgecolor=sns.color_palette("dark", 6))

In [None]:
#@title Bar Plot (No Parameter Adjustment)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Connecticut housing data
# Adjust figure size (width, height in inches)
plt.figure(figsize=(15,8)) 
# Draw bar plot: one bar of x1 for each category in y
sns.barplot(x="y", y="x1", data=data) 

In [None]:
#@title Bar plot (Adjust Color)
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Connecticut housing data
# Adjust figure size (width, height in inches)
plt.figure(figsize=(15,8)) 
# Draw bar plot with every bar in the same color
sns.barplot(x="y", y="x1", data=data, color="orange") 

In [None]:
#@title Bar plot with One Bar
import seaborn as sns
import matplotlib.pyplot as plt

# Test code with Connecticut housing data
# filter data dataframe for information about one county
fairfield = data[data['y'] == 'Fairfield']
# narrow figure, since only one bar is drawn
plt.figure(figsize=(3,6))
# errcolor and errwidth set the color and thickness of the error bar
# NOTE(review): newer seaborn releases may replace these two parameters
# with err_kws — confirm against the installed version
sns.barplot(x="y", y="x1", data=fairfield, 
            color='brown', errcolor="red", errwidth=5)

# Data Generation: Discrete and Continuous

In [None]:
#@title Generation of Random Data: Discrete RV (Array Edition)
import numpy as np

# Seeded generator object, so the result is reproducible
rng = np.random.default_rng(seed=100)
# Range of integers to draw from
lower_bound = 1  # smallest possible value (inclusive)
upper_bound = 7  # one past the largest possible value (exclusive)
# Number of values in the output array; adjust as needed
array_size = 1
# Generate the random integer(s)
# example: rolling a die
rand_num = rng.integers(lower_bound, upper_bound, size=array_size)
# Display result
rand_num

In [None]:
#@title Generation of Random Data: Discrete RV (Matrix Edition)
import numpy as np

# Seeded generator object, so the result is reproducible
rng = np.random.default_rng(seed=100)
# Range of integers to draw from
lower_bound = 1  # smallest possible value (inclusive)
upper_bound = 7  # largest possible value (inclusive here, because of endpoint=True below)
# Shape of the two dimensional array of values: 2 rows, 4 columns
matrix_size = (2,4)
# endpoint=True includes upper_bound in the list of possible random numbers
list_num = rng.integers(lower_bound, upper_bound, size=matrix_size, endpoint=True)
# Display result
list_num

In [None]:
#@title Generation of Random Data: Discrete RV (Binomial Distribution)
import numpy as np

# Seeded generator object, so the result is reproducible
rng = np.random.default_rng(seed=100)

# Parameters of the binomial distribution
num_trials = 10   # number of trials in each experiment
p_success = 0.5   # probability of success on each trial
num_exp = 1000    # number of experiments to simulate
# Draw the number of successes for every experiment
sample = rng.binomial(n=num_trials, p=p_success, size=num_exp)
# See sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Array Edition)
import numpy as np

# Seeded generator object, so the result is reproducible
rng = np.random.default_rng(seed=42)

# Parameters of the normal distribution and the number of values to draw
mean = 3
standard_deviation = 2
sample_size = 20
# loc is the mean, scale is the standard deviation, size is the sample size;
# when omitted they default to 0, 1, and None respectively
sample = rng.normal(mean, standard_deviation, sample_size)
# Display sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Matrix Edition)
import numpy as np

# Create a generator object
rng = np.random.default_rng(seed=42)

# Specify parameters for normal distribution
# (defined here so that the snippet runs on its own)
mean = 3
standard_deviation = 2
# Specify n-dimensional array of samples
matrix_size = (2,4)
sample = rng.normal(loc=mean, scale=standard_deviation, size=matrix_size) # 2 x 4 array
# Display sample
sample

In [None]:
#@title Generation of Random Data: Continuous RV (Multiple Means)
import numpy as np

# Seeded generator object, so the result is reproducible
rng = np.random.default_rng(seed=42)

# One mean and one standard deviation per column:
# loc takes [mean1, mean2] and scale takes [std_dev1, std_dev2]
loc_list = [0,5]
deviation_list = [1,2]
# The number of columns has to match the length of the lists above,
# so in this example column size (2) matches with array size (2)
row = 5
column = 2
# Generate sample: 5 rows, with one column for each mean / standard deviation pair
sample = rng.normal(loc_list, deviation_list, (row, column))
# Display sample
sample

# Arithmetic with DataFrames

In [None]:
#@title Adding Two Columns 
# row-by-row sum of X and X1; show the first 5 results
(data["X"] + data["X1"]).head(5)

In [None]:
#@title Subtracting One Column from Another
# row-by-row difference X - X1; show the first 5 results
(data["X"] - data["X1"]).head(5)

In [None]:
#@title Multiplying Two Columns 
# row-by-row product of X and X1; show the first 5 results
(data["X"] * data["X1"]).head(5)

In [None]:
#@title Dividing One Column from Another
# row-by-row quotient X / X1; show the first 5 results
(data["X"] / data["X1"]).head(5)

In [None]:
#@title Summing All Values in Columns with Numeric Values 
# one total per column; numeric_only=True leaves out non-numeric columns
data.sum(numeric_only=True)

In [None]:
#@title Summing All Values for a Specified Column 
# total of the values in column X
data["X"].sum()

In [None]:
#@title Exponentiate a Specified Column 
# raise every value in column X to the power of 2
data["X"] ** 2

In [None]:
#@title Exponentiate Multiple Numeric Columns in Dataset
# select every row (:) and every column after the first (1:);
# the first column, Y, is excluded because it has string data type
data.iloc[:, 1:].pow(2)

In [None]:
#@title Calculate Roots
# below we perform square root (raising to the power 1/2)
# on every row (:) of every column after the first (1:)
data.iloc[:, 1:].pow(1/2)

In [None]:
#@title Create New Column on Dataframe
# store the squared values of column X as column X6
# (an existing column named X6 is overwritten)
data["X6"] = data['X'].pow(2)

In [None]:
#@title Rounding for One Dataframe Column
# show column X6 rounded to 1 decimal place; the result is not stored
data["X6"].round(1)

In [None]:
#@title Rounding for the Whole Dataframe
# show the dataframe rounded to 1 decimal place; the result is not stored
data.round(1)

In [None]:
#@title Find Missing Values
# True for every row where column X is missing
bool_series = data["X"].isna()
# keep only the rows flagged as missing
data.loc[bool_series]

# below result may or may not show missing data

In [None]:
#@title Sorting Dataframe 
# This is not an in-place sort: the sorted copy is only displayed
# We can assign the result dataframe 
# to the same variable to reflect the change
# data = data.sort_values(by=["Y"], ascending=True)
data.sort_values(by=["Y"], ascending=True)

In [None]:
#@title Selecting Cases in Dataframe
# keep only the rows where the value in column X is greater than 70
data[data["X"] > 70]

In [None]:
#@title Exclude a participant number from the cases
# keep the rows where X is greater than 70, except those where X equals 75
# (& combines the two conditions; each one needs its own parentheses)
data[(data["X"] > 70) & (data["X"] != 75)]

# Descriptive Statistics

In [None]:
#@title Generate Descriptive Statistics
# summary table of descriptive statistics; the quartile snippets below
# read its "25%", "50%" and "75%" rows
data.describe()

In [None]:
#@title Measures of Central Tendency with Missing Data

# Tested with Cancer cases data
# X4 has missing data
print("Mean: ", data["X4"].mean())
print("Median: ", data["X4"].median())
# mode() returns a series rather than a single number
print("Mode: ", data["X4"].mode()) 

# For cancer cases dataset, no mode exists in column X4
# No warnings or errors when performing central tendency measurements

In [None]:
#@title Mean
# mean of each numeric column; numeric_only=True leaves out columns
# with string data, which would otherwise raise an error in recent pandas
data.mean(numeric_only=True)

In [None]:
#@title Median
# median of each numeric column; numeric_only=True leaves out columns
# with string data, which would otherwise raise an error in recent pandas
data.median(numeric_only=True)

In [None]:
#@title Mode
# most frequent value(s) in each column
data.mode()

# below result may show the dataframe
# due to the fact that this particular
# dataframe has no mode

In [None]:
#@title Minimum
data.min() # find minimum in each column

# To find the minimum in column X only:
# data["X"].min()

In [None]:
#@title Maximum 
data.max() # find maximum in each column

# To find the maximum in column X only:
# data["X"].max()

In [None]:
#@title Range
# range = largest value minus smallest value in column X
data["X"].max() - data["X"].min()

In [None]:
#@title First Quartile
# the first quartile is the 25th percentile of column X
first_quartile = data["X"].quantile(0.25)

In [None]:
#@title Second Quartile
# the second quartile is the 50th percentile of column X
second_quartile = data["X"].quantile(0.5)

In [None]:
#@title Third quartile
# the third quartile is the 75th percentile of column X
third_quartile = data["X"].quantile(0.75)


In [None]:
#@title Interquartile Range
# the quartiles are found here, so that the snippet runs on its own
first_quartile = data["X"].quantile(0.25)
third_quartile = data["X"].quantile(0.75)
# interquartile range = third quartile minus first quartile
third_quartile - first_quartile

In [None]:
#@title Variance
# variance of column X; the population variance snippet in this notebook
# rescales this value by (N - 1) / N, i.e. this is the N - 1 (sample) form
data["X"].var()

In [None]:
#@title Standard Deviation
# standard deviation of column X, using the pandas default settings
data["X"].std()

In [None]:
#@title Skewness in One Column
# skewness of the values in column X
data["X"].skew()

In [None]:
#@title Measurement of Skewness for All Columns 
# axis=0 gives one value per column;
# numeric_only=True leaves out non-numeric columns
data.skew(axis=0, numeric_only=True)

In [None]:
#@title Measurement of Kurtosis
# axis=0 gives one value per column;
# numeric_only=True leaves out non-numeric columns
data.kurtosis(axis=0, numeric_only=True)

In [None]:
#@title Generate Frequency Table 
import numpy as np
# pandas is needed for the Series and DataFrame objects built below
import pandas as pd

# Test data: Wine Quality
# here, we have 2 ndarrays: one is a set of values, and 
# the other indicates the occurrences of each value
unique_vals, occurrences = np.unique(data['Y'], return_counts=True)

# below is python dictionary object whose keys are names of columns 
# and values are pandas series built from the arrays above
freq_dist_dict = {
    "Value": pd.Series(unique_vals),
    "Frequency": pd.Series(occurrences),
} 

freq_table = pd.DataFrame(freq_dist_dict) # frequency distribution table

freq_table # display data

In [None]:
#@title Generate Frequency Table with Relative Frequency
import numpy as np
# pandas is needed for the Series and DataFrame objects built later in this cell
import pandas as pd

def calculate_percentage(counts, total):
  """Return each count as a percentage of total.

  counts: iterable of frequencies
  total: number that the counts are divided by
  Returns a list with one percentage for each count, in the same order.
  """
  # divide each count by the total number of counts, multiplied by 100
  return [(each/total)*100 for each in counts]

# Test data: Wine Quality
# np.unique gives 2 ndarrays: the distinct values in column Y, and
# the number of occurrences of each value
unique_vals, occurrences = np.unique(data['Y'], return_counts=True)

# relative frequency (in percent) of each value, found with
# the customized function calculate_percentage
total_count = len(data['Y'])
rel_freq = calculate_percentage(occurrences, total_count)

# one pandas series for each column of the dataframe created below
s1, s2, s3 = pd.Series(unique_vals), pd.Series(occurrences), pd.Series(rel_freq)

# python dictionary object: the keys are the column names
# and the values are the pandas series above
freq_dist_dict = dict(Value=s1, Frequency=s2, Percent=s3)

# frequency distribution table
freq_table_w_rel_freq = pd.DataFrame(freq_dist_dict)

In [None]:
#@title Calculate Intervals Around Mean

# Find mean of data in column 'x'
mean = data['x'].mean()
# Find standard deviation from data in column 'x'
sdev = data['x'].std()
# Get user input on number of standard deviation
# (int() accepts whole numbers only)
num_sd = int(input("Number of standard deviation away from mean: "))
# Display interval around mean as (lower limit, upper limit)
(mean - num_sd * sdev, mean + num_sd * sdev)

In [None]:
#@title Calculate Intervals Around Mean By Using Numpy to Find Standard Deviation
# import library, so that the snippet runs on its own
import numpy as np

mean = data["x"].mean()
# Find the sample standard deviation (ddof=1 gives the N - 1 denominator)
sdev = np.std(data["x"], ddof=1)
# Set to one standard deviation; displayed as (lower limit, upper limit)
(mean - 1 * sdev, mean + 1 * sdev)

In [None]:
#@title Calculate Intervals Around Mean (Numpy and User Input)
# import library, so that the snippet runs on its own
import numpy as np

mean = data["x"].mean()
# Find the sample standard deviation (ddof=1 gives the N - 1 denominator)
sdev = np.std(data["x"], ddof=1)
# Get user input on number of standard deviation
num_sd = int(input("Number of standard deviation away from mean: "))
# Set to the number of standard deviation set by user;
# displayed as (lower limit, upper limit)
(mean - num_sd * sdev, mean + num_sd * sdev)

In [None]:
#@title Determine Population Variance

# ddof=0 puts N in the denominator, which is the population formula.
# N is the number of non-missing values, so the result is also
# correct when data['x'] contains NA
data['x'].var(ddof=0)

In [None]:
#@title Determine Population Standard Deviation
from math import sqrt

# ddof=0 puts N in the denominator, which is the population formula.
# N is the number of non-missing values, so the result is also
# correct when data['x'] contains NA
sqrt(data['x'].var(ddof=0))

In [None]:
#@title Add an Outlier Score

# Scores from column 'x'; the column itself is not modified below,
# because concatenating builds a new series
series_x = data["x"]
# A single new score equal to three times the mean, wrapped in a series
outlier_score = data["x"].mean() * 3
series_of_new_scores = pd.Series([outlier_score])
# New series: the original scores followed by the new score
series_with_outlier = pd.concat([series_x, series_of_new_scores], ignore_index=True)
# Display the mean of the new variable
series_with_outlier.mean()

# verify that a new value has been added to our new series
# series_with_outlier.tail(3) # output the last three values in series

In [None]:
#@title Add an Outlier Score (Ver. 2)
import pandas as pd

# Create a new single score equal to three times the mean
new_score = data["x"].mean() * 3
# Build a new series holding column 'x' followed by the outlier.
# The result has to be assigned: combining series returns a new object
# and never changes the original in place (the older Series.append worked
# the same way and is no longer available in pandas 2).
with_outlier = pd.concat([data["x"], pd.Series([new_score])], ignore_index=True)
# Show the mean of the new variable
with_outlier.mean()