# 1.1 - Introduction to Exploratory Data Analysis

In [None]:
import pandas as pd 
df_swing = pd.read_csv('2008_swing_states.csv')
df_swing[['state', 'county', 'dem_share']]

# 1.2 - Plotting a histogram

#### > Generating a histogram

In [None]:
import matplotlib.pyplot as plt
_ = plt.hist(df_swing['dem_share'])
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('number of counties')
plt.show()

#### > Setting the bins of a histogram

In [None]:
 bin_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
 _ = plt.hist(df_swing['dem_share'], bins=bin_edges)
plt.show()

In [None]:
_ = plt.hist(df_swing['dem_share'], bins=20)
plt.show()

#### > Setting Seaborn styling

In [None]:
import seaborn as sns
sns.set()
_ = plt.hist(df_swing['dem_share'])
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('number of counties')
plt.show() 

# 1.3 - Plot all of your data: Bee swarm plots

#### > Generating a bee swarm plot

In [None]:
_ = sns.swarmplot(x='state', y='dem_share', data=df_swing)
_ = plt.xlabel('state')
_ = plt.ylabel('percent of vote for Obama')
plt.show()

# 1.4 - Plot all of your data: ECDFs

In [None]:
import numpy as np
x = np.sort(df_swing['dem_share'])
y = np.arange(1, len(x)+1) / len(x)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()

# 1.5 - Onward toward the whole story!

# 2.1 - Introduction to summary statistics:The sample mean and median

#### > Mean vote percentage

In [3]:
import numpy as np
np.mean(dem_share_PA)
45.476417910447765

NameError: name 'dem_share_PA' is not defined

#### > Computing the median

In [None]:
np.median(dem_share_UT)
22.469999999999999

# 2.2 - Percentiles,outliers,and box plots

#### > Computing percentiles

In [None]:
np.percentile(df_swing['dem_share'], [25, 50, 75])
array([ 37.3025, 43.185 , 49.925 ])

#### > Generating a box plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
_ = sns.boxplot(x='east_west', y='dem_share', data=df_all_states)
_ = plt.xlabel('region')
_ = plt.ylabel('percent of vote for Obama')
plt.show()

# 2.3 - Variance and standard deviation

#### > Computing the variance

In [None]:
np.var(dem_share_FL)
147.44278618846064

#### > Computing the standard deviation

In [None]:
np.std(dem_share_FL)
12.142602117687158
np.sqrt(np.var(dem_share_FL))
12.142602117687158

# 2.4 - Covariance and the Pearson correlation coefficient

#### > Generating a scatter plot

In [None]:
_ = plt.plot(total_votes/1000, dem_share,marker='.', linestyle='none')
_ = plt.xlabel('total votes (thousands)')
_ = plt.ylabel('percent of vote for Obama')

# 3.1 - Probabilistic logic and statistical inference

# 3.2 - Random number generators and hacker statistics

#### > Simulating 4 coin flips

In [1]:
import numpy as np
np.random.seed(42)
random_numbers = np.random.random(size=4)
random_numbers
#array([ 0.37454012, 0.95071431, 0.73199394, 0.59865848])
heads = random_numbers < 0.5
heads
#array([ True, False, False, False], dtype=bool)
np.sum(heads)
#1

1

In [2]:
n_all_heads = 0 # Initialize number of 4-heads trials
for _ in range(10000):
    heads = np.random.random(size=4) < 0.5
    n_heads = np.sum(heads)
    if n_heads == 4:
        n_all_heads += 1

n_all_heads / 10000
#0.0621

0.0619

# 3.3 - Probability distributions and stories: The Binomial distribution

#### > Sampling from the Binomial distribution

In [4]:
np.random.binomial(4, 0.5)
#2
np.random.binomial(4, 0.5, size=10)
#array([4, 3, 2, 1, 1, 0, 3, 2, 3, 0])

array([1, 1, 2, 2, 1, 3, 2, 3, 3, 1])

#### > The Binomial PMF

In [5]:
samples = np.random.binomial(60, 0.1, size=10000)

#### > The Binomial CDF

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
sns.set()
x, y = ecdf(samples)
_ = plt.plot(x, y, marker='.', linestyle='none')
plt.margins(0.02)
_ = plt.xlabel('number of successes')
_ = plt.ylabel('CDF')
plt.show()

# 3.4 - Poisson processes and the Poisson distribution

#### > The Poisson CDF

In [None]:
samples = np.random.poisson(6, size=10000)
x, y = ecdf(samples)
_ = plt.plot(x, y, marker='.', linestyle='none')
plt.margins(0.02)
_ = plt.xlabel('number of successes')
_ = plt.ylabel('CDF')
plt.show()

# 4.1 - Probability density functions

# 4.2 - Introduction to the Normal distribution

#### > Checking Normality of Michelson data

In [None]:
import numpy as np
mean = np.mean(michelson_speed_of_light)
std = np.std(michelson_speed_of_light)
samples = np.random.normal(mean, std, size=10000)
x, y = ecdf(michelson_speed_of_light)
x_theor, y_theor = ecdf(samples) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
_ = plt.plot(x_theor, y_theor)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('speed of light (km/s)')
_ = plt.ylabel('CDF')
plt.show()

# 4.3 - The Normal distribution: Properties and warnings

# 4.4 - The Exponential distribution

#### > Exponential inter-incident times

In [None]:
mean = np.mean(inter_times)
samples = np.random.exponential(mean, size=10000)
x, y = ecdf(inter_times)
x_theor, y_theor = ecdf(samples)
_ = plt.plot(x_theor, y_theor)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('time (days)')
_ = plt.ylabel('CDF')
plt.show()