In [None]:
# Dataset credit to git@github.com:maxis42/Big-Data-Engineering-Coursera-Yandex.git

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) a single local Spark session. Going through the builder's
# getOrCreate() keeps this cell safe to re-run: constructing SparkContext
# directly fails when a context is already active in the kernel.
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("MyApp") \
    .getOrCreate()

# Expose the session's underlying context under the name `sc`, as before.
sc = spark.sparkContext

In [None]:
# Read the sample using its header row for column names and let Spark infer
# column types; without inference every CSV column is loaded as a string.
taxi_df = spark.read.csv('sample10000.csv', header=True, inferSchema=True)

In [None]:
# Print the column names and types Spark assigned to the loaded sample.
taxi_df.printSchema()

In [None]:
# Break the rides down by payment type, then count the rides whose
# payment_type code is 2 (the code this notebook treats as cash).
taxi_df.groupBy('payment_type').count().show()
taxi_df.where(taxi_df.payment_type == 2).count()

In [None]:
# Build a 99% confidence interval for the proportion of cash payers.
# What is its lower boundary?
# Keep only the cash rides (payment_type code 2).
cash_payers = taxi_df.where(taxi_df.payment_type == 2)

In [None]:
from pyspark.sql.functions import avg, stddev
# Mean fare among cash payers, displayed as a one-row table.
cash_payers.select(avg('fare_amount')).show()
# Keep the aggregate results; collect() returns a list of Row objects.
mean = cash_payers.agg({'fare_amount':'avg'}).collect()
std = cash_payers.agg({'fare_amount':'stddev'}).collect()
# Cast to float first: fare_amount may arrive from the CSV as strings, which
# pandas cannot take a numeric median of.
cash_payers.select('fare_amount').toPandas().astype(float).median()

In [None]:
import numpy as np

def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample. Any sequence NumPy can convert to an array
        is accepted (list, tuple, ndarray, ...).
    n_samples : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array whose first two dimensions are ``(n_samples, len(data))``; each
        row is one resample of the same length as ``data``.

    Raises
    ------
    ValueError
        If ``data`` is empty (there is nothing to resample from).
    """
    # Coerce to an ndarray so integer-array indexing below also works for
    # plain Python sequences, not only for NumPy arrays.
    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        raise ValueError("cannot draw bootstrap samples from empty data")
    # Uses NumPy's global RNG, so np.random.seed(...) controls reproducibility.
    indices = np.random.randint(0, n_obs, (n_samples, n_obs))
    return data[indices]

def stat_intervals(stat, alpha):
    """Return the two-sided percentile interval of ``stat`` at level ``alpha``.

    The bounds are the ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)``
    percentiles of ``stat``, returned as a two-element NumPy array
    ``[lower, upper]``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [None]:
# Seed NumPy's global RNG so the bootstrap resamples (and any interval derived
# from them) are reproducible across kernel restarts.
np.random.seed(42)

# Pull the cash-payer fares to the driver and convert them to plain floats.
# Going through the column avoids np.squeeze collapsing a one-row sample to a
# 0-d array, which cannot be iterated.
pd_fare = cash_payers.select('fare_amount').toPandas()
pd_fare_scalars = pd_fare['fare_amount'].astype(float).tolist()
# Draw 1000 bootstrap resamples, each the same length as the original sample.
samples = get_bootstrap_samples(np.asarray(pd_fare_scalars), 1000)

In [None]:
# One median per bootstrap resample, then the percentile interval at alpha = 0.01.
median_fare = [np.median(resample) for resample in samples]
stat_intervals(median_fare, 0.01)

In [None]:
# credit to https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data

import numpy as np
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    """Student-t confidence interval for the mean of ``data``.

    Returns a tuple ``(mean, lower, upper)`` where the half-width is the
    standard error of the mean times the t quantile at
    ``(1 + confidence) / 2`` with ``len(data) - 1`` degrees of freedom.
    """
    values = np.array(data) * 1.0
    dof = len(values) - 1
    center = np.mean(values)
    std_err = scipy.stats.sem(values)
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    return center, center - half_width, center + half_width

In [None]:
# 99% t-based interval for the mean cash fare; returns (mean, lower, upper).
mean_confidence_interval(pd_fare_scalars, confidence=0.99)

In [None]:
from statsmodels.stats.api import DescrStatsW
# Cross-check with statsmodels: confidence interval for the mean fare at alpha = 0.01.
DescrStatsW(pd_fare_scalars).tconfint_mean(alpha=0.01)

In [None]:
import numpy as np, scipy.stats as st
# Cross-check with SciPy: 99% Student-t interval centred on the sample mean of
# the cash fares and scaled by their standard error.
st.t.interval(
    0.99,
    len(pd_fare_scalars) - 1,
    loc=np.mean(pd_fare_scalars),
    scale=st.sem(pd_fare_scalars),
)

In [None]:
# Use the same sample to estimate the average trip distance in miles.
# The aggregate is computed over all rides (not only cash payers) and
# collected to the driver as a list of Row objects.

taxi_df.agg({'trip_distance':'avg'}).collect()

In [None]:
# What is the standard deviation of the estimator from the previous question?
# The estimator is the sample mean, so its standard deviation is the standard
# error s / sqrt(n), not the spread s of individual trips. count(trip_distance)
# skips nulls, so n counts only the rows that have a trip_distance value.
taxi_df.selectExpr(
    "stddev_samp(CAST(trip_distance AS DOUBLE)) / sqrt(count(trip_distance)) AS trip_distance_sem"
).collect()

In [None]:
# Calculate 95% confidence interval for the mean trip distance. What is the upper boundary?
# Pull the trip distances to the driver and convert them to plain floats.
pd_distance = taxi_df.select('trip_distance').toPandas()
pd_distance_scalars = list(map(float, np.squeeze(pd_distance.values)))
# Draw 1000 bootstrap resamples, each the same length as the original sample.
samples = get_bootstrap_samples(np.asarray(pd_distance_scalars), 1000)
# Bootstrap the mean (the statistic the question asks about), one value per resample.
mean_distance = list(map(np.mean, samples))
# alpha = 0.05 gives the 95% percentile interval; the second element is the upper boundary.
stat_intervals(mean_distance, 0.05)

In [None]:
# 95% Student-t interval for the mean trip distance, matching the confidence
# level the trip-distance question asks for (the fare section used 99%).
st.t.interval(
    0.95,
    len(pd_distance_scalars) - 1,
    loc=np.mean(pd_distance_scalars),
    scale=st.sem(pd_distance_scalars),
)

In [None]:
# Sample median of the trip distances, for comparison with the mean.
np.median(pd_distance_scalars)