In [2]:
import statsmodels

import numpy as np
import scipy as sc
from matplotlib import pyplot as plt

from statsmodels.stats.proportion import proportion_confint, proportions_ztest, \
confint_proportions_2indep, test_proportions_2indep

In [3]:
import numpy as np
import pandas as pd
import scipy as sc
%pylab inline

from statsmodels.stats.weightstats import DescrStatsW, CompareMeans

Populating the interactive namespace from numpy and matplotlib


In [4]:
print("Wald's interval:", confint_proportions_2indep(189, 11034, 104, 11037, method = 'wald', compare='diff'))

Wald's interval: (0.004687750675049438, 0.010724297276960126)


In [5]:
test_proportions_2indep(189, 11034, 104, 11037, alternative = 'two-sided', method = 'score')

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 5.00127490046852
pvalue = 5.69524362691927e-07
compare = diff
method = score
variance = 2.374101369219409e-06
alternative = two-sided
prop1_null = 0.013275338679715464
prop2_null = 0.013275338679715464
tuple = (5.00127490046852, 5.69524362691927e-07)
diff = 0.0077060239760047815
ratio = 1.8178017944535074
odds_ratio = 1.8320539419087138
value = 0

In [6]:
x = (189/11034)/ (1-189/11034) / (  (104/11037) / (1-104/11037))
x

1.8320539419087138

In [7]:
def get_bootstrap_samples(x, n_resamples):
    indices = np.random.randint(0, len(x), (n_resamples, len(x)))
    resamples = x[indices]
    return resamples

In [8]:
def percentile_interval(stat, alpha):
    boundaries = np.percentile(stat, [100 * alpha / 2., 100 * (1 - alpha / 2.)])
    return boundaries

In [9]:
sample1 = np.array([1] * 104 + [0] * 10933)
sample2 = np.array([1] * 189 + [0] * 10845)

In [10]:
np.random.seed(0)

sample1_scores = list(map(np.median, get_bootstrap_samples(sample1, 1000)))
sample2_scores = list(map(np.median, get_bootstrap_samples(sample2, 1000)))

print("95% confidence interval for sample1", percentile_interval(sample1_scores, x))
print("95% confidence interval for sample2", percentile_interval(sample2_scores, x))

95% confidence interval for sample1 [0. 0.]
95% confidence interval for sample2 [0. 0.]


In [11]:
z = (9.57 - 9.5) / (0.4 / np.sqrt(160))
p = 2*(1-statsmodels.stats.norm.cdf(abs(z)))

AttributeError: module 'statsmodels.stats' has no attribute 'norm'

In [34]:
data = pd.read_csv('data/diamond_prices.csv', sep = ',', header = 0)

In [35]:
data.head()

Unnamed: 0,price,predicted_price_lm,predicted_price_gbdt
0,564,747.149466,799.049243
1,5914,6644.569397,6635.491541
2,2562,2096.573114,2138.584133
3,537,727.031366,694.10771
4,5964,7180.988674,7425.35356


In [36]:
lm_val = 0
gbdt_val = 0

for index, row in data.iterrows():
    lm_val += abs(row['price'] - row['predicted_price_lm'])
    gbdt_val += abs(row['price'] - row['predicted_price_gbdt'])

lm_result = lm_val / data.price.count()
gbdt_result = gbdt_val / data.price.count()

In [37]:
abs(lm_result - gbdt_result)

6.875650288841371

In [38]:
abs(data['price'] - data['predicted_price_lm']).mean() - abs(data['price'] - data['predicted_price_gbdt']).mean()

6.875650288835004

In [39]:
data['err'] = abs(data['price'] - data['predicted_price_lm']) - abs(data['price'] - data['predicted_price_gbdt'])

In [40]:
data

Unnamed: 0,price,predicted_price_lm,predicted_price_gbdt,err
0,564,747.149466,799.049243,-51.899776
1,5914,6644.569397,6635.491541,9.077855
2,2562,2096.573114,2138.584133,42.011019
3,537,727.031366,694.107710,32.923656
4,5964,7180.988674,7425.353560,-244.364886
...,...,...,...,...
13480,2239,2403.202635,2354.263323,48.939312
13481,1092,908.723195,896.414922,-12.308272
13482,3285,4534.975464,4638.855560,-103.880096
13483,3734,3465.940124,3381.763771,-84.176353


In [41]:
import scipy.stats as st


st.t.interval(alpha=0.95, df=len(data['err'])-1, loc=np.mean(data['err']), scale=st.sem(data['err'])) 

(1.219560753734485, 12.531739823935688)

In [43]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [45]:
data['mean_price_lm'] = mean_absolute_percentage_error(data['price'], data['predicted_price_lm'])
data['mean_price_gbdt'] = mean_absolute_percentage_error(data['price'], data['predicted_price_gbdt'])

data

Unnamed: 0,price,predicted_price_lm,predicted_price_gbdt,err,mean_price_lm,mean_price_gbdt
0,564,747.149466,799.049243,-51.899776,19.880733,20.261059
1,5914,6644.569397,6635.491541,9.077855,19.880733,20.261059
2,2562,2096.573114,2138.584133,42.011019,19.880733,20.261059
3,537,727.031366,694.107710,32.923656,19.880733,20.261059
4,5964,7180.988674,7425.353560,-244.364886,19.880733,20.261059
...,...,...,...,...,...,...
13480,2239,2403.202635,2354.263323,48.939312,19.880733,20.261059
13481,1092,908.723195,896.414922,-12.308272,19.880733,20.261059
13482,3285,4534.975464,4638.855560,-103.880096,19.880733,20.261059
13483,3734,3465.940124,3381.763771,-84.176353,19.880733,20.261059


In [46]:
from scipy import stats


print('T test:', sc.stats.ttest_rel(data.predicted_price_lm, data.predicted_price_gbdt))

T test: Ttest_relResult(statistic=-31.378409415754366, pvalue=1.1525972510574402e-208)


In [59]:
cm = CompareMeans(DescrStatsW(data.predicted_price_lm), DescrStatsW(data.predicted_price_gbdt))
print("95%% confidence interval: [%f, %f]" % cm.tconfint_diff(usevar='unequal'))

95% confidence interval: [-202.866133, -27.763321]


In [31]:
DescrStatsW(abs(res['price'] - res['predicted_price_lm']) - \
            abs(res['price'] - res['predicted_price_gbdt'])).tconfint_mean()

NameError: name 'res' is not defined