# Statistische Grundlagen

In [29]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Emit inline figures as SVG.
%config InlineBackend.figure_formats = ['svg']
# Default size for inline figures: 5.0 x 3.0 (matplotlib's figsize is given in inches).
%config InlineBackend.rc = {'figure.figsize': (5.0, 3.0)}

In [30]:
import numpy as np
import pandas as pd

def convert_to_float(s):
    """Convert a raw value to ``float``, mapping anything unparsable to NaN.

    Parameters
    ----------
    s : object
        Raw value to convert (typically the string of a CSV cell).

    Returns
    -------
    float
        ``float(s)`` if the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(s)
    except (ValueError, TypeError):
        # ValueError: text that is not a number (e.g. an empty string).
        # TypeError: values float() cannot handle at all, such as None.
        return np.nan

# The four pay columns are parsed through convert_to_float, so entries that
# cannot be read as numbers become NaN; 'Status' is read as a string column.
pay_columns = ['BasePay', 'OvertimePay', 'OtherPay', 'Benefits']
df = pd.read_csv(
    "./data/SF-Salaries/Salaries.csv.bz2",
    converters={column: convert_to_float for column in pay_columns},
    dtype={'Status': str},
)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [31]:
# Smallest base pay, followed by the five rows with the lowest BasePay.
rows_by_base_pay_ascending = df.sort_values("BasePay")
print(df["BasePay"].min())

print(rows_by_base_pay_ascending.head())

-166.01
          Id     EmployeeName      JobTitle  BasePay  OvertimePay  OtherPay  \
72832  72833   Irwin Sidharta  Junior Clerk  -166.01       249.02       0.0   
72865  72866     Robert Scott  Junior Clerk  -121.63       182.70       0.0   
72872  72873  Chung Huey Kung  Junior Clerk  -109.22       163.83       0.0   
72874  72875        Jordan Li  Junior Clerk  -106.60       159.90       0.0   
72878  72879  Richard Jackson  Junior Clerk  -101.88       153.08       0.0   

       Benefits  TotalPay  TotalPayBenefits  Year  Notes         Agency Status  
72832      6.56     83.01             89.57  2012    NaN  San Francisco    NaN  
72865      5.44     61.07             66.51  2012    NaN  San Francisco    NaN  
72872      4.32     54.61             58.93  2012    NaN  San Francisco    NaN  
72874      4.66     53.30             57.96  2012    NaN  San Francisco    NaN  
72878      4.55     51.20             55.75  2012    NaN  San Francisco    NaN  


In [32]:
# Largest base pay, followed by the five rows with the highest BasePay.
rows_by_base_pay_descending = df.sort_values("BasePay", ascending=False)
print(df["BasePay"].max())
print(rows_by_base_pay_descending.head())

319275.01
            Id          EmployeeName                      JobTitle    BasePay  \
72925    72926        Gregory P Suhr               Chief of Police  319275.01   
110532  110533            Amy P Hart             Asst Med Examiner  318835.49   
72929    72930         Robert L Shaw  Dep Dir for Investments, Ret  315572.01   
72926    72927  Joanne M Hayes-White        Chief, Fire Department  313686.01   
72931    72932     Harlan L Kelly-Jr   Executive Contract Employee  313312.52   

        OvertimePay  OtherPay  Benefits   TotalPay  TotalPayBenefits  Year  \
72925          0.00  20007.06  86533.21  339282.07         425815.28  2013   
110532     10712.95  60563.54  89540.23  390111.98         479652.21  2014   
72929          0.00      0.00  82849.66  315572.01         398421.67  2013   
72926          0.00  23236.00  85431.39  336922.01         422353.40  2013   
72931          0.00      0.00  82319.51  313312.52         395632.03  2013   

        Notes         Agency Statu

In [33]:
# Arithmetic mean and median of the base pay.
base_pay = df["BasePay"]
print(base_pay.mean())
print(base_pay.median())

66325.4488404877
65007.45


In [34]:
# Four identical salaries plus one extreme outlier: the outlier pulls the
# mean far upwards, while the median stays at the typical value.
salaries_with_outlier = [50000, 50000, 50000, 50000, 1000000]
print(np.mean(salaries_with_outlier))

print(np.median(salaries_with_outlier))

240000.0
50000.0


In [35]:
# Symmetric values without an outlier: mean and median coincide.
symmetric_salaries = [0, 50000, 100000]
print(np.mean(symmetric_salaries))
print(np.median(symmetric_salaries))


50000.0
50000.0


In [36]:
# Quantiles of the base pay, printed in this order:
#   0    -> minimum
#   1    -> maximum
#   0.25 -> 1st quartile
#   0.5  -> 2nd quartile (the median)
#   0.75 -> 3rd quartile
#   0.9  -> 90% quantile
base_pay = df["BasePay"]
for q in (0, 1, 0.25, 0.5, 0.75, 0.9):
    print(base_pay.quantile(q))

-166.01
319275.01
33588.2
65007.45
94691.05
121036.44600000007


In [37]:
base_pay = df["BasePay"]

# Summary statistics (count, mean, std, min, quartiles, max).
print(base_pay.describe())
# Variance; pandas divides by n - 1 by default (ddof=1).
print(base_pay.var())
# Standard deviation = square root of the variance.
print(base_pay.std())

count    148045.000000
mean      66325.448840
std       42764.635495
min        -166.010000
25%       33588.200000
50%       65007.450000
75%       94691.050000
max      319275.010000
Name: BasePay, dtype: float64
1828814049.0424156
42764.63549525958


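## Varianz
$$
s^2 = \frac{1}{n - 1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$

Hinweis: Dies ist die *Stichprobenvarianz* (Division durch $n - 1$), wie sie pandas mit `.var()` standardmäßig berechnet (`ddof=1`). `np.var()` und `np.std()` teilen dagegen standardmäßig durch $n$ (`ddof=0`, Populationsvarianz); mit `np.var(x, ddof=1)` erhält man die obige Formel.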

In [38]:
# Mean of the base pay (x-bar in the formula above).
x_strich = df["BasePay"].mean()

# Mean of the squared deviations. np.mean divides by n, whereas the formula
# above and pandas' .var() divide by n - 1. The slightly smaller value
# compared with df["BasePay"].var() therefore comes from the factor
# (n - 1) / n, not from rounding errors.
np.mean((df["BasePay"] - x_strich) ** 2)

np.float64(1828801695.946681)

In [39]:
# Variance of identical values: there is no spread, so the result is 0.
constant_values = [10, 10, 10, 10, 10]
np.var(constant_values)

np.float64(0.0)

In [40]:
# Same mean (10) as the constant example, but the values are spread out.
# np.var divides by n by default (ddof=0): (100 + 100 + 0 + 100 + 100) / 5.
spread_values = [0, 0, 10, 20, 20]
np.var(spread_values)

np.float64(80.0)

In [41]:
# The standard deviation is the square root of the variance;
# for identical values it is therefore 0 as well.
constant_values = [10, 10, 10, 10, 10]
np.std(constant_values)

np.float64(0.0)

In [42]:
# Standard deviation of the spread-out values: the square root of their
# variance, using NumPy's default divisor n (ddof=0).
spread_values = [0, 0, 10, 20, 20]
np.std(spread_values)

np.float64(8.94427190999916)