## Computing Descriptive Statistics in Python Scipy library

# Using Python Scipy library to compute various measures of center of the data

In [4]:
import pandas as pd
import numpy as np
import scipy.stats as sp
from scipy.stats import trim_mean,kurtosis
from scipy.stats.mstats import gmean,hmean

In [5]:
# Read the infant-mortality / GNI-per-capita CSV, decoding it as Latin-1.
df6 = pd.read_csv(
    "C:\\Users\\HP\\Downloads\\InfantMortalityGNI.csv",
    encoding='latin-1',
)

In [6]:
# Preview the first rows to confirm the columns loaded as expected.
df6.head()

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


In [7]:
# Show column dtypes and non-null counts (reveals which columns have missing values).
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 3 columns):
Countries           197 non-null object
GNI_PER_CAPITA      185 non-null float64
Infant_Mortality    193 non-null float64
dtypes: float64(2), object(1)
memory usage: 4.7+ KB


# The mean function is deprecated in SciPy; the recommendation is to use numpy.mean instead.
### Computing the geometric mean
#### The geometric mean is used for data that combine multiplicatively, such as rates; it is used in population growth estimates and economics. When each $x_i$ is a growth factor (1 + rate), the average rate is $$\sqrt[n]{{x}_{1}\cdot {x}_{2}\cdot {x}_{3}\cdots {x}_{n}}\,-\,1$$
#### The monthly rate of return of Alpha Business for the last three months has been 11.5, 2.7 and 4.8 percent. What is the average rate of return?

In [9]:
# Each return is expressed as a growth factor (1 + rate): 11.5% -> 1.115, etc.
GeoMean = gmean(
    [1.115, 1.027, 1.048]
)
print(
    "The geomatric mean value is: ",
    round(GeoMean, 3),
)

The geomatric mean value is:  1.063


In [10]:
# Turn the mean growth factor back into a percentage rate of return.
Average_Rate_Return = 100 * (GeoMean - 1)
print(
    "THe average rate of return is: ",
    round(Average_Rate_Return, 1),
)

THe average rate of return is:  6.3


## Computing the harmonic mean

###  $$\frac{n}{\frac{1}{{{x}_{1}}}+\frac{1}{{{x}_{2}}}+\frac{1}{{{x}_{3}}}+....+\frac{1}{{{x}_{n}}}}$$

## In three separate downloads of a 600 Kb file,
    the average download speeds were measured as follows: 
    4.5 Kb per second, 4.1 Kb per second and 4.9 Kb per second. 
    Calculate the average rate of download.

In [11]:
# Measured download speeds for the three downloads.
Rates = [
    4.5,
    4.1,
    4.9,
]

In [12]:
# Plain arithmetic mean of the rates, kept for comparison with the harmonic mean.
Meannp = np.mean(Rates)

In [13]:
# Display the arithmetic mean of the rates.
Meannp

4.5

In [14]:
# Harmonic mean of the measured download rates.
HarmonicRate = hmean(Rates)

In [15]:
# Report the harmonic mean rounded to two decimals.
print(
    "The harmonic rate is: ",
    round(HarmonicRate, 2),
)

The harmonic rate is:  4.48


In [17]:
# Re-display the first rows before computing skewness and kurtosis on this frame.
df6.head()

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


In [19]:
# Skewness of infant mortality; nan_policy="omit" ignores the missing entries.
skewness = sp.skew(
    df6['Infant_Mortality'],
    nan_policy="omit",
)

In [20]:
# Report the skewness coefficient (positive means a longer right tail).
print(
    "The skewness coeff is: ",
    skewness,
)

The skewness coeff is:  1.2925985426500972


In [23]:
"""If fisher=True (the default), 3 is subtracted from the kurtosis, so a normal distribution gives 0.0."""
# nan_policy='omit' ignores the missing Infant_Mortality entries.
Kurtosis= sp.kurtosis(df6['Infant_Mortality'],fisher=True, nan_policy='omit')
print("The kurtosis coefficient is: ",round(Kurtosis,2))

The kurtosis coefficient is:  0.89


## Computing the Z score using Python Scipy library

In [24]:
import pandas as pd
import numpy as np
import scipy.stats as sp
from scipy.stats import zscore 

In [25]:
# Raise the row display limit so frames of up to 999 rows are shown in full.
pd.set_option('display.max_rows', 999)

In [32]:
# Read the Corruption Perception Index CSV, decoding it as Latin-1.
cpi_scores = pd.read_csv(
    "C:\\Users\\HP\\Downloads\\CorruptionPerceptionIndex.csv",
    encoding='latin-1',
)

In [33]:
# Preview the first rows of the CPI table.
cpi_scores.head()

Unnamed: 0,Country,CPI2016
0,New Zealand,90
1,Denmark,90
2,Finland,89
3,Sweden,88
4,Switzerland,86


In [34]:
# Show column dtypes and non-null counts for the CPI table.
cpi_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 2 columns):
Country    176 non-null object
CPI2016    176 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.8+ KB


In [35]:
# Add a column of standardised CPI scores, rounded to two decimals.
cpi_scores['zscores'] = (
    sp.zscore(cpi_scores['CPI2016'])
    .round(2)
)

In [36]:
# Display the whole frame, including the new zscores column.
# NOTE(review): this renders every row because display.max_rows was raised to 999;
# consider cpi_scores.head(10) if only a sample is needed.
cpi_scores

Unnamed: 0,Country,CPI2016,zscores
0,New Zealand,90,2.43
1,Denmark,90,2.43
2,Finland,89,2.38
3,Sweden,88,2.32
4,Switzerland,86,2.22
5,Norway,85,2.17
6,Singapore,84,2.12
7,Netherlands,83,2.07
8,Canada,82,2.01
9,Germany,81,1.96


## Computing percentiles of scores and IQR using Python Scipy library

In [37]:
import pandas as pd
import numpy as np
import scipy.stats as sp

In [38]:
# Read the infant-mortality / GNI-per-capita CSV for the percentile and IQR examples.
df8 = pd.read_csv(
    "C:\\Users\\HP\\Downloads\\InfantMortalityGNI.csv",
    encoding='latin-1',
)

In [39]:
# Preview the first rows of the loaded table.
df8.head()

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


## Always drop missing values from the data first, or use the option a function provides for omitting them (e.g. `nan_policy='omit'`)

## Using percentileofscore 
## kind : {‘rank’, ‘weak’, ‘strict’, ‘mean’}, optional
  1. “rank”: Average percentage ranking of score. 
  2. “weak”: This kind corresponds to the definition of a cumulative distribution function.
     A percentileofscore of 80% means that 80% of values are less than or equal to the provided score.
  3. “strict”: Similar to “weak”, except that only values that are strictly less than the given score are counted.
  4. “mean”: The average of the “weak” and “strict” scores, often used in testing.

In [40]:
# Percentage of non-missing infant-mortality values that are <= 32 ("weak" kind).
Value = 32
x = df8['Infant_Mortality'].dropna()
Percentile_value = sp.percentileofscore(
    a=x,
    score=Value,
    kind='weak',
)
print("The percentile value is", Percentile_value)

The percentile value is 64.7668393782


## Computing Percentile

#### This function will be deprecated in the near future; use numpy.percentile instead

In [42]:
# 75th percentile (third quartile) of the non-missing infant-mortality values.
sp.scoreatpercentile(
    df8['Infant_Mortality'].dropna(),
    per=75,
)

49.5

In [43]:
# Same 75th percentile via NumPy; nanpercentile skips NaN so no dropna() is needed.
numpy7thp = np.nanpercentile(
    df8['Infant_Mortality'],
    q=75,
)

In [44]:
# Display the NumPy 75th-percentile value for comparison with the SciPy result.
numpy7thp

49.5

## Scipy IQR

In [46]:
# Interquartile range of infant mortality, ignoring missing values.
IQR_Infant = sp.iqr(
    df8['Infant_Mortality'],
    nan_policy='omit',
)

In [47]:
# Report the infant-mortality IQR.
print(
    "The IQR for infant mortality is:",
    IQR_Infant,
)

The IQR for infant mortality is: 41.3


In [48]:
# Interquartile range of GNI per capita, ignoring missing values.
IQR_GNI_PERCAPITA = sp.iqr(
    df8['GNI_PER_CAPITA'],
    nan_policy='omit',
)

In [49]:
# Report the GNI-per-capita IQR.
print(
    "The IQR for GNI per Capita is:",
    IQR_GNI_PERCAPITA,
)

The IQR for GNI per Capita is: 13310.0


## Computing trimmed statistics using Python 3 scipy statistics library

In [50]:
import pandas as pd
import numpy as np
import scipy.stats as sp
from scipy.stats import trim_mean

## The mean function is deprecated in SciPy; the recommendation is to use numpy.mean instead

In [51]:
# Sample scores; 10 is a low outlier relative to the rest.
Scores = [
    10, 80, 85,
    90, 95, 65,
]

In [52]:
# Untrimmed arithmetic mean of the scores, as a baseline for the trimmed statistics.
Meannp = np.mean(Scores)

In [53]:
# Display the untrimmed mean of the scores.
Meannp

70.833333333333329

## trimmed statistics series

### tmean(a[,limits,inclusive,axis])
### tvar(a[,limits,inclusive,axis,ddof])
### tmin(a[,lowerlimit,axis,inclusive,...])
### tmax(a[,upperlimit,axis,inclusive,...])
### tstd(a[,limits,inclusive,axis,ddof])

## Using the functions without additional arguments

In [55]:
# No limits given, so nothing is trimmed: the result equals the ordinary mean.
sp.tmean(Scores)

70.833333333333329

In [56]:
# No limits given: variance over all scores (the value shown is the ddof=1 sample variance).
sp.tvar(Scores)

994.16666666666663

In [57]:
# No limits given: standard deviation over all scores (square root of the tvar result).
sp.tstd(Scores)

31.530408602913262

In [58]:
# No lower limit given: the plain minimum of the scores.
sp.tmin(Scores)

10

In [59]:
# No upper limit given: the plain maximum of the scores.
sp.tmax(Scores)

95

## Using the functions with arguments

In [60]:
# Mean of only the scores that fall inside the range 20..80 (bounds included).
sp.tmean(Scores, limits=(20, 80))

72.5

In [61]:
# Sample standard deviation (ddof=1) of only the scores inside the range 20..80.
sp.tstd(Scores, limits=(20, 80), ddof=1)

10.606601717798213

In [63]:
# Smallest score that is at least 30 (values below the lower limit are ignored).
sp.tmin(Scores, lowerlimit=30)

65

In [64]:
# Largest score that is at most 90 (values above the upper limit are ignored).
sp.tmax(Scores, upperlimit=90)

90

## Trimmed mean using scipy stats

### scipy.stats.trim_mean(a, proportiontocut, axis=0)

### Return mean of array after trimming distribution from both tails.

### If proportiontocut = 0.01, slices off ‘leftmost’ and ‘rightmost’ 1% of scores. The input is sorted before slicing.

In [65]:
# Read the infant-mortality / GNI-per-capita CSV for the trimmed-mean example.
df9 = pd.read_csv(
    "C:\\Users\\HP\\Downloads\\InfantMortalityGNI.csv",
    encoding='latin-1',
)

In [66]:
# Preview the first rows of the loaded table.
df9.head()

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


In [67]:
# Mean of infant mortality after cutting 1% of the sorted values from each tail.
# Missing values are dropped first.
Trimmed_Mean1pct = sp.trim_mean(
    df9['Infant_Mortality'].dropna(),
    proportiontocut=0.01,
)

In [68]:
# Report the 1% trimmed mean.
print(
    "The 1% trimmed mean of infant mortality rate is: ",
    Trimmed_Mean1pct,
)

The 1% trimmed mean of infant mortality rate is:  31.0298429319


In [69]:
# Non-missing infant-mortality values. Despite the "df" name this is a single
# column (a Series), not a DataFrame.
df10 = df9['Infant_Mortality'].dropna()

In [71]:
# Same 1% trimmed mean, computed from the pre-cleaned column.
sp.trim_mean(df10, proportiontocut=0.01)

31.029842931937171