# Basic Statistics

- Random data generation
- Summary and descriptive statistics
- Sample covariance and correlation
- Cross tabulation
- Frequent items
- Visualization

In [None]:
# import libraries
from pyspark.sql.functions import rand, randn
from pyspark.sql import functions as F

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# import libraries
from pyspark.sql.functions import mean, min, max
from pyspark.sql.functions import struct

#### Random Data Generation
We start with some simple methods for generating columns that contain independent and identically distributed (i.i.d.) values drawn from a distribution, e.g., uniform (`rand`) and standard normal (`randn`).

In [None]:
# Create a DataFrame with one int column and 10 rows.
# The column produced by range() is the "id" column used by the later cells.
df = sqlContext.range(0, 10)
df.show()

In [None]:
# Generate two other columns using uniform distribution and normal distribution.
# Explicit seeds are passed to rand/randn so the generated values are repeatable.
# NOTE: the selection is only displayed here; it is not assigned back to `df`.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

In [None]:
# Print the schema of `df` to check which columns it actually holds.
df.printSchema()

In [None]:
# Try to plot a bar graph of the data: id column on the x-axis and the
# uniform column on the y-axis.
# At this point `df` has not been reassigned since it was created, so the
# pandas copy has no "uniform" column and the plot call fails. The failure is
# the point of this cell, so it is caught and reported instead of being left
# as an unhandled exception (which would stop "Run All" at this cell).
plt.clf()
pdDF = df.toPandas()
try:
    pdDF.plot(x='id', y='uniform', kind='bar', rot=45)
except KeyError as err:
    print('Plot failed - column not found in the DataFrame: %s' % err)
else:
    # Same display call as the other plotting cells in this notebook.
    plt.show()

### What went wrong?

In [None]:
# check the table we tried to plot
# (a bare expression on the last line of a cell is rendered as the cell output)
pdDF

We see that the original dataframe consists of just one column: `id`.

But what we are interested in is the columns generated from a uniform distribution and a normal distribution.

In [None]:
# Rebind `df` so that it includes the two generated columns. The earlier
# select(...) call only displayed them; its result was never assigned.
df = df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))

In [None]:
# Change spark dataframe to pandas dataframe
# (converted again because `df` now carries the "uniform" and "normal" columns)
pdDF = df.toPandas()

In [None]:
# plot bar graph of data; id column on x-axis and uniform column on y-axis
plt.clf()  # clear the current figure before plotting
pdDF.plot(x='id', y='uniform', kind='bar', rot=45)  # rot=45 rotates the tick labels
plt.show()

In [None]:
# plot histogram of data; frequency of the values in the uniform column
plt.clf()  # clear the current figure before plotting
pdDF.plot(x='id', y='uniform', kind='hist')
plt.show()

A histogram shows the frequency of data items in successive numerical intervals of equal size.
The values being counted (here the `uniform` values, i.e. the y-values in the bar graph above) are plotted along the horizontal axis, and the frequency of occurrences in each interval is plotted along the vertical axis.

## Understanding the Data: Descriptive Statistics
The first operation to perform after importing data is to get some sense of what the data looks like. The function <b>describe</b> returns information such as number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.

In [None]:
# Display the rows of the DataFrame that is about to be summarised.
df.show()

In [None]:
# list the column names (shown as the cell output)
df.columns

In [None]:
# describe() returns a new DataFrame of summary statistics. Evaluated as a
# bare expression, the cell shows that DataFrame's column names and types,
# not its rows; call .show() on it to see the values.
df.describe()

# same, restricted to specific columns
# display(df.describe('uniform', 'normal'))

Compute the count, mean, standard deviation, min and max values from a DataFrame: use the describe() method

In [None]:
# overview of data frame: print the summary statistics for every column
df.describe().show()

# overview of specific columns only
# df.describe('uniform', 'normal').show()

#### Quantile
We will find the mean, the median and selected quantiles (10th, 50th and 90th percentiles) of the uniform column.

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame for the pandas-based
# statistics below.
df_pandas = df.toPandas()

In [None]:
# Summary statistics of the "uniform" column, collected into one Series so
# that every value is displayed (a cell only renders its last expression, so
# separate bare expressions would silently drop all but the final one).
uniform_col = df_pandas.uniform   # same as df_pandas['uniform']

# quantile(q) returns the q-th quantile, where q is a fraction between 0 and 1.
pd.Series(
    [
        uniform_col.mean(),
        uniform_col.median(),
        uniform_col.quantile(0.1),  # 10th percentile
        uniform_col.quantile(0.5),  # same as the median
        uniform_col.quantile(0.9),  # 90th percentile
    ],
    index=['mean', 'median', 'quantile 0.1', 'quantile 0.5', 'quantile 0.9'],
    name='uniform',
)

### Sample covariance and correlation

In [None]:
# Show the current data as a pandas DataFrame (rendered as the cell output).
df.toPandas()

In [None]:
# Sample covariance between the two generated columns.
'covariance = ' + str(df.stat.cov('uniform', 'normal'))

In [None]:
# Covariance of a column with itself, i.e. the variance of the "id" column.
'covariance = ' + str(df.stat.cov('id', 'id'))

Correlation is a normalized measure of covariance that is easier to understand, as it provides quantitative measurements of the statistical dependence between two random variables.

In [None]:
# Correlation between the two generated columns.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
print('correlation = ' + str(df.stat.corr('uniform', 'normal')))

# Correlation of a column with itself; serves as a sanity check.
print('correlation = ' + str(df.stat.corr('id', 'id')))

### Cross Tabulation (Contingency Table)
Cross Tabulation provides a table of the frequency distribution for a set of variables. It is used to observe the statistical significance (or independence) of variables.

In [None]:
# Build a two-column (name, item) DataFrame of 100 rows by cycling through
# the names and the items independently.
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]

rows = []
for i in range(100):
    rows.append((names[i % len(names)], items[i % len(items)]))

df = sqlContext.createDataFrame(rows, ["name", "item"])

# Preview the first 10 rows.
df.show(10)

In [None]:
# Contingency table: one row per name, one column per item, cells hold counts.
df.stat.crosstab("name", "item").show()

### Frequent Items
Figuring out which items are frequent in each column can be very useful to understand a dataset.

In [None]:
# Let's find the items that show up at least 40% of the time in the "item" column
# (only that column is passed; 0.4 is the minimum-frequency threshold).
# show(1, False): print one row without truncating the cell contents.
df.stat.freqItems(["item"], 0.4).show(1, False)

# Plot graphs with matplotlib

In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Basic Plotting

In [None]:
# linear graph
plt.clf()  # must be called: a bare `plt.clf` only references the function
plt.plot([1, 2, 3, 4])  # only y-values are given
plt.ylabel('some numbers')
plt.show()

In [None]:
# Draw the same four points twice on one set of axes: first as a default
# line, then as red circle markers ('ro').
xs = [1, 2, 3, 4]
ys = [1, 4, 9, 16]

plt.plot(xs, ys)
plt.plot(xs, ys, 'ro')
plt.axis([0, 6, 0, 20])  # [xmin, xmax, ymin, ymax]
plt.show()

### Working with multiple figures and axes

In [None]:
# Sample points and the two curves: a damped and an undamped cosine.
x1 = np.linspace(0.0, 5.0)
x2 = np.linspace(0.0, 2.0)
y1 = np.cos(2 * np.pi * x1) * np.exp(-x1)
y2 = np.cos(2 * np.pi * x2)

# One figure with two stacked axes (2 rows, 1 column).
fig, (ax_top, ax_bottom) = plt.subplots(2, 1)

# Top panel: damped oscillation.
ax_top.plot(x1, y1, 'o-')
ax_top.set_title('A tale of 2 subplots')
ax_top.set_ylabel('Damped oscillation')

# Bottom panel: undamped oscillation.
ax_bottom.plot(x2, y2, '.-')
ax_bottom.set_xlabel('time (s)')
ax_bottom.set_ylabel('Undamped')

plt.show()

In [None]:
# Data for plotting: one sine wave shifted up by 1, sampled on [0, 2).
t = np.arange(0.0, 2.0, 0.01)
s = 1 + np.sin(2 * np.pi * t)

fig, ax = plt.subplots()
ax.plot(t, s)

ax.set(xlabel='time (s)', ylabel='voltage (mV)',
       title='About as simple as it gets, folks')
ax.grid()

# Mark the maximum at t = 1.25 (s = 2). The label is placed inside the plotted
# x-range (0..2) so that it stays within the axes and is not cut off in the
# saved image.
ax.annotate('local max', xy=(1.25, 2), xytext=(1.5, 1.5),
            arrowprops=dict(facecolor='black', shrink=0.05),
            )

# Side effect: writes the figure to test.png in the working directory.
fig.savefig("test.png")
plt.show()

In [None]:
# Let's create pre-defined labels and a modified legend
# `a` and `b` are bound to the same array; only `a` is used below in this cell.
a = b = np.arange(0, 3, .02)
c = np.exp(a)
d = c[::-1]  # `c` in reverse order

# Create plots with pre-defined labels.
# The third positional argument is the line format: 'k--' black dashed,
# 'k:' black dotted, 'k' black solid.
fig, ax = plt.subplots()
ax.plot(a, c, 'k--', label='Model length')
ax.plot(a, d, 'k:', label='Data length')
ax.plot(a, c + d, 'k', label='Total message length')

# The legend entries come from the label= arguments above.
legend = ax.legend(loc='upper center', shadow=True, fontsize='x-large')

# Put a nicer background color on the legend.
# 'C0' refers to the first color of the active color cycle.
legend.get_frame().set_facecolor('C0')

plt.show()

### Bar graphs

In [None]:
# bar plot with multiple data sets
plt.clf()  # must be called: a bare `plt.clf` only references the function
# 10 rows x 4 columns of random values; each column becomes one bar series.
# No seed is set, so the values differ on every run.
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot.bar()
plt.show()

### Histogram

Demo of the histogram (hist) function with a few features such as setting the number of data bins.

In [None]:
# 1000 draws from a standard normal distribution (unseeded, so the sample
# differs on every run).
x = np.random.normal(size=1000)
# density=True normalises the bar heights so the histogram integrates to 1.
# It replaces the deprecated `normed` keyword, which newer matplotlib
# releases no longer accept.
plt.hist(x, density=True, bins=30)
plt.ylabel('Probability')
plt.xlabel('Numbers from a normal distribution')
plt.show()

### Pie chart

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]  # one value per label, in the same order
explode = (0, 0.1, 0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
# autopct formats the percentage label on each slice with one decimal place;
# startangle=90 rotates the start of the first slice by 90 degrees.
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

# Some fun Jupyter commands to start with

In [None]:
# %magic shows the documentation of the IPython magic-command system.
%magic
# %lsmagic lists the line and cell magics that are currently available.
%lsmagic

### Try out the magic cells

In [None]:
%%latex
\begin{equation}
   E = mc^2
\end{equation}

In [None]:
%%html
<HTML>
    <HEAD>
        <TITLE>Your Title Here</TITLE>
    </HEAD>
<BODY BGCOLOR="FFFFFF">
    <HR>
        <a href="http://somegreatsite.com">Link Name</a> is a link to another nifty site
        <H1>This is a Header</H1>
        <H2>This is a Medium Header</H2>
        Send me a mail at <a href="mailto:anna.baecklund@swedbank.se">
        anna.baecklund@swedbank.se</a>.
        <P> This is a new paragraph!
        <P> <B>This is a new paragraph!</B>
        <BR> <B><I>This is a new sentence without a paragraph break, in bold italics.</I></B>
    <HR>
</BODY>
</HTML>

### Function example

In [None]:
import random

# Inclusive bounds of a six-sided die. They are deliberately not called
# `min`/`max`, so the built-in functions (and the pyspark functions of the
# same name imported earlier in this notebook) are not overwritten.
DIE_MIN = 1
DIE_MAX = 6

# raw_input() exists only on Python 2; fall back to input() on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

roll_again = "yes"

# Roll two dice per round and keep going while the user answers "yes" or "y".
while roll_again == "yes" or roll_again == "y":
    # print(...) with a single argument works on both Python 2 and Python 3.
    print("Rolling the dices...")
    print("The values are....")
    print(random.randint(DIE_MIN, DIE_MAX))
    print(random.randint(DIE_MIN, DIE_MAX))

    roll_again = read_line("Roll the dices again? ")