In [35]:
import os
import sys

# Root of the local Spark 1.6.2 distribution.
# NOTE(review): this is a machine-specific absolute path; adjust it for your install.
spark_path = "C:/stack/spark-1.6.2-bin-hadoop2.6"

# Point both SPARK_HOME and HADOOP_HOME at the Spark directory.
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Make the bundled PySpark and Py4J sources importable. The membership check
# keeps sys.path from accumulating duplicate entries when this cell is re-run.
for _suffix in (
    "/bin",
    "/python",
    "/python/pyspark/",
    "/python/lib",
    "/python/lib/pyspark.zip",
    "/python/lib/py4j-0.9-src.zip",
):
    _entry = spark_path + _suffix
    if _entry not in sys.path:
        sys.path.append(_entry)

# These imports must follow the sys.path setup above.
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# getOrCreate reuses an already-running context, so re-executing this cell
# no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local"))
sqlCtx = SQLContext(sc)

In [36]:
# Sanity check that both contexts exist. Only the last bare expression in the
# cell is rendered, so the output below is the SQLContext repr; `sc` on its
# own line produces no visible output.
sc
sqlCtx

<pyspark.sql.context.SQLContext at 0x19f682a3ba8>

In [38]:
from pyspark.sql.functions import rand, randn

# Single-column DataFrame ("id") holding the integers 0 through 9.
df = sqlCtx.range(start=0, end=10)

# Print the rows, then leave the row count as the cell's displayed value.
df.show()
df.count()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



10

In [12]:
# Show each id next to two seeded random columns (rand -> "uniform",
# randn -> "normal"). Nothing is assigned back to df here.
df.select(
    "id",
    rand(seed=10).alias("uniform"),
    randn(seed=27).alias("normal"),
).show()

+---+-------------------+-------------------+
| id|            uniform|             normal|
+---+-------------------+-------------------+
|  0|0.41371264720975787| 0.5888539012978773|
|  1| 0.7311719281896606| 0.8645537008427937|
|  2| 0.9031701155118229| 1.2524569684217643|
|  3|0.09430205113458567| -2.573636861034734|
|  4|0.38340505276222947| 0.5469737451926588|
|  5| 0.5569246135523511|0.17431283601478723|
|  6| 0.4977441406613893|-0.7040284633147095|
|  7| 0.2076666106201438| 0.4637547571868822|
|  8| 0.9571919406508957|  0.920722532496133|
|  9| 0.7429395461204413|-1.4353459012380192|
+---+-------------------+-------------------+



In [39]:
# Rebuild df as ids 0..9 plus two seeded random columns; the cells that
# follow read 'uniform' and 'normal' from this frame.
df = (
    sqlCtx.range(0, 10)
    .withColumn('uniform', rand(seed=10))
    .withColumn('normal', randn(seed=27))
)
df.show()

+---+-------------------+-------------------+
| id|            uniform|             normal|
+---+-------------------+-------------------+
|  0|0.41371264720975787| 0.5888539012978773|
|  1| 0.7311719281896606| 0.8645537008427937|
|  2| 0.9031701155118229| 1.2524569684217643|
|  3|0.09430205113458567| -2.573636861034734|
|  4|0.38340505276222947| 0.5469737451926588|
|  5| 0.5569246135523511|0.17431283601478723|
|  6| 0.4977441406613893|-0.7040284633147095|
|  7| 0.2076666106201438| 0.4637547571868822|
|  8| 0.9571919406508957|  0.920722532496133|
|  9| 0.7429395461204413|-1.4353459012380192|
+---+-------------------+-------------------+



In [40]:
# Summary statistics (count, mean, stddev, min, max) for the two random columns.
described = df.describe('uniform', 'normal')
described.show()

+-------+-------------------+--------------------+
|summary|            uniform|              normal|
+-------+-------------------+--------------------+
|  count|                 10|                  10|
|   mean| 0.5488228646413278|0.009861721586543392|
| stddev| 0.2856822245344392|  1.2126061129356596|
|    min|0.09430205113458567|  -2.573636861034734|
|    max| 0.9571919406508957|  1.2524569684217643|
+-------+-------------------+--------------------+



In [41]:
# Import the functions module under an alias instead of pulling `min` and
# `max` into the notebook namespace, where they would shadow the Python
# built-ins for every cell executed afterwards.
from pyspark.sql import functions as F

# Collapse the 'uniform' column to a single row holding its mean, min and max.
df.select([F.mean('uniform'), F.min('uniform'), F.max('uniform')]).show()

# Project plain columns by name.
# NOTE(review): 'uniform' is listed twice, so the result carries two columns
# with the same name -- confirm this is intended.
df.select(['uniform', 'uniform', 'normal']).show()

+------------------+-------------------+------------------+
|      avg(uniform)|       min(uniform)|      max(uniform)|
+------------------+-------------------+------------------+
|0.5488228646413278|0.09430205113458567|0.9571919406508957|
+------------------+-------------------+------------------+

+-------------------+-------------------+-------------------+
|            uniform|            uniform|             normal|
+-------------------+-------------------+-------------------+
|0.41371264720975787|0.41371264720975787| 0.5888539012978773|
| 0.7311719281896606| 0.7311719281896606| 0.8645537008427937|
| 0.9031701155118229| 0.9031701155118229| 1.2524569684217643|
|0.09430205113458567|0.09430205113458567| -2.573636861034734|
|0.38340505276222947|0.38340505276222947| 0.5469737451926588|
| 0.5569246135523511| 0.5569246135523511|0.17431283601478723|
| 0.4977441406613893| 0.4977441406613893|-0.7040284633147095|
| 0.2076666106201438| 0.2076666106201438| 0.4637547571868822|
| 0.9571919406508

In [26]:
from pyspark.sql.functions import rand

# ids 0..9 with two random columns generated from different seeds; used by
# the covariance / correlation cells.
df = (
    sqlCtx.range(0, 10)
    .withColumn('rand1', rand(seed=10))
    .withColumn('rand2', rand(seed=27))
)

In [27]:
# Covariance between the two differently seeded random columns.
df.stat.cov(col1='rand1', col2='rand2')

-0.00834685407056579

In [28]:
# Covariance of a column with itself, i.e. the variance of 'id'.
df.stat.cov(col1='id', col2='id')

9.166666666666666

In [29]:
# Correlation coefficient between the two random columns.
df.stat.corr(col1='rand1', col2='rand2')

-0.1099396246708271

In [42]:
# Correlation of a column with itself.
df.stat.corr(col1='id', col2='id')

1.0

In [46]:
# Build a 100-row DataFrame of (name, item) pairs by cycling through both lists.
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
pairs = [
    (names[k % len(names)], items[k % len(items)])
    for k in range(100)
]
df = sqlCtx.createDataFrame(pairs, schema=["name", "item"])

# Preview the first ten rows, then print the name-by-item contingency table.
df.show(10)
df.stat.crosstab(col1="name", col2="item").show()

+-----+-------+
| name|   item|
+-----+-------+
|Alice|   milk|
|  Bob|  bread|
| Mike| butter|
|Alice| apples|
|  Bob|oranges|
| Mike|   milk|
|Alice|  bread|
|  Bob| butter|
| Mike| apples|
|Alice|oranges|
+-----+-------+
only showing top 10 rows

+---------+------+-------+------+----+-----+
|name_item|apples|oranges|butter|milk|bread|
+---------+------+-------+------+----+-----+
|      Bob|     6|      7|     7|   6|    7|
|     Mike|     7|      6|     7|   7|    6|
|    Alice|     7|      7|     6|   7|    7|
+---------+------+-------+------+----+-----+



In [53]:
# Every even-indexed row is the fixed triple (1, 2, 3); odd-indexed rows vary
# with the index, so the fixed values dominate each column.
rows = [
    (1, 2, 3) if k % 2 == 0 else (k, 2 * k, k % 4)
    for k in range(100)
]
df = sqlCtx.createDataFrame(rows, ["a", "b", "c"])
df.show(10)

# Frequent items per column, using a support threshold of 0.4.
freq = df.stat.freqItems(["a", "b", "c"], support=0.4)
freq.collect()[0]

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
|  5| 10|  1|
|  1|  2|  3|
|  7| 14|  3|
|  1|  2|  3|
|  9| 18|  1|
+---+---+---+
only showing top 10 rows



Row(a_freqItems=[1, 99], b_freqItems=[2, 198], c_freqItems=[1, 3])

In [56]:
from pyspark.sql.functions import struct

# Pack columns 'a' and 'b' into one struct column so that frequent-item
# detection runs on the (a, b) combination rather than on each column alone.
df_with_pair = df.withColumn('ab', struct('a', 'b'))
freq = df_with_pair.stat.freqItems(['ab'], support=0.4)
freq.show()

freq.collect()[0]

+-----------------+
|     ab_freqItems|
+-----------------+
|[[99,198], [1,2]]|
+-----------------+



In [59]:
# Use an aliased module import rather than `import *`: the wildcard form
# rebinds built-in names such as pow, min, max, sum, abs and round to Spark
# column functions for every cell executed afterwards.
from pyspark.sql import functions as F

# ids 0..9 with a seeded random column scaled by 3.14.
df = sqlCtx.range(0, 10).withColumn('uniform', F.rand(seed=10) * 3.14)

# you can reference a column or supply the column name
df.select(
    'uniform',
    F.toDegrees('uniform'),
    (F.pow(F.cos(df['uniform']), 2) + F.pow(F.sin(df.uniform), 2)).alias("cos^2 + sin^2"),
).show()

+-------------------+------------------+------------------+
|            uniform|  DEGREES(uniform)|     cos^2 + sin^2|
+-------------------+------------------+------------------+
| 1.2990577122386398| 74.43052425519424|               1.0|
| 2.2958798545155346| 131.5442259328496|               1.0|
|  2.835954162707124|162.48820441567537|0.9999999999999999|
|0.29610844056259905| 16.96576392243732|0.9999999999999999|
| 1.2038918656734006| 68.97792289321647|               1.0|
| 1.7487432865543826|100.19560977140284|               1.0|
| 1.5629166016767624|  89.5485250070077|               1.0|
| 0.6520731573472516|37.361039849767565|               1.0|
| 3.0055826936438126|172.20720332335196|               1.0|
| 2.3328301748181857|133.66132333784805|               1.0|
+-------------------+------------------+------------------+



In [60]:
# Stop the SparkContext that was created in the setup cell.
sc.stop()