In [2]:
import os
import sys

# Point this kernel at the Spark 2.4.3 installation and derive the directory that
# holds the bundled Python archives from it.
os.environ["SPARK_HOME"] = "/usr/spark2.4.3"
os.environ["PYLIB"] = "{}/python/lib".format(os.environ["SPARK_HOME"])
# Both interpreter variables get the same Anaconda Python.
# Use /usr/bin/python2.7 as the value instead if you want to run on Python 2.
os.environ.update(
    dict.fromkeys(
        ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"),
        "/usr/local/anaconda/bin/python",
    )
)
# Prepend both archives in one step: pyspark.zip ends up first on sys.path,
# immediately followed by the py4j source archive.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
# Create the Spark entry points idempotently. Only one SparkContext may be active
# per process, so constructing SparkContext(conf=conf) directly raises an error
# when this cell is run a second time in the same kernel; getOrCreate() reuses the
# active context/session instead.
conf = SparkConf().setAppName("appName")
sc = SparkContext.getOrCreate(conf=conf)
# The builder picks up the context created above rather than starting a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# Read the 2010 flight summary from the local filesystem as JSON; the schema is
# not supplied here, so Spark determines it from the data.
df = spark.read.load(
    "file:/home/wilsonsagar8680/tinku/2010-summary.json",
    format="json",
)

In [5]:
# Print df's schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [6]:
# Load the 2015 summary file and display its schema as a StructType object
# (the programmatic counterpart of the printSchema() tree).
spark.read.json("file:/home/wilsonsagar8680/tinku/2015-summary.json").schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [7]:
# Column names of df as a plain Python list of strings.
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [8]:
# Fetch the first row of df; the result is a Row whose fields are named after the columns.
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1)

In [9]:
from pyspark.sql import Row
# Build a Row from positional values only; no field names are given here.
myrow = Row("India", "US", 2)

In [10]:
# Values of a Row can be read by position, like a tuple.
myrow[0]

'India'

In [11]:
# Register df as a temporary view named "dftable" (replacing any existing view of
# that name). NOTE(review): no query against "dftable" appears in the visible cells.
df.createOrReplaceTempView("dftable")

In [12]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
# Hand-written schema: two nullable string columns followed by one nullable long column.
manSchema = (
    StructType()
    .add("Country", StringType(), True)
    .add("State", StringType(), True)
    .add("Cases", LongType(), True)
)
# myrow carries positional values only, so they line up with the schema fields in order.
myDf = spark.createDataFrame([myrow], schema=manSchema)

In [13]:
# Render the single-row DataFrame as a text table.
myDf.show()

+-------+-----+-----+
|Country|State|Cases|
+-------+-----+-----+
|  India|   US|    2|
+-------+-----+-----+



In [14]:
# Project the three columns by name and print at most one row.
myDf.select(["Country", "State", "Cases"]).show(n=1)

+-------+-----+-----+
|Country|State|Cases|
+-------+-----+-----+
|  India|   US|    2|
+-------+-----+-----+



In [15]:
from pyspark.sql.functions import expr, col
# Select the "State" column but expose it under the name "Country".
myDf.select(myDf["State"].alias("Country")).show()

+-------+
|Country|
+-------+
|     US|
+-------+



In [16]:
# Keep the destination column and rename "count" to "NumFlights" via a SQL
# expression, then print the first 10 rows.
df.select(expr("DEST_COUNTRY_NAME"), expr("count as NumFlights")).show(10)

+-----------------+----------+
|DEST_COUNTRY_NAME|NumFlights|
+-----------------+----------+
|    United States|         1|
|    United States|       264|
|    United States|        69|
|            Egypt|        24|
|Equatorial Guinea|         1|
|    United States|        25|
|    United States|        54|
|       Costa Rica|       477|
|          Senegal|        29|
|    United States|        44|
+-----------------+----------+
only showing top 10 rows



In [17]:
# Split df into two DataFrames weighted 25% / 75%; the fixed seed makes the
# split repeatable.
randf = df.randomSplit(weights=[0.25, 0.75], seed=5)

In [18]:
# Check whether the first split (weight 0.25) holds more rows than the second
# (weight 0.75). Each count() runs a separate Spark job.
randf[0].count() > randf[1].count()

False

In [19]:
# Order rows by "count" from smallest to largest and print the first five.
df.sort("count", ascending=True).show(n=5)

+-----------------+--------------------+-----+
|DEST_COUNTRY_NAME| ORIGIN_COUNTRY_NAME|count|
+-----------------+--------------------+-----+
|         Slovakia|       United States|    1|
|          Liberia|       United States|    1|
|    United States|              Cyprus|    1|
|Equatorial Guinea|       United States|    1|
|    United States|Bosnia and Herzeg...|    1|
+-----------------+--------------------+-----+
only showing top 5 rows



In [20]:
# Order by destination name A-Z, breaking ties by "count" from largest to
# smallest, and print the first five rows.
df.orderBy("DEST_COUNTRY_NAME", "count", ascending=[True, False]).show(5)

+-------------------+-------------------+-----+
|  DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-------------------+-------------------+-----+
|        Afghanistan|      United States|   11|
|             Angola|      United States|   14|
|           Anguilla|      United States|   21|
|Antigua and Barbuda|      United States|  123|
|          Argentina|      United States|  184|
+-------------------+-------------------+-----+
only showing top 5 rows



In [21]:
# Number of partitions of the RDD underlying df.
df.rdd.getNumPartitions()

1

In [22]:
# DataFrame restricted to at most 10 rows of df.
lten = df.limit(10)

In [23]:
# Print the 10-row subset as a text table. Only a cell's last expression is
# displayed, so a bare take(5) ahead of this line would run an extra Spark job
# and throw its result away; the cell performs just the action whose output is shown.
lten.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+



In [26]:
from pyspark.sql.functions import lit
# Select three literal (constant) columns: an int, a string and a double.
# Nothing is computed here; the cell only displays the resulting DataFrame's schema.
df.select(*map(lit, (5, "five", 5.0)))

DataFrame[5: int, five: string, 5.0: double]