In [26]:
import pyspark
from pyspark import SparkContext, SQLContext

# SparkContext must be imported explicitly: the cell previously relied on the
# name already being present in the kernel and failed with NameError otherwise.
# getOrCreate() is called so an already-running context is reused.
sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

# Read the country-level COVID summary. The first line of the file is treated
# as the header; no schema is supplied, so every column is loaded as a string
# (see the printed schema below).
# NOTE(review): "com.databricks.spark.csv" is the legacy spark-csv source name;
# the built-in "csv" format is presumably equivalent on Spark 2+ -- confirm
# before switching.
df = (
    sql.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("covid_dataset/country_wise_latest.csv")
)

# Show the resulting column types and a three-row preview.
df.printSchema()
df.limit(3).show()

root
 |-- Country/Region: string (nullable = true)
 |-- Confirmed: string (nullable = true)
 |-- Deaths: string (nullable = true)
 |-- Recovered: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- New cases: string (nullable = true)
 |-- New deaths: string (nullable = true)
 |-- New recovered: string (nullable = true)
 |-- Deaths / 100 Cases: string (nullable = true)
 |-- Recovered / 100 Cases: string (nullable = true)
 |-- Deaths / 100 Recovered: string (nullable = true)
 |-- Confirmed last week: string (nullable = true)
 |-- 1 week change: string (nullable = true)
 |-- 1 week % increase: string (nullable = true)
 |-- WHO Region: string (nullable = true)

+--------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|Deaths / 1

In [25]:
import pyspark
import pyspark.sql.types as Types
from pyspark.sql import SparkSession

# These two names were used below without being defined anywhere in the
# notebook, so the cell only ran when they happened to exist in the kernel.
# NOTE(review): the values are assumed defaults for a local run -- adjust
# `master` when submitting to a cluster.
appName = "World population"
master = "local[*]"

spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

# Explicit schema for the population file: every field is read as a nullable
# string, matching the printed schema below.
schema = Types.StructType([
    Types.StructField('Country/Region', Types.StringType(), True),
    Types.StructField('Population', Types.StringType(), True),
    Types.StructField('Urban Pop %', Types.StringType(), True),
    Types.StructField('World Share %', Types.StringType(), True),
    Types.StructField('Med. Age', Types.StringType(), True),
])

# multiLine=True is passed so the reader accepts JSON records that span
# several lines instead of requiring one record per line.
df = spark.read.json(
    "world_population/population.json",
    schema=schema,
    multiLine=True,
)

# Show the resulting column types and a three-row preview.
df.printSchema()
df.limit(3).show()

root
 |-- Country/Region: string (nullable = true)
 |-- Population: string (nullable = true)
 |-- Urban Pop %: string (nullable = true)
 |-- World Share %: string (nullable = true)
 |-- Med. Age: string (nullable = true)

+--------------+----------+-----------+-------------+--------+
|Country/Region|Population|Urban Pop %|World Share %|Med. Age|
+--------------+----------+-----------+-------------+--------+
|         China|1440297825|         61|        18.47|      38|
|         India|1382345085|         35|        17.70|      28|
| United States| 331341050|         83|         4.25|      38|
+--------------+----------+-----------+-------------+--------+



In [24]:
import os

from pyspark.sql import SparkSession

# NOTE(review): "spark.jars" is applied when this call creates the session; if
# a session already exists in the kernel, getOrCreate() returns that one and
# the PostgreSQL driver jar may not be on its classpath -- run this cell first
# or restart the kernel if the driver class cannot be found.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.jars", "jars/postgresql-42.2.16.jar")
    .getOrCreate()
)

# Database credentials are taken from the environment when available so they
# do not have to live in the notebook; the fallbacks keep the previous
# behaviour for a default local PostgreSQL install.
pg_user = os.environ.get("PGUSER", "postgres")
pg_password = os.environ.get("PGPASSWORD", "postgres")

# Load the `payment` table of the dvdrental database over JDBC.
payment_df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/dvdrental")
    .option("dbtable", "payment")
    .option("user", pg_user)
    .option("password", pg_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

payment_df.printSchema()

# createOrReplaceTempView() registers the view and returns None, so its result
# must not be assigned back to the DataFrame variable.
payment_df.createOrReplaceTempView("payment_table_view")

# Keep only payments above 2.00 and preview three of them. `df` remains the
# name of the filtered result, as before.
df = spark.sql("Select * from payment_table_view where amount > 2.00")
df.limit(3).show()

root
 |-- payment_id: integer (nullable = true)
 |-- customer_id: short (nullable = true)
 |-- staff_id: short (nullable = true)
 |-- rental_id: integer (nullable = true)
 |-- amount: decimal(5,2) (nullable = true)
 |-- payment_date: timestamp (nullable = true)

+----------+-----------+--------+---------+------+--------------------+
|payment_id|customer_id|staff_id|rental_id|amount|        payment_date|
+----------+-----------+--------+---------+------+--------------------+
|     17503|        341|       2|     1520|  7.99|2007-02-15 22:25:...|
|     17505|        341|       1|     1849|  7.99|2007-02-16 22:41:...|
|     17506|        341|       2|     2829|  2.99|2007-02-19 19:39:...|
+----------+-----------+--------+---------+------+--------------------+

