In [1]:
import os
import sys

os.environ["JAVA_HOME"] = "JDK 8/Contents/Home"
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col

from pyspark.sql.types import IntegerType


spark = SparkSession.builder.appName("Saving and Project").getOrCreate()

In [6]:
data = [
    (1, "Moby Dick", "Herman Melville", 1851),
    (2, "Treasure Island", "Robert Louis Stevenson", 1883),
    (3, "Robinson Crusoe", "Daniel Defoe", 1719),
    (4, "The Fellowship of the Ring", "J. R. R. Tolkien", 1954),
    (5, "The Hitchhikers Guide to the Galaxy", "Douglas Adams", 1979)
    ]

columns = ["ID", "book_name", "author", "publish_date"]

df_books = spark.createDataFrame(data, columns)
df_books.show()

                                                                                

+---+--------------------+--------------------+------------+
| ID|           book_name|              author|publish_date|
+---+--------------------+--------------------+------------+
|  1|           Moby Dick|     Herman Melville|        1851|
|  2|     Treasure Island|Robert Louis Stev...|        1883|
|  3|     Robinson Crusoe|        Daniel Defoe|        1719|
|  4|The Fellowship of...|    J. R. R. Tolkien|        1954|
|  5|The Hitchhikers G...|       Douglas Adams|        1979|
+---+--------------------+--------------------+------------+



In [8]:
output_path = "./csv_out"
df_books.write.csv(output_path, header=True)

                                                                                

In [40]:
import pandas as pd



In [41]:
df = spark.read.csv("lightening strikes dataset.csv", header=True)
df.show()

+----------+-----------------+-----------------+
|      date|number_of_strikes|center_point_geom|
+----------+-----------------+-----------------+
|2018-01-03|              194|    POINT(-75 27)|
|2018-01-03|               41|  POINT(-78.4 29)|
|2018-01-03|               33|  POINT(-73.9 27)|
|2018-01-03|               38|  POINT(-73.8 27)|
|2018-01-03|               92|    POINT(-79 28)|
|2018-01-03|              119|    POINT(-78 28)|
|2018-01-03|               35|  POINT(-79.3 28)|
|2018-01-03|               60|  POINT(-79.1 28)|
|2018-01-03|               41|  POINT(-78.7 28)|
|2018-01-03|              119|  POINT(-78.6 28)|
|2018-01-03|              107|  POINT(-78.5 28)|
|2018-01-03|              158|  POINT(-78.4 28)|
|2018-01-03|              168|  POINT(-78.3 28)|
|2018-01-03|              167|  POINT(-78.2 28)|
|2018-01-03|              121|  POINT(-78.1 28)|
|2018-01-03|               47|  POINT(-77.9 28)|
|2018-01-03|               33|  POINT(-75.3 27)|
|2018-01-03|        

In [42]:
df = df.withColumn("number_of_strikes", col("number_of_strikes").cast(IntegerType()))
df = df.dropna()

df.show()

+----------+-----------------+-----------------+
|      date|number_of_strikes|center_point_geom|
+----------+-----------------+-----------------+
|2018-01-03|              194|    POINT(-75 27)|
|2018-01-03|               41|  POINT(-78.4 29)|
|2018-01-03|               33|  POINT(-73.9 27)|
|2018-01-03|               38|  POINT(-73.8 27)|
|2018-01-03|               92|    POINT(-79 28)|
|2018-01-03|              119|    POINT(-78 28)|
|2018-01-03|               35|  POINT(-79.3 28)|
|2018-01-03|               60|  POINT(-79.1 28)|
|2018-01-03|               41|  POINT(-78.7 28)|
|2018-01-03|              119|  POINT(-78.6 28)|
|2018-01-03|              107|  POINT(-78.5 28)|
|2018-01-03|              158|  POINT(-78.4 28)|
|2018-01-03|              168|  POINT(-78.3 28)|
|2018-01-03|              167|  POINT(-78.2 28)|
|2018-01-03|              121|  POINT(-78.1 28)|
|2018-01-03|               47|  POINT(-77.9 28)|
|2018-01-03|               33|  POINT(-75.3 27)|
|2018-01-03|        

In [43]:
df_sorted = df.orderBy(desc("number_of_strikes"))
df_sorted.show()



+----------+-----------------+-----------------+
|      date|number_of_strikes|center_point_geom|
+----------+-----------------+-----------------+
|2018-08-20|             2211|POINT(-92.5 35.5)|
|2018-08-16|             2142|POINT(-96.1 36.1)|
|2018-08-17|             2061|POINT(-90.2 36.1)|
|2018-08-17|             2031|POINT(-89.9 35.9)|
|2018-08-16|             1902|POINT(-96.2 36.1)|
|2018-02-10|             1899|POINT(-95.5 28.1)|
|2018-08-16|             1878|POINT(-89.7 31.5)|
|2018-02-25|             1833|POINT(-98.7 28.9)|
|2018-08-17|             1767|  POINT(-90.1 36)|
|2018-02-25|             1741|    POINT(-98 29)|
|2018-02-11|             1686|  POINT(-88.7 29)|
|2018-02-25|             1655|POINT(-98.6 28.9)|
|2018-08-17|             1629|POINT(-89.4 35.7)|
|2018-01-11|             1611|  POINT(-76 24.1)|
|2018-02-22|             1559|POINT(-96.6 32.7)|
|2018-02-11|             1513|POINT(-88.6 29.1)|
|2018-08-25|             1497|POINT(-85.3 39.5)|
|2018-08-28|        

                                                                                

In [44]:
avg_strikes_per_location = df.groupBy("center_point_geom").agg(round(avg("number_of_strikes").alias("Avg Strikes"),2))
avg_strikes_per_location.show()



+-----------------+-------------------------------------------------+
|center_point_geom|round(avg(number_of_strikes) AS `Avg Strikes`, 2)|
+-----------------+-------------------------------------------------+
|POINT(-95.3 29.4)|                                            17.52|
|POINT(-92.6 24.9)|                                             6.65|
|POINT(-92.2 26.1)|                                             8.64|
|POINT(-91.3 28.1)|                                            11.44|
|POINT(-93.9 28.1)|                                             12.3|
|POINT(-87.4 25.2)|                                             8.94|
|POINT(-86.9 26.4)|                                            11.91|
|POINT(-76.3 26.7)|                                            14.53|
|  POINT(-85.5 26)|                                            10.03|
|POINT(-76.1 25.7)|                                             11.0|
|POINT(-99.9 33.6)|                                            26.79|
|  POINT(-77 26.5)| 

                                                                                

In [45]:
pandas_df = avg_strikes_per_location.toPandas()

pandas_df.to_csv("Ordered_lightning_strikes.csv")
print("CSV file saved successfully!")

                                                                                

IsADirectoryError: [Errno 21] Is a directory: 'Ordered_lightning_strikes.csv'

In [56]:
url = "jdbc:postgresql://localhost:5432/Bank"

properties = {
    "user": "postgres",
    "password": "cd090264"
}

df = spark.read.jdbc(url=url, table="customer", properties=properties)
df.show()

+-----------+---------+--------------------+-------------------+----------+--------------------+
|customer_id|branch_id|        custom_email|custom_phone_number|       dob|             address|
+-----------+---------+--------------------+-------------------+----------+--------------------+
|          1|        1|smathet0@wikimedi...|         3873521982|1982-07-25|   3956 Luster Court|
|          2|        3|bradleigh1@elegan...|         4071191740|2001-02-21|   844 Crowley Plaza|
|          3|        3|kwallington2@illi...|         6488849541|1985-02-11|65734 David Crossing|
|          4|        2|sdelwater3@yahoo....|         5273621186|2001-04-03|  60 Jackson Terrace|
|          5|        5|    bdreng4@admin.ch|         1149438413|1993-07-07|    252 Hintze Trail|
|          6|        5|   lkerman5@usda.gov|         4062650274|2003-01-10|      190 Havey Lane|
|          7|        1| bcastree6@youku.com|         7655139497|1975-04-14| 50 Pierstorff Alley|
|          8|        3|   qwom

In [53]:
pip show pyspark

Name: pyspark
Version: 3.5.4
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: py4j
Required-by: 
Note: you may need to restart the kernel to use updated packages.
