# DataFrames
Are easier and faster

> Similar to an RDD, a DataFrame in Spark is an immutable distributed collection of
data. But unlike in an RDD, the data is organized into named columns, like a table in
a relational database. This is meant to make processing of large datasets easier. 

In [1]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum

In [2]:
spark = SparkSession.builder.appName("demo").getOrCreate()

24/05/16 14:32:06 WARN Utils: Your hostname, msi-MAG resolves to a loopback address: 127.0.1.1; using 192.168.0.129 instead (on interface wlp3s0)
24/05/16 14:32:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/16 14:32:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
dept_emps = [("Sales", "Barb", 40), ("Sales", "Dan", 20),
("IT", "Alex", 22), ("IT", "Jane", 24),
("HR", "Alex", 20), ("HR", "Mary", 30)]
df = spark.createDataFrame(dept_emps, ["dept", "name", "hours"])

In [4]:
# Group the same depts together, aggregate their hours, and compute an average
averages = (df.groupBy("dept")
.agg(avg("hours").alias('average'),
sum("hours").alias('total')))

In [5]:
averages.show()

[Stage 0:>                                                        (0 + 16) / 16]

+-----+-------+-----+
| dept|average|total|
+-----+-------+-----+
|Sales|   30.0|   60|
|   IT|   23.0|   46|
|   HR|   25.0|   50|
+-----+-------+-----+



                                                                                

In [11]:
spark.stop()

# ETL example with DataFrames

In [6]:
import os

In [8]:
path = "/home/msi/data-algorithms-with-spark/code/chap01/data"

In [9]:
input_path = os.path.join(path,"census_2010.json")
input_path

In [12]:
spark = SparkSession.builder \
.master("local") \
.appName("ETL") \
.getOrCreate()

In [13]:
census_df = spark.read.json(input_path)
census_df.show(10)

+---+-------+-------+----+
|age|females|  males|year|
+---+-------+-------+----+
|  0|1994141|2085528|2010|
|  1|1997991|2087350|2010|
|  2|2000746|2088549|2010|
|  3|2002756|2089465|2010|
|  4|2004366|2090436|2010|
|  5|2005925|2091803|2010|
|  6|2007781|2093905|2010|
|  7|2010281|2097080|2010|
|  8|2013771|2101670|2010|
|  9|2018603|2108014|2010|
+---+-------+-------+----+
only showing top 10 rows



In [14]:
# how many rows
census_df.count()

101

In [15]:
census_df[census_df['age'] > 54].show(10)

+---+-------+-------+----+
|age|females|  males|year|
+---+-------+-------+----+
| 55|2167706|2059204|2010|
| 56|2106460|1989505|2010|
| 57|2048896|1924113|2010|
| 58|2001049|1869486|2010|
| 59|1957350|1819943|2010|
| 60|1908602|1765522|2010|
| 61|1859542|1710843|2010|
| 62|1794846|1642438|2010|
| 63|1706899|1553454|2010|
| 64|1604801|1452106|2010|
+---+-------+-------+----+
only showing top 10 rows



In [16]:
census_df[census_df['age'] > 54].count() # how many seniors

46

In [17]:
from pyspark.sql.functions import lit
# pyspark.sql.functions.lit() -> Creates a Column of literal value.
seniors = census_df[census_df['age'] > 54]
seniors_final = seniors.withColumn('total', lit(seniors.males + seniors.females))

In [18]:
seniors_final.show()

+---+-------+-------+----+-------+
|age|females|  males|year|  total|
+---+-------+-------+----+-------+
| 55|2167706|2059204|2010|4226910|
| 56|2106460|1989505|2010|4095965|
| 57|2048896|1924113|2010|3973009|
| 58|2001049|1869486|2010|3870535|
| 59|1957350|1819943|2010|3777293|
| 60|1908602|1765522|2010|3674124|
| 61|1859542|1710843|2010|3570385|
| 62|1794846|1642438|2010|3437284|
| 63|1706899|1553454|2010|3260353|
| 64|1604801|1452106|2010|3056907|
| 65|1505088|1353125|2010|2858213|
| 66|1404227|1253164|2010|2657391|
| 67|1314295|1164006|2010|2478301|
| 68|1242906|1092883|2010|2335789|
| 69|1184673|1034415|2010|2219088|
| 70|1126180| 975512|2010|2101692|
| 71|1069608| 918217|2010|1987825|
| 72|1018530| 865438|2010|1883968|
| 73| 973223| 817131|2010|1790354|
| 74| 932810| 772524|2010|1705334|
+---+-------+-------+----+-------+
only showing top 20 rows



> another way using col instead

In [20]:

from pyspark.sql.functions import col
seniors.withColumn('total', col("males") + col("females")).show()

+---+-------+-------+----+-------+
|age|females|  males|year|  total|
+---+-------+-------+----+-------+
| 55|2167706|2059204|2010|4226910|
| 56|2106460|1989505|2010|4095965|
| 57|2048896|1924113|2010|3973009|
| 58|2001049|1869486|2010|3870535|
| 59|1957350|1819943|2010|3777293|
| 60|1908602|1765522|2010|3674124|
| 61|1859542|1710843|2010|3570385|
| 62|1794846|1642438|2010|3437284|
| 63|1706899|1553454|2010|3260353|
| 64|1604801|1452106|2010|3056907|
| 65|1505088|1353125|2010|2858213|
| 66|1404227|1253164|2010|2657391|
| 67|1314295|1164006|2010|2478301|
| 68|1242906|1092883|2010|2335789|
| 69|1184673|1034415|2010|2219088|
| 70|1126180| 975512|2010|2101692|
| 71|1069608| 918217|2010|1987825|
| 72|1018530| 865438|2010|1883968|
| 73| 973223| 817131|2010|1790354|
| 74| 932810| 772524|2010|1705334|
+---+-------+-------+----+-------+
only showing top 20 rows



In [22]:
usr = 'root'
psw = "mysql"

## Write into a DB
I cannot do it, does not work, sth must be missing

In [25]:
jdbc_driver_jar = "/usr/share/java/mysql-connector-java-8.4.0.jar"

In [30]:
(seniors_final
.write
.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/SparkDB")
.option("dbtable", "census")
.option("user", usr)
.option("password", psw)
.option("driver", "com.mysql.cj.jdbc.Driver")
.option("driver", jdbc_driver_jar) 
.mode("overwrite") 
.save())

Py4JJavaError: An error occurred while calling o181.save.
: java.lang.ClassNotFoundException: /usr/share/java/mysql-connector-java-8.4.0.jar
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:103)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:246)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:250)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:47)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
