In [5]:
!pip install pyspark



In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('latihan_1').getOrCreate()
spark

In [7]:
iris_df = spark.read.option("inferSchema", "true").option("header", "true").csv("iris.csv")

iris_df.show(10)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



In [8]:
iris_df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



In [9]:
iris_df.select(["sepal_length","variety"]).show(5)

+------------+-------+
|sepal_length|variety|
+------------+-------+
|         5.1| Setosa|
|         4.9| Setosa|
|         4.7| Setosa|
|         4.6| Setosa|
|         5.0| Setosa|
+------------+-------+
only showing top 5 rows



In [10]:
iris_df.select(iris_df["sepal_length"] * 10).show(5)

+-------------------+
|(sepal_length * 10)|
+-------------------+
|               51.0|
|               49.0|
|               47.0|
|               46.0|
|               50.0|
+-------------------+
only showing top 5 rows



In [11]:
iris_df.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     NULL|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [12]:
iris_df.summary().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     NULL|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|   Setosa|
|    25%|               5.1|                2.8|               1.6|               0.3|     NULL|
|    50%|               5.8|                3.0|               4.3|               1.3|     NULL|
|    75%|               6.4|                3.3|               5.1|               1.8|     NULL|
|    max|               7.9|  

In [13]:
iris_df.sort("sepal_length",ascending=False).show(5)

+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  variety|
+------------+-----------+------------+-----------+---------+
|         7.9|        3.8|         6.4|        2.0|Virginica|
|         7.7|        3.8|         6.7|        2.2|Virginica|
|         7.7|        2.8|         6.7|        2.0|Virginica|
|         7.7|        2.6|         6.9|        2.3|Virginica|
|         7.7|        3.0|         6.1|        2.3|Virginica|
+------------+-----------+------------+-----------+---------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import min,max
iris_df.select(min("sepal_length"), max("sepal_length")).show()

+-----------------+-----------------+
|min(sepal_length)|max(sepal_length)|
+-----------------+-----------------+
|              4.3|              7.9|
+-----------------+-----------------+



In [15]:
from pyspark.sql.functions import avg
iris_df.select(avg("sepal_length")).show()

+-----------------+
|avg(sepal_length)|
+-----------------+
|5.843333333333335|
+-----------------+



In [16]:
iris_df.groupBy("variety").avg("petal_width").show()

+----------+------------------+
|   variety|  avg(petal_width)|
+----------+------------------+
| Virginica|             2.026|
|    Setosa|0.2459999999999999|
|Versicolor|1.3259999999999998|
+----------+------------------+



In [18]:
iris_df.groupBy("variety").agg(
    avg("sepal_length").alias("avg_sepal_length"),
    avg("sepal_width").alias("avg_sepal_width"),
    avg("petal_width").alias("avg_petal_width"),
    avg("petal_width").alias("avg_petal_width")
).show()

+----------+-----------------+------------------+------------------+------------------+
|   variety| avg_sepal_length|   avg_sepal_width|   avg_petal_width|   avg_petal_width|
+----------+-----------------+------------------+------------------+------------------+
| Virginica|6.587999999999998|2.9739999999999998|             2.026|             2.026|
|    Setosa|5.005999999999999| 3.428000000000001|0.2459999999999999|0.2459999999999999|
|Versicolor|            5.936|2.7700000000000005|1.3259999999999998|1.3259999999999998|
+----------+-----------------+------------------+------------------+------------------+

