In [61]:
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import (
    StructType, 
    StructField, 
    StringType, 
    FloatType, 
    IntegerType
)

In [26]:
spark = SparkSession.builder.getOrCreate()

In [50]:
data = sns.load_dataset("diamonds").to_dict("records")

In [51]:
schema = StructType([
    StructField("carat", FloatType(), nullable = True), 
    StructField("cut", StringType(), nullable = True), 
    StructField("color", StringType(), nullable = True), 
    StructField("clarity", StringType(), nullable = True), 
    StructField("depth", FloatType(), nullable = True), 
    StructField("table", FloatType(), nullable = True), 
    StructField("price", IntegerType(), nullable = True), 
    StructField("x", FloatType(), nullable = True), 
    StructField("y", FloatType(), nullable = True), 
    StructField("z", FloatType(), nullable = True)
])

In [56]:
df = spark.createDataFrame(data, schema)

In [64]:
df.withColumn("price_ten_times", col("carat") * 10).show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+---------------+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|price_ten_times|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+---------------+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|            2.3|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|            2.1|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|            2.3|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|      2.8999999|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|            3.1|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|      2.3999999|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|      2.3999999|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|            2.6|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|            2.2|
| 0.23|Very Good|    H|    V

In [57]:
df.describe().show()

23/06/04 23:05:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 19:>                                                         (0 + 1) / 1]

+-------+-------------------+---------+-----+-------+------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|              carat|      cut|color|clarity|             depth|             table|            price|                 x|                 y|                 z|
+-------+-------------------+---------+-----+-------+------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|              53940|    53940|53940|  53940|             53940|             53940|            53940|             53940|             53940|             53940|
|   mean| 0.7979397459442544|     null| null|   null|61.749404890324215|57.457183908399585|3932.799721913237| 5.731157212872659| 5.734525955793015|3.5387337920972493|
| stddev|0.47401124283690627|     null| null|   null|1.4326213206653997|2.2344905638396404|3989.439738146377|1.1217607437465191|1.1421346736744051|0.7056988432752043

                                                                                

In [60]:
df.printSchema()

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)

