In [None]:
# Source tutorial: https://sparkbyexamples.com/pyspark/pyspark-show-display-dataframe-contents-in-table/

In [1]:
import findspark
# Locate the local Spark installation so the pyspark imports in later cells resolve
# (per findspark's documented behaviour).
findspark.init()
# Last expression of the cell: displays the Spark home directory that was found.
findspark.find()

'C:\\apps\\myspark'

In [2]:
# Entry point of the DataFrame API.
from pyspark.sql import SparkSession

# Obtain a session named 'SparkByExamples.com'. getOrCreate() hands back the
# already-active session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

In [3]:
# Sample rows as (language, users_count) pairs; both fields are strings.
data = [
    ("Java", "20000"),
    ("Python", "100000"),
    ("Scala", "3000"),
]

# Column names, in the same order as the tuple fields.
columns = ["language", "users_count"]

# Build the DataFrame with the column names applied at creation time.
df = spark.createDataFrame(data, schema=columns)

# Render the rows as a text table.
df.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [4]:
# Print the column names, types and nullability of the demo DataFrame as a tree.
df.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



In [5]:
# Show at most three rows in vertical layout (one "field | value" line per column),
# cutting any value longer than 25 characters.
df.show(
    n=3,
    truncate=25,
    vertical=True,
)

-RECORD 0-------------
 language    | Java   
 users_count | 20000  
-RECORD 1-------------
 language    | Python 
 users_count | 100000 
-RECORD 2-------------
 language    | Scala  
 users_count | 3000   



In [6]:
# Load the shop CSV with the reader's default options. As the printed schema shows,
# every column comes back as a string under an auto-generated _cN name.
# NOTE(review): this rebinds `df`, replacing the small demo DataFrame built earlier.
# NOTE(review): the row count reported by the next cell ends in ...001, which may mean
# a header line is being read as data - confirm against the file and, if so, pass
# header=True (and ideally an explicit schema).
df = spark.read.csv("./dat/shops/sh-1-2022.csv")
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [7]:
# Number of rows in the loaded CSV. count() is an action, so Spark reads the data here.
df.count()

12960001

In [10]:
# Build the per-column summary statistics (count, mean, stddev, min, ...) as a new
# DataFrame. Nothing is displayed here; the following cell shows it.
# NOTE(review): `a` is an uninformative name (e.g. `df_summary` would read better),
# but the next cell refers to `a`, so both would have to change together.
a = df.summary()

In [11]:
# Display the summary statistics computed for the CSV columns.
a.show()

+-------+--------+--------------------+-----------------+----------------+--------+-----------------+-----------------+-----------------+--------+
|summary|     _c0|                 _c1|              _c2|             _c3|     _c4|              _c5|              _c6|              _c7|     _c8|
+-------+--------+--------------------+-----------------+----------------+--------+-----------------+-----------------+-----------------+--------+
|  count|12960001|            12960001|         12960001|        12960001|12960001|         12960001|         12960001|         12960001|12960001|
|   mean|     1.0|              2022.0|              6.5|            15.5|    null|              3.0|             50.5|3.000376774691358|     5.0|
| stddev|     0.0|4.038514911957013...|3.452052662715734|8.65544178232836|    null|1.414213616933789|28.86607116138227|1.414081823767059|     0.0|
|    min|      01|                2022|               01|              01|   09:00|               01|               01

In [4]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Variant 1: build the DataFrame from Row objects. No schema is supplied, so the
# column names come from the Row keywords and the types are inferred from the values.
sp_df1_rows = [
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
]
sp_df1 = spark.createDataFrame(sp_df1_rows)

# Check which types were inferred.
sp_df1.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [5]:
# Enable Arrow-based columnar transfer between pandas and Spark.
# 'spark.sql.execution.arrow.enabled' has been deprecated since Spark 3.0;
# 'spark.sql.execution.arrow.pyspark.enabled' is the current name of this setting.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Variant 2: build the DataFrame from plain tuples with an explicit DDL schema
# string, so column names and types are declared rather than inferred.
sp_df2 = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
sp_df2.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [6]:
# Variant 3: assemble the same three-row sample as a pandas DataFrame, column by
# column, then hand it to Spark.
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    # First day of January, February and March 2000.
    'd': [date(2000, month, 1) for month in (1, 2, 3)],
    # Noon on 1, 2 and 3 January 2000.
    'e': [datetime(2000, 1, day, 12, 0) for day in (1, 2, 3)],
})
print("pandas shape:", pandas_df.shape)

# Convert the pandas frame into a Spark DataFrame and inspect the resulting types.
sp_df3 = spark.createDataFrame(pandas_df)
sp_df3.printSchema()

pandas shape: (3, 5)
root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  for column, series in pdf.iteritems():


In [7]:
# Variant 4: three-column sample (no date/timestamp fields) with an explicit
# DDL schema string.
sp_df4_rows = [
    (1, 2.0, 'string1'),
    (2, 3.0, 'string2'),
    (3, 4.0, 'string3'),
]
sp_df4 = spark.createDataFrame(sp_df4_rows, schema='a long, b double, c string')

# Show the declared schema, then the rows themselves.
sp_df4.printSchema()
sp_df4.show()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)

+---+---+-------+
|  a|  b|      c|
+---+---+-------+
|  1|2.0|string1|
|  2|3.0|string2|
|  3|4.0|string3|
+---+---+-------+

