In [2]:
# Must run before any pyspark import: findspark.init() is expected to locate the
# local Spark installation and make pyspark importable — confirm for your setup.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

# Build (or reuse) a single local session named "Timestamp" and keep a handle
# on its SparkContext for context-level settings such as the log level.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Timestamp")
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Session tuning for this notebook.
# - Only ERROR-level Spark logs (and above) are shown.
# - Shuffles use a fixed 3 partitions. Adaptive Query Execution and its
#   coalescePartitions feature (which would make the partition count dynamic)
#   are switched off so that the count stays fixed.
# - Arrow-based conversion is enabled, with fallback to the non-Arrow path.
#   Spark 3.0 renamed these keys to spark.sql.execution.arrow.pyspark.*;
#   the older spark.sql.execution.arrow.* names are deprecated.
sc.setLogLevel("ERROR")
spark.conf.set("spark.sql.shuffle.partitions", 3)
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", True)

"""
Read the file with timestamp as format
1. Option("timeStampformat","M/d/yyyy") - This method is useful when we have all columns with same date format
Option(dateformat) -will still recognise as string
2.Use custom schema and load it with option(dateformat) while reading file
"""

# Baseline read with schema inference.
# header=True takes the column names from the first row; inferSchema=True makes
# Spark scan the data to choose column types (without it every column is read
# as a string).
# The dataset folder can be overridden through the DATASET_DIR environment
# variable; the default is the original local path. The trailing "/" is
# normalised because the reads build paths as filepath + "<file name>".
filepath = os.environ.get(
    "DATASET_DIR",
    "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/",
).rstrip("/") + "/"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv(filepath + "IntPersonal_transactions.csv")
)

In [6]:
# With inferSchema alone, Date stays a string: its M/d/yyyy values do not match
# Spark's default timestamp pattern (yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]), so
# inference falls back to StringType for that column.
df.schema

StructType([StructField('Customer_No', IntegerType(), True), StructField('Card_type', StringType(), True), StructField('Date', StringType(), True), StructField('Category', StringType(), True), StructField('Transaction Type', StringType(), True), StructField('Amount', DoubleType(), True)])

In [9]:
# Method 1: keep schema inference, but tell the reader the timestamp pattern
# so that the Date column is inferred as a timestamp instead of a string.
df1 = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        timestampformat="M/d/yyyy",
        delimiter=",",
    )
    .csv(filepath + "IntPersonal_transactions.csv")
)
df1.schema

StructType([StructField('Customer_No', IntegerType(), True), StructField('Card_type', StringType(), True), StructField('Date', TimestampType(), True), StructField('Category', StringType(), True), StructField('Transaction Type', StringType(), True), StructField('Amount', DoubleType(), True)])

In [16]:
# Method 2: supply an explicit schema and let dateFormat drive the parsing of
# the DateType column. No inferSchema option is set here: Spark ignores
# inference when a schema is supplied, so setting it only obscured which
# schema is actually in effect.
schema = StructType([
    StructField('Customer_No', IntegerType(), True),
    StructField('Card_type', StringType(), True),
    StructField('Date', DateType(), True),
    StructField('Category', StringType(), True),
    StructField('Transaction Type', StringType(), True),
    # NOTE(review): FloatType is single precision; consider DoubleType or
    # DecimalType for monetary amounts — confirm intent before changing.
    StructField('Amount', FloatType(), True),
])

df2 = (
    spark.read
    .option("header", True)
    .option("dateformat", "M/d/yyyy")
    .option("delimiter", ",")
    .csv(filepath + "IntPersonal_transactions.csv", schema=schema)
)

df2.printSchema()

root
 |-- Customer_No: integer (nullable = true)
 |-- Card_type: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Category: string (nullable = true)
 |-- Transaction Type: string (nullable = true)
 |-- Amount: float (nullable = true)



In [17]:
# Preview the parsed rows; the Date column should now print as yyyy-MM-dd dates.
df2.show()

+-----------+-------------+----------+-------------------+----------------+-------+
|Customer_No|    Card_type|      Date|           Category|Transaction Type| Amount|
+-----------+-------------+----------+-------------------+----------------+-------+
|    1000501|Platinum Card|2018-01-01|           Shopping|           debit|  11.11|
|    1000501|     Checking|2018-01-02|    Mortgage & Rent|           debit|1247.44|
|    1000501|  Silver Card|2018-01-02|        Restaurants|           debit|  24.22|
|    1000501|Platinum Card|2018-01-03|Credit Card Payment|          credit|2298.09|
|    1000501|Platinum Card|2018-01-04|      Movies & DVDs|           debit|  11.76|
|    1000501|  Silver Card|2018-01-05|        Restaurants|           debit|  25.85|
|    1000501|  Silver Card|2018-01-06|   Home Improvement|           debit|  18.45|
|    1000501|     Checking|2018-01-08|          Utilities|           debit|   45.0|
|    1000501|  Silver Card|2018-01-08|   Home Improvement|           debit| 