In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the notebook's Spark session: local master with four
# worker threads and explicit executor/driver memory settings.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Csv-Üzeri-SQL")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

## Veriyi okuyalım; bu defterde tarih-saat sütunu (InvoiceDate) üzerinde çalışacağız

In [2]:
# Load the Online Retail CSV: the first line is a header row, fields are
# separated by ';', and Spark infers the column types.
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=";")
    .csv("D:\\Datasets\\OnlineRetail.csv")
)

In [3]:
# Quick look at the first five rows of the raw frame.
df.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



Beş satırdan hangi alanın gün, hangisinin ay olduğu anlaşılmıyor; formatı anlamak için daha fazla satır görelim.

In [4]:
# Print more rows so the day and month fields can be told apart.
df.show(n=15)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|1.12.2010 08:26|     7,65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|1.12.2010 08:

Evet, şimdi anlaşıldı. Format gün.ay.yıl saat:dakika,
yani dd.MM.yyyy HH:mm (gün, başında sıfır olmadan da yazılabiliyor; ör. 1.12.2010).

Datetime'da ise varsayılan format yyyy-MM-dd HH:mm:ss şeklindedir.

In [5]:
# Pattern of the raw InvoiceDate strings: day.month.year hour:minute.
# Single-letter day/month fields are used on purpose: the data contains days
# without a leading zero (e.g. "1.12.2010 08:26"). Spark 3.x parses "dd"/"MM"
# strictly as two digits, whereas "d"/"M" accept one or two digits on both
# Spark 2.x and 3.x.
mevcut_format = 'd.M.yyyy HH:mm'

# Örnek Tarih-Saat Operasyonları

In [6]:
from pyspark.sql import functions as F

# Clean the raw InvoiceDate text and derive typed date/timestamp columns.
# The chain is wrapped in parentheses instead of ending in a dangling
# backslash, which only parsed because a blank line happened to follow it.
df2 = (
    df
    # Strip surrounding whitespace so it cannot break pattern matching.
    .withColumn("InvoiceDate", F.trim(F.col("InvoiceDate")))
    # Calendar date only; the time-of-day part is dropped.
    .withColumn("normal_tarih", F.to_date(F.col("InvoiceDate"), mevcut_format))
    # Full timestamp, shown in Spark's default yyyy-MM-dd HH:mm:ss layout.
    .withColumn("standart_ts", F.to_timestamp(F.col("InvoiceDate"), mevcut_format))
)

df2.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|normal_tarih|        standart_ts|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84029E|RED WOOLLY HOTTIE...

# Tarih Formatı Değiştirme

In [7]:
# Output layouts used to render the parsed timestamp back into text.
format_tr = "dd/MM/yyyy HH:mm:ss"   # day/month/year
format_eng = "MM-dd-yyyy HH:mm:ss"  # month-day-year

# Re-format the timestamp as strings and convert it to epoch seconds.
# The chain is wrapped in parentheses instead of ending in a dangling
# backslash, which only parsed because a blank line happened to follow it.
df3 = (
    df2
    .withColumn("TSTR", F.date_format(F.col("standart_ts"), format_tr))
    .withColumn("TSENG", F.date_format(F.col("standart_ts"), format_eng))
    # NOTE(review): epoch seconds depend on the Spark session time zone, so
    # the value may differ between machines — confirm if it must be stable.
    .withColumn("unix_time", F.unix_timestamp(F.col("standart_ts")))
)

df3.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+-------------------+-------------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|normal_tarih|        standart_ts|               TSTR|              TSENG| unix_time|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+-------------------+-------------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010

# Tarih ekleme, tarih farkı, timestamp içinden yılı alma

In [8]:
# Date arithmetic demo: shift by a number of days, extract the year, and
# measure the distance in days between two date/timestamp columns.
df4 = (
    df2
    # date_add works in whole days; 365 days is not always one calendar year
    # (leap years), so "bir_yil" is really "365 days later".
    .withColumn("bir_yil", F.date_add("standart_ts", 365))
    .withColumn("yil", F.year("standart_ts"))
    # Number of days from standart_ts to bir_yil.
    .withColumn("fark", F.datediff("bir_yil", "standart_ts"))
)


df4.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+----------+----+----+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|normal_tarih|        standart_ts|   bir_yil| yil|fark|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+----------+----+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|   84029G|KNITTED UNION FLA...|       6|1