In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook: local master
# "local[4]", application name "time", and explicit executor/driver memory
# settings. The underlying SparkContext is exposed as `sc`.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("time")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Load the semicolon-separated CSV (first line is the header) and keep only
# the distinct InvoiceDate values.
# Schema inference is deliberately not requested: it costs an extra pass over
# the whole file, and the only column kept here is consumed as a string by the
# to_date / to_timestamp calls later in the notebook.
# NOTE(review): the path is an absolute local path; adjust it for your machine.
df = (
    spark.read
    .option("header", "True")
    .option("sep", ";")
    .csv("/home/taha/Downloads/OnlineRetail.csv")
    .select("InvoiceDate")
    .distinct()
)

In [3]:
# Preview the first five distinct InvoiceDate strings.
df.show(n=5)

+----------------+
|     InvoiceDate|
+----------------+
| 3.12.2010 16:50|
| 7.12.2010 12:28|
| 8.12.2010 15:02|
|10.12.2010 09:53|
|12.12.2010 13:32|
+----------------+
only showing top 5 rows



In [4]:
# Pattern describing how InvoiceDate is written in the CSV.
# The day is NOT zero-padded in the data (e.g. "3.12.2010 16:50"), so the
# single-letter forms "d" / "M" are used: they accept both one- and two-digit
# values. The two-letter forms ("dd", "MM") demand exactly two digits under
# the Spark 3 datetime parser and would fail on such rows.
mevcut_format = "d.M.yyyy HH:mm"

In [13]:
from pyspark.sql import functions as F

In [14]:
# Parse the raw InvoiceDate string twice using the source pattern:
#   normal_tarih -> date
#   standar_ts   -> timestamp
df2 = (
    df
    .withColumn("normal_tarih", F.to_date("InvoiceDate", mevcut_format))
    .withColumn("standar_ts", F.to_timestamp("InvoiceDate", mevcut_format))
)

df2.show(n=5)

+----------------+------------+-------------------+
|     InvoiceDate|normal_tarih|         standar_ts|
+----------------+------------+-------------------+
| 3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|
| 7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|
| 8.12.2010 15:02|  2010-12-08|2010-12-08 15:02:00|
|10.12.2010 09:53|  2010-12-10|2010-12-10 09:53:00|
|12.12.2010 13:32|  2010-12-12|2010-12-12 13:32:00|
+----------------+------------+-------------------+
only showing top 5 rows



In [13]:
# Print the column names and data types of df2 to check the parsed columns.
df2.printSchema()

root
 |-- InvoiceDate: string (nullable = true)
 |-- normal_tarih: date (nullable = true)
 |-- standar_ts: timestamp (nullable = true)



# Tarih formatı değiştirme

In [14]:
# Target output pattern (day/month/year plus time with seconds) that is
# passed to F.date_format when building the TSTR column.
format_tr = "dd/MM/yyyy HH:mm:ss"

In [17]:
# Render the parsed timestamp back to a string using the format_tr pattern.
df3 = df2.withColumn("TSTR", F.date_format("standar_ts", format_tr))

In [19]:
# Preview the first two rows, including the reformatted TSTR column.
df3.show(n=2)

+---------------+------------+-------------------+-------------------+
|    InvoiceDate|normal_tarih|         standar_ts|               TSTR|
+---------------+------------+-------------------+-------------------+
|3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|03/12/2010 16:50:00|
|7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|07/12/2010 12:28:00|
+---------------+------------+-------------------+-------------------+
only showing top 2 rows



# Tarih ekleme, tarih farkı alma ve timestamp içinden yıl alma

In [21]:
# Date arithmetic on the parsed timestamp:
#   bir_yil -> the date one calendar year after standar_ts
#   yil     -> the year component of standar_ts
#   fark    -> number of days between bir_yil and standar_ts
df4 = (
    df2
    # add_months shifts by calendar months, so the result is the same day one
    # year later even when the span crosses Feb 29. A fixed 365-day offset
    # would land one day early in that case.
    .withColumn("bir_yil", F.add_months(F.col("standar_ts"), 12))
    .withColumn("yil", F.year(F.col("standar_ts")))
    .withColumn("fark", F.datediff(F.col("bir_yil"), F.col("standar_ts")))
)

df4.show(2)

+---------------+------------+-------------------+----------+----+----+
|    InvoiceDate|normal_tarih|         standar_ts|   bir_yil| yil|fark|
+---------------+------------+-------------------+----------+----+----+
|3.12.2010 16:50|  2010-12-03|2010-12-03 16:50:00|2011-12-03|2010| 365|
|7.12.2010 12:28|  2010-12-07|2010-12-07 12:28:00|2011-12-07|2010| 365|
+---------------+------------+-------------------+----------+----+----+
only showing top 2 rows

