In [1]:
import findspark
# Per findspark's documentation, init() makes the local Spark installation
# importable as a regular library; it is run here before pyspark is imported.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Tune the settings below to match the memory available on your machine.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Dataset-Olusturmak")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Keep the session's SparkContext under a short alias.
sc = spark.sparkContext

# Listeden DataFrame oluşturmak

In [23]:
from pyspark.sql import Row

# Distribute a small list of integers and wrap each element in a Row.
list_rdd = (
    sc.parallelize([1, 2, 3, 4, 5, 6, 4, 5])
    .map(lambda number: Row(number))
)

In [30]:
# Column names are passed as a Python list even when there is only one column.
df_from_list = list_rdd.toDF(schema=["rakamlar"])

In [31]:
# Return every row as a Python list of Row objects (fine here: only 8 values).
df_from_list.collect()

[Row(rakamlar=1),
 Row(rakamlar=2),
 Row(rakamlar=3),
 Row(rakamlar=4),
 Row(rakamlar=5),
 Row(rakamlar=6),
 Row(rakamlar=4),
 Row(rakamlar=5)]

# `range` ile DataFrame yaratmak

## Yöntem-1

In [36]:
# Method 1: parallelize the range, turn each number into a one-element tuple,
# then convert the RDD to a DataFrame with a single column named "range".
df_from_range = (
    sc.parallelize(range(10, 100, 5))
    .map(lambda number: (number,))
    .toDF(["range"])
)

In [38]:
# Peek at the first three rows.
df_from_range.take(3)

[Row(range=10), Row(range=15), Row(range=20)]

## Yöntem-2

In [40]:
from pyspark.sql.types import IntegerType

# Method 2: build the DataFrame straight from the range, with no RDD step.
# With a plain IntegerType schema the single column is named "value".
df_from_range2 = spark.createDataFrame(range(10, 100, 5), schema=IntegerType())

In [41]:
# Peek at the first three rows; the column is called "value" this time.
df_from_range2.take(3)

[Row(value=10), Row(value=15), Row(value=20)]

# Dosyadan veri okuyarak DataFrame oluşturmak

In [48]:
# First attempt: read the CSV with all reader options left at their defaults.
df_from_file = spark.read.csv("D:\\Datasets\\OnlineRetail.csv")

In [49]:
# Inspect the result: the file is ';'-separated, so each line ends up in a
# single `_c0` column (and is cut at the first comma).
df_from_file.take(3)

[Row(_c0='InvoiceNo;StockCode;Description;Quantity;InvoiceDate;UnitPrice;CustomerID;Country'),
 Row(_c0='536365;85123A;WHITE HANGING HEART T-LIGHT HOLDER;6;1.12.2010 08:26;2'),
 Row(_c0='536365;71053;WHITE METAL LANTERN;6;1.12.2010 08:26;3')]

In [50]:
# Second attempt: tell the reader that fields are separated by ';'.
df_from_file = (
    spark.read
    .option("sep", ";")
    .csv("D:\\Datasets\\OnlineRetail.csv")
)

In [51]:
# Fields are now split into columns, but the header line is still read as data.
df_from_file.take(3)

[Row(_c0='InvoiceNo', _c1='StockCode', _c2='Description', _c3='Quantity', _c4='InvoiceDate', _c5='UnitPrice', _c6='CustomerID', _c7='Country'),
 Row(_c0='536365', _c1='85123A', _c2='WHITE HANGING HEART T-LIGHT HOLDER', _c3='6', _c4='1.12.2010 08:26', _c5='2,55', _c6='17850', _c7='United Kingdom'),
 Row(_c0='536365', _c1='71053', _c2='WHITE METAL LANTERN', _c3='6', _c4='1.12.2010 08:26', _c5='3,39', _c6='17850', _c7='United Kingdom')]

In [53]:
# Without header/inferSchema every column is a string named _c0 ... _c7.
df_from_file.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [56]:
# Final read: ';' separator, first line used as column names, and column
# types inferred from the data.
df_from_file = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("D:\\Datasets\\OnlineRetail.csv")
)

In [58]:
# Display the first three rows as a formatted table.
df_from_file.show(3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [59]:
# Check the inferred types: Quantity and CustomerID come out as integers, while
# UnitPrice stays a string (presumably because of its decimal comma, e.g. "2,55").
df_from_file.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [60]:
# Count the rows in the DataFrame.
df_from_file.count()

541909

In [61]:
# Print the physical plan for sorting by Quantity; note that the Exchange
# (shuffle) step uses 200 partitions at this point.
df_from_file.sort("Quantity").explain()

== Physical Plan ==
*(2) Sort [Quantity#86 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Quantity#86 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [InvoiceNo#83,StockCode#84,Description#85,Quantity#86,InvoiceDate#87,UnitPrice#88,CustomerID#89,Country#90] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/D:/Datasets/OnlineRetail.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<InvoiceNo:string,StockCode:string,Description:string,Quantity:int,InvoiceDate:string,UnitP...


In [63]:
# Change a setting at runtime: set the number of shuffle partitions to 5.
spark.conf.set("spark.sql.shuffle.partitions","5")

In [68]:
# Sort under the new setting, selecting only the two columns of interest,
# and print the resulting physical plan.
(
    df_from_file
    .select("Description", "Quantity")
    .sort("Quantity")
    .explain()
)

== Physical Plan ==
*(2) Sort [Quantity#86 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Quantity#86 ASC NULLS FIRST, 5)
   +- *(1) FileScan csv [Description#85,Quantity#86] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/D:/Datasets/OnlineRetail.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Description:string,Quantity:int>


In [69]:
# Change a setting at runtime: set the number of shuffle partitions to 5 ...
spark.conf.set("spark.sql.shuffle.partitions", "5")

# ... then sort under that setting and print the physical plan.
(
    df_from_file
    .select("Description", "Quantity")
    .sort("Quantity")
    .explain()
)

== Physical Plan ==
*(2) Sort [Quantity#86 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Quantity#86 ASC NULLS FIRST, 5)
   +- *(1) FileScan csv [Description#85,Quantity#86] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/D:/Datasets/OnlineRetail.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Description:string,Quantity:int>
