In [1]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder. \
master("local[4]"). \
appName("Dataframe-Giriş"). \
config("spark.driver.memory","2g"). \
config("spark.executor.memory","4g"). \
getOrCreate()

In [3]:
sc = spark.sparkContext

# Listeden DF

In [5]:
from pyspark.sql import Row
list_rdd = sc.parallelize([1,2,3,4,5,6,5,4]). \
map(lambda x: Row(x))

In [6]:
df_from_list = list_rdd.toDF(["Value"])

In [7]:
df_from_list.show()

+-----+
|Value|
+-----+
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    5|
|    4|
+-----+



In [8]:
# Range
df_from_range = sc.parallelize(range(10,100,5)). \
map(lambda x: (x,)). \
toDF(["range"])

In [9]:
df_from_range.show(3)

+-----+
|range|
+-----+
|   10|
|   15|
|   20|
+-----+
only showing top 3 rows



# Dosyadan DF

In [10]:
df_from_file = spark.read.csv("OnlineRetail.csv")

In [12]:
df_from_file.show()

+--------------------+
|                 _c0|
+--------------------+
|InvoiceNo;StockCo...|
|536365;85123A;WHI...|
|536365;71053;WHIT...|
|536365;84406B;CRE...|
|536365;84029G;KNI...|
|536365;84029E;RED...|
|536365;22752;SET ...|
|536365;21730;GLAS...|
|536366;22633;HAND...|
|536366;22632;HAND...|
|536367;84879;ASSO...|
|536367;22745;POPP...|
|536367;22748;POPP...|
|536367;22749;FELT...|
|536367;22310;IVOR...|
|536367;84969;BOX ...|
|536367;22623;BOX ...|
|536367;22622;BOX ...|
|536367;21754;HOME...|
|536367;21755;LOVE...|
+--------------------+
only showing top 20 rows



In [13]:
df_from_file = spark.read \
.option("sep",";") \
.option("header","True") \
.option("inferSchema","True") \
.csv("OnlineRetail.csv")

df_from_file.show(3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [14]:
df_from_file.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [15]:
df_pd = df_from_file.limit(5).toPandas()

In [16]:
type(df_pd)

pandas.core.frame.DataFrame

In [17]:
df_pd.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1.12.2010 08:26,255,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1.12.2010 08:26,339,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1.12.2010 08:26,275,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1.12.2010 08:26,339,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1.12.2010 08:26,339,17850,United Kingdom


# READ CSV

In [20]:
retailDF = spark.read \
.option("header","True") \
.option("inferSchema","True") \
.option("sep",";") \
.csv("OnlineRetail.csv")

In [21]:
retailDF.limit(5).toPandas().head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1.12.2010 08:26,255,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1.12.2010 08:26,339,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1.12.2010 08:26,275,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1.12.2010 08:26,339,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1.12.2010 08:26,339,17850,United Kingdom


In [27]:
retailDF.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: string, CustomerID: int, Country: string]

In [28]:
retailDF.createOrReplaceTempView("tablo")

In [29]:
spark.sql("""
SELECT Country, SUM(UnitPrice) UnitPrice 
FROM tablo
GROUP BY Country
ORDER BY UnitPrice DESC
""").show(20)

+---------------+---------+
|        Country|UnitPrice|
+---------------+---------+
| United Kingdom|  94911.0|
|           EIRE|   9423.0|
|        Germany|   7930.0|
|         France|   6288.0|
|          Spain|   2927.0|
|        Finland|   1578.0|
|        Belgium|   1503.0|
|         Norway|   1451.0|
|    Switzerland|   1267.0|
|         Sweden|    921.0|
|         Cyprus|    890.0|
|    Netherlands|    874.0|
|          Italy|    823.0|
|       Portugal|    817.0|
|Channel Islands|    736.0|
|        Austria|    548.0|
|      Australia|    350.0|
|        Denmark|    246.0|
|          Malta|    210.0|
|         Poland|    200.0|
+---------------+---------+
only showing top 20 rows



# Dataframe String Functions


In [30]:
df = spark.read \
.option("header","True") \
.option("inferSchema","True") \
.option("sep",",") \
.csv("simple_dirty_data.csv")

In [31]:
df.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [32]:
from pyspark.sql.functions import *

# Concat

In [33]:
df.select("meslek","sehir") \
.withColumn("meslek_sehir", concat(col("meslek"),lit(" - "),col("sehir"))) \
.show(truncate=False)

+-----------+-----------+------------------------+
|meslek     |sehir      |meslek_sehir            |
+-----------+-----------+------------------------+
|Isci       |Ankara     |Isci - Ankara           |
|Memur      |Kayseri    |Memur - Kayseri         |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Pazarlamacı|    Ankara |Pazarlamacı -     Ankara|
|Pazarlamaci|Bursa      |Pazarlamaci - Bursa     |
|Memur      |Ankara     |Memur - Ankara          |
|Pazarlamaci|Istanbul   |Pazarlamaci - Istanbul  |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Doktor     |Ankara     |Doktor - Ankara         |
|Berber     | Istanbul  |Berber -  Istanbul      |
|Tuhafiyeci |null       |null                    |
|Tornacı    | Ankara    |Tornacı -  Ankara       |
|memur      |Çorum      |memur - Çorum           |
|Doktor     |İzmir      |Doktor - İzmir          |
|Müzisyen   | Ankara    |Müzisyen -  Ankara      |
+-----------+-----------+------------------------+



# Number Format

In [34]:
df.select("aylik_gelir") \
.withColumn("aylik_gelir_format", format_number(col("aylik_gelir"), 2)) \
.show()

+-----------+------------------+
|aylik_gelir|aylik_gelir_format|
+-----------+------------------+
|     3500.0|          3,500.00|
|     4200.0|          4,200.00|
|     9000.0|          9,000.00|
|     4200.0|          4,200.00|
|     4800.0|          4,800.00|
|     4250.0|          4,250.00|
|     7300.0|          7,300.00|
|    12000.0|         12,000.00|
|   180000.0|        180,000.00|
|    12000.0|         12,000.00|
|        4.8|              4.80|
|     4200.0|          4,200.00|
|     3750.0|          3,750.00|
|    14250.0|         14,250.00|
|     8700.0|          8,700.00|
+-----------+------------------+



# lower, initcap, length

In [35]:
df.select("meslek","isim","sehir") \
.withColumn("meslek_lower", lower(col("meslek"))) \
.withColumn("isim_initcap", initcap(col("isim"))) \
.withColumn("sehir_length", length(col("sehir"))) \
.show()

+-----------+---------+-----------+------------+------------+------------+
|     meslek|     isim|      sehir|meslek_lower|isim_initcap|sehir_length|
+-----------+---------+-----------+------------+------------+------------+
|       Isci|    Cemal|     Ankara|        isci|       Cemal|           6|
|      Memur|   ceyda |    Kayseri|       memur|      Ceyda |           7|
|   Müzüsyen|    Timur|Istanbul   |    müzüsyen|       Timur|          11|
|Pazarlamacı|   Burcu |     Ankara| pazarlamacı|      Burcu |          10|
|Pazarlamaci|  Yasemin|      Bursa| pazarlamaci|     Yasemin|           5|
|      Memur|      Ali|     Ankara|       memur|         Ali|           6|
|Pazarlamaci|    Dilek|   Istanbul| pazarlamaci|       Dilek|           8|
|   Müzüsyen|    Murat|   Istanbul|    müzüsyen|       Murat|           8|
|     Doktor|    Ahmet|     Ankara|      doktor|       Ahmet|           6|
|     Berber| Muhittin|   Istanbul|      berber|    Muhittin|           9|
| Tuhafiyeci| Hicaziye|  

# Trim

In [36]:
df.select("sehir") \
.withColumn("sehir_rtrim", rtrim(col("sehir"))) \
.withColumn("sehir_ltrim", ltrim(col("sehir"))) \
.withColumn("sehir_trim", trim(col("sehir"))) \
.show()

+-----------+-----------+-----------+----------+
|      sehir|sehir_rtrim|sehir_ltrim|sehir_trim|
+-----------+-----------+-----------+----------+
|     Ankara|     Ankara|     Ankara|    Ankara|
|    Kayseri|    Kayseri|    Kayseri|   Kayseri|
|Istanbul   |   Istanbul|Istanbul   |  Istanbul|
|     Ankara|     Ankara|     Ankara|    Ankara|
|      Bursa|      Bursa|      Bursa|     Bursa|
|     Ankara|     Ankara|     Ankara|    Ankara|
|   Istanbul|   Istanbul|   Istanbul|  Istanbul|
|   Istanbul|   Istanbul|   Istanbul|  Istanbul|
|     Ankara|     Ankara|     Ankara|    Ankara|
|   Istanbul|   Istanbul|   Istanbul|  Istanbul|
|       null|       null|       null|      null|
|    Ankara |     Ankara|    Ankara |    Ankara|
|     Çorum |      Çorum|     Çorum |     Çorum|
|      İzmir|      İzmir|      İzmir|     İzmir|
|     Ankara|     Ankara|     Ankara|    Ankara|
+-----------+-----------+-----------+----------+



# replace, split

In [37]:
df.select("sehir","mal_mulk") \
.withColumn("sehir_ist", regexp_replace(col("sehir"), "Ist", "İST")) \
.withColumn("mal_mulk_split", split(col("mal_mulk"), "\\|")) \
.withColumn("mal_mulk_ilk", col("mal_mulk_split")[0]) \
.show(truncate=False)

+-----------+----------------------+-----------+---------------------------+------------+
|sehir      |mal_mulk              |sehir_ist  |mal_mulk_split             |mal_mulk_ilk|
+-----------+----------------------+-----------+---------------------------+------------+
|Ankara     |araba                 |Ankara     |[araba]                    |araba       |
|Kayseri    |araba|ev              |Kayseri    |[araba, ev]                |araba       |
|Istanbul   |araba|ev|yazlık       |İSTanbul   |[araba, ev, yazlık]        |araba       |
|    Ankara |araba                 |    Ankara |[araba]                    |araba       |
|Bursa      |araba                 |Bursa      |[araba]                    |araba       |
|Ankara     |ev                    |Ankara     |[ev]                       |ev          |
|Istanbul   |araba|yazlık          |İSTanbul   |[araba, yazlık]            |araba       |
|Istanbul   |araba|ev|dükkan|yazlık|İSTanbul   |[araba, ev, dükkan, yazlık]|araba       |
|Ankara   

# ManualSchema

In [38]:
data = spark.read \
.option("header","True") \
.option("inferSchema","True") \
.option("sep",";") \
.csv("OnlineRetail.csv")

In [39]:
data.show()

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|1.12.2010 08:26|     7,65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|1.12.2010 08:

In [40]:
data.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [41]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType

In [43]:
manual_schema = StructType(
[
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", StringType(), True),
    StructField("UnitPrice", StringType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True)
]
)

In [44]:
data2 = spark.read \
.option("header","True") \
.schema(manual_schema) \
.option("sep",";") \
.csv("OnlineRetail.csv")

In [53]:
from pyspark import SQLContext
sqc = SQLContext(spark)



In [54]:
df.registerTempTable("data2")



In [56]:
sqc.sql("Select * from data2").show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33