In [1]:
import findspark

# Let findspark locate the Spark installation before pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session using 4 local threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("stringOPS")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Read the comma-separated file, using its first row as the header and
# letting Spark infer the column types.
df = (
    spark.read
    .options(header="True", inferSchema="True", sep=",")
    .csv("/home/taha/Downloads/simple_data.csv")
)

In [3]:
# Preview the first three rows of the loaded frame.
df.show(n=3)

+------+-----+---+--------+--------+-----------+
|sirano| isim|yas|  meslek|   sehir|aylik_gelir|
+------+-----+---+--------+--------+-----------+
|     1|Cemal| 35|    Isci|  Ankara|       3500|
|     2|Ceyda| 42|   Memur| Kayseri|       4200|
|     3|Timur| 30|Müzisyen|Istanbul|       9000|
+------+-----+---+--------+--------+-----------+
only showing top 3 rows



In [4]:
# We are going to use the SQL string operations, so import all of them.
from pyspark.sql.functions import *

# 1. Concat

In [7]:
# concat joins columns and/or string literals into a single string column;
# here: "<meslek>-<sehir>".
df_concat = df.withColumn(
    "meslek-sehir",
    concat(col("meslek"), lit("-"), col("sehir")),
)

df_concat.show(3, truncate=False)

+------+-----+---+--------+--------+-----------+-----------------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|meslek-sehir     |
+------+-----+---+--------+--------+-----------+-----------------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |Isci-Ankara      |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |Memur-Kayseri    |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |Müzisyen-Istanbul|
+------+-----+---+--------+--------+-----------+-----------------+
only showing top 3 rows



# 2. Number Format

In [10]:
# format_number renders a numeric column as a string with thousands
# separators and the requested number of decimal places (2 here).
df_number_format = df.withColumn(
    "aylik_gelir_F",
    format_number(col("aylik_gelir"), 2),
)

df_number_format.show(3, truncate=False)

+------+-----+---+--------+--------+-----------+-------------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|aylik_gelir_F|
+------+-----+---+--------+--------+-----------+-------------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |3,500.00     |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |4,200.00     |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |9,000.00     |
+------+-----+---+--------+--------+-----------+-------------+
only showing top 3 rows



# 3. lower, initcap, length

In [11]:
# Lowercasing, capitalizing the first letter of each word, and string length.
# Each function is demonstrated on its own frame derived from df.

# lower: lowercase version of "meslek".
df_lower = df.withColumn("meslek_lower", lower(col("meslek")))
df_lower.show(3, truncate=False)

print("*************************")

# initcap: "isim" with the first letter of each word capitalized.
df_initcap = df.withColumn("isim_initcap", initcap(col("isim")))
df_initcap.show(3, truncate=False)

print("*************************")

# length: number of characters in "sehir".
df_length = df.withColumn("sehir_length", length(col("sehir")))
df_length.show(3, truncate=False)


+------+-----+---+--------+--------+-----------+------------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|meslek_lower|
+------+-----+---+--------+--------+-----------+------------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |isci        |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |memur       |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |müzisyen    |
+------+-----+---+--------+--------+-----------+------------+
only showing top 3 rows

*************************
+------+-----+---+--------+--------+-----------+------------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|isim_initcap|
+------+-----+---+--------+--------+-----------+------------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |Cemal       |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |Ceyda       |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |Timur       |
+------+-----+---+--------+--------+-----------+------------+
only showing top 3 rows

*************************
+------+-----+---+--------+---

# 4. trim

In [12]:
# Whitespace removal: trailing (rtrim), leading (ltrim), and both sides (trim).

# rtrim: strip spaces from the right end of "sehir".
df_rtrim = df.withColumn("sehir_rtrim", rtrim(col("sehir")))
df_rtrim.show(3, truncate=False)

print("*************************")

# ltrim: strip spaces from the left end of "sehir".
df_ltrim = df.withColumn("sehir_ltrim", ltrim(col("sehir")))
df_ltrim.show(3, truncate=False)

print("*************************")

# trim: strip spaces from both ends of "sehir".
df_trim = df.withColumn("sehir_trim", trim(col("sehir")))
df_trim.show(3, truncate=False)


+------+-----+---+--------+--------+-----------+-----------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|sehir_rtrim|
+------+-----+---+--------+--------+-----------+-----------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |Ankara     |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |Kayseri    |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |Istanbul   |
+------+-----+---+--------+--------+-----------+-----------+
only showing top 3 rows

*************************
+------+-----+---+--------+--------+-----------+-----------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|sehir_ltrim|
+------+-----+---+--------+--------+-----------+-----------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |Ankara     |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |Kayseri    |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |Istanbul   |
+------+-----+---+--------+--------+-----------+-----------+
only showing top 3 rows

*************************
+------+-----+---+--------+--------+--------

# 5. replace, split

In [14]:
# regexp_replace substitutes matches of a pattern; split breaks a string
# into an array around matches of a pattern.

# Replace every "An" in "sehir" with "Man".
df_replace = df.withColumn(
    "sehir_replace",
    regexp_replace(col("sehir"), "An", "Man"),
)
df_replace.show(3, truncate=False)

print("*************************")

# Split "sehir" on the letter "a"; the result is an array-of-strings column.
df_split = df.withColumn("sehir_split", split(col("sehir"), "a"))
df_split.show(3, truncate=False)


+------+-----+---+--------+--------+-----------+-------------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|sehir_replace|
+------+-----+---+--------+--------+-----------+-------------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |Mankara      |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |Kayseri      |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |Istanbul     |
+------+-----+---+--------+--------+-----------+-------------+
only showing top 3 rows

*************************
+------+-----+---+--------+--------+-----------+-----------+
|sirano|isim |yas|meslek  |sehir   |aylik_gelir|sehir_split|
+------+-----+---+--------+--------+-----------+-----------+
|1     |Cemal|35 |Isci    |Ankara  |3500       |[Ank, r, ] |
|2     |Ceyda|42 |Memur   |Kayseri |4200       |[K, yseri] |
|3     |Timur|30 |Müzisyen|Istanbul|9000       |[Ist, nbul]|
+------+-----+---+--------+--------+-----------+-----------+
only showing top 3 rows



In [15]:
# Note: split treats its pattern as a regular expression, so to split on the pipe character "|" it must be escaped as "\\|".

In [16]:
# Print the schema of df_split to inspect the type of the new sehir_split column.
df_split.printSchema()

root
 |-- sirano: integer (nullable = true)
 |-- isim: string (nullable = true)
 |-- yas: integer (nullable = true)
 |-- meslek: string (nullable = true)
 |-- sehir: string (nullable = true)
 |-- aylik_gelir: integer (nullable = true)
 |-- sehir_split: array (nullable = true)
 |    |-- element: string (containsNull = true)

