## Setting Environment Variables

In [20]:
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

## Create a DATAFRAME

In [21]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder \
            .appName('Transform in Pyspark') \
            .getOrCreate()

# Prepare Data
simpleData = (("Java",4000,5), \
    ("Python", 4600,10),  \
    ("Scala", 4100,15),   \
    ("Scala", 4500,15),   \
    ("PHP", 3000,20),  \
  )
columns= ["CourseName", "fee", "discount"]

# Create DataFrame
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()

root
 |-- CourseName: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- discount: long (nullable = true)



## Transform

### Custom Functions

In [10]:
#custom_function_1
from pyspark.sql.functions import upper
def to_upper_str_columns(df):
    return df.withColumn("Course_name", upper(df.CourseName))

#custom_function_2
def reduce_price(df, reduceBy):
    return df.withColumn("new_fee", df.fee - reduceBy)

#custom_function_3
def apply_discount(df):
    return df.withColumn("discount_fee", df.new_fee - (df.new_fee * df.discount)/100)

In [12]:
df2 = df.transform(to_upper_str_columns) \
        .transform(reduce_price,1000) \
        .transform(apply_discount)

In [13]:
df2.show()

+----------+----+--------+-----------+-------+------------+
|CourseName| fee|discount|Course_name|new_fee|discount_fee|
+----------+----+--------+-----------+-------+------------+
|      Java|4000|       5|       JAVA|   3000|      2850.0|
|    Python|4600|      10|     PYTHON|   3600|      3240.0|
|     Scala|4100|      15|      SCALA|   3100|      2635.0|
|     Scala|4500|      15|      SCALA|   3500|      2975.0|
|       PHP|3000|      20|        PHP|   2000|      1600.0|
+----------+----+--------+-----------+-------+------------+



In [17]:
def sel_col(df):
    return df.select(df2.columns)

df3 = df2.transform(sel_col)
df3.show()

+----------+----+--------+-----------+-------+------------+
|CourseName| fee|discount|Course_name|new_fee|discount_fee|
+----------+----+--------+-----------+-------+------------+
|      Java|4000|       5|       JAVA|   3000|      2850.0|
|    Python|4600|      10|     PYTHON|   3600|      3240.0|
|     Scala|4100|      15|      SCALA|   3100|      2635.0|
|     Scala|4500|      15|      SCALA|   3500|      2975.0|
|       PHP|3000|      20|        PHP|   2000|      1600.0|
+----------+----+--------+-----------+-------+------------+



### Array Type

In [18]:
# Create DataFrame with Array
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"]),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"]),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"])
]
df = spark.createDataFrame(data=data,schema=["Name","Languages1","Languages2"])
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Languages1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Languages2: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
# using transform() function
from pyspark.sql.functions import upper
from pyspark.sql.functions import transform
df.select(transform("Languages1", lambda x: upper(x)).alias("languages1")) \
  .show()

+------------------+
|        languages1|
+------------------+
|[JAVA, SCALA, C++]|
|[SPARK, JAVA, C++]|
|      [CSHARP, VB]|
+------------------+



## Apply()

In [22]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)
df

DataFrame[Seqno: string, Name: string]

### Applying Function using withColumn()

In [28]:
from pyspark.sql.functions import upper
df.withColumn("upper_name", upper(df.Name)).show()

+-----+------------+------------+
|Seqno|        Name|  upper_name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



### Applying with select

In [27]:
df.select("Seqno","name", upper(df.Name)).show()

+-----+------------+------------+
|Seqno|        name| upper(Name)|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



### With Spark SQL

In [32]:
df.createOrReplaceTempView("TAB")
df_sql = spark.sql(
    """
        SELECT Seqno, Name, UPPER(NAME) from TAB
    """
)
df_sql.show()

+-----+------------+------------+
|Seqno|        Name| upper(NAME)|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



### Create a custom function

In [33]:
def upper_case(str):
    return str.upper()

In [39]:
# Convert function to UDF
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
upperCaseUDF = udf(lambda x: upper_case(x), StringType())

In [40]:
# Custom UDF withColumn
df.withColumn("upper", upperCaseUDF(col("Name"))).show()

+-----+------------+------------+
|Seqno|        Name|       upper|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [42]:
# Custom UDF with Select
df.select("Seqno", "Name", upperCaseUDF(col("Name")).alias("NameUpper")).show()

+-----+------------+------------+
|Seqno|        Name|   NameUpper|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [44]:
# Custom function with spark sql
spark.udf.register("upperCaseUDF", upperCaseUDF)
df.createOrReplaceTempView("TABLE")
spark.sql(
    """
        SELECT Seqno, Name, upperCaseUDF(Name) as upperCase from TABLE
    """
).show()

+-----+------------+------------+
|Seqno|        Name|   upperCase|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



### Pyspark Pandas Apply

In [48]:
# !pip install pandas
# !pip install pyarrow

Collecting pyarrow
  Obtaining dependency information for pyarrow from https://files.pythonhosted.org/packages/db/1d/e8004776a69b5bad62b857367a9a2dff7c61d9606f341e549a174047349b/pyarrow-15.0.0-cp311-cp311-win_amd64.whl.metadata
  Downloading pyarrow-15.0.0-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Downloading pyarrow-15.0.0-cp311-cp311-win_amd64.whl (24.8 MB)
   ---------------------------------------- 0.0/24.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.8 MB 325.1 kB/s eta 0:01:17
   ---------------------------------------- 0.1/24.8 MB 558.5 kB/s eta 0:00:45
   ---------------------------------------- 0.2/24.8 MB 913.1 kB/s eta 0:00:27
   ---------------------------------------- 0.2/24.8 MB 1.1 MB/s eta 0:00:24
   ---------------------------------------- 0.2/24.8 MB 1.1 MB/s eta 0:00:24
   ---------------------------------------- 0.2/24.8 MB 1.1 MB/s eta 0:00:24
    ------------------------


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
# Import Libraries
import pyspark.pandas as pd
import numpy as np

technologies = ({
    'Fee' : [20000, 40000, 25000, 22000, np.NaN],
    'Discount' : [1000, 2500, 1500, 1200, 3000]
})

psdf = pd.DataFrame(technologies)
print(psdf)



       Fee  Discount
0  20000.0      1000
1  40000.0      2500
2  25000.0      1500
3  22000.0      1200
4      NaN      3000


In [51]:
def add(data):
    return data[0] + data[1]

In [52]:
addDF = psdf.apply(add, axis = 1)

  return data[0] + data[1]


In [53]:
print(addDF)

0    21000.0
1    42500.0
2    26500.0
3    23200.0
4        NaN
dtype: float64
