In [1]:
import pyspark 
from pyspark.sql import SparkSession
 
# Create a spark session
spark = SparkSession.builder.appName('DataFrame_operations').getOrCreate()
 
# Create data in dataframe
data = [(('Ram'), '1991-04-01', 'M', 3000),
       (('Mike'), '2000-05-19', 'M', 4000),
       (('Rohini'), '1978-09-05', 'M', 4000),
       (('Maria'), '1967-12-01', 'F', 4000),
       (('Jenis'), '1980-02-17', 'F', 1200)]
 
# Column names in dataframe
columns = ["Name", "DOB", "Gender", "salary"]
 
# Create the spark dataframe
df = spark.createDataFrame(data=data ,schema=columns)
 


In [None]:
df_columnRename = df.withColumnRenamed('DOB','DateOfBirth')
df_columnRename.printSchema()
df_columnRename.show()

root
 |-- Name: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+-----------+------+------+
|  Name|DateOfBirth|Gender|salary|
+------+-----------+------+------+
|   Ram| 1991-04-01|     M|  3000|
|  Mike| 2000-05-19|     M|  4000|
|Rohini| 1978-09-05|     M|  4000|
| Maria| 1967-12-01|     F|  4000|
| Jenis| 1980-02-17|     F|  1200|
+------+-----------+------+------+



In [None]:
df.withColumnRenamed('DOB','DateOfBirth').withColumnRenamed('salary',"Salary").show()

+------+-----------+------+------+
|  Name|DateOfBirth|Gender|Salary|
+------+-----------+------+------+
|   Ram| 1991-04-01|     M|  3000|
|  Mike| 2000-05-19|     M|  4000|
|Rohini| 1978-09-05|     M|  4000|
| Maria| 1967-12-01|     F|  4000|
| Jenis| 1980-02-17|     F|  1200|
+------+-----------+------+------+



In [None]:
from pyspark.sql.functions import col
data = df.select(col("Name"),col("DOB"),col("Gender"),col("salary").alias('Payment'))
data.show()

+------+----------+------+-------+
|  Name|       DOB|Gender|Payment|
+------+----------+------+-------+
|   Ram|1991-04-01|     M|   3000|
|  Mike|2000-05-19|     M|   4000|
|Rohini|1978-09-05|     M|   4000|
| Maria|1967-12-01|     F|   4000|
| Jenis|1980-02-17|     F|   1200|
+------+----------+------+-------+



In [None]:
df_selectExpr = df.selectExpr("Name as names","DOB",'Gender','salary')
df_selectExpr.show()

+------+----------+------+------+
| names|       DOB|Gender|salary|
+------+----------+------+------+
|   Ram|1991-04-01|     M|  3000|
|  Mike|2000-05-19|     M|  4000|
|Rohini|1978-09-05|     M|  4000|
| Maria|1967-12-01|     F|  4000|
| Jenis|1980-02-17|     F|  1200|
+------+----------+------+------+



In [None]:
datalist = ['First Name','DateOfBirth','M/F','Payment']
toDF = df.toDF(*datalist)
toDF.show()

+----------+-----------+---+-------+
|First Name|DateOfBirth|M/F|Payment|
+----------+-----------+---+-------+
|       Ram| 1991-04-01|  M|   3000|
|      Mike| 2000-05-19|  M|   4000|
|    Rohini| 1978-09-05|  M|   4000|
|     Maria| 1967-12-01|  F|   4000|
|     Jenis| 1980-02-17|  F|   1200|
+----------+-----------+---+-------+



In [None]:
import pyspark 
from pyspark.sql import SparkSession
 
# Create a spark session
spark = SparkSession.builder.appName('Groupby -- Aggregate').getOrCreate()
df_agg = spark.read.csv("/FileStore/tables/employee/Employees.csv")
df_agg.show()

+-----+-------+-------+
|  _c0|    _c1|    _c2|
+-----+-------+-------+
|EmpID|   Name| Salary|
| e101| Pramod|1200000|
| e120| Dinesh|2200000|
| e205|Sabesta|1500000|
| e331|  Harry|1700000|
| e421|Avinash|1300000|
| e231|    Joy|2300000|
| e222|  Smith|2100000|
| e339|   Khan|1800000|
| e150|  Dilip|1900000|
| e131|  Kiran| 800000|
+-----+-------+-------+



In [None]:
df.show()
df.groupBy("Gender").sum('salary').show()

+------+----------+------+------+
|  Name|       DOB|Gender|salary|
+------+----------+------+------+
|   Ram|1991-04-01|     M|  3000|
|  Mike|2000-05-19|     M|  4000|
|Rohini|1978-09-05|     M|  4000|
| Maria|1967-12-01|     F|  4000|
| Jenis|1980-02-17|     F|  1200|
+------+----------+------+------+

+------+-----------+
|Gender|sum(salary)|
+------+-----------+
|     M|      11000|
|     F|       5200|
+------+-----------+

