In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("DataSet")
    .getOrCreate()
)

In [5]:
# Relative path to the input CSV data. The `*` is a wildcard, so a single read
# picks up every file whose name starts with `users_data` and ends in `.csv`.
filepath = "../../data/users_data*.csv"

In [32]:
# Load the CSV file(s): the first line is treated as the header and Spark
# infers each column's type from the data.
dataframe = (
    spark.read.format("csv")
    .options(
        path=filepath,
        header=True,
        inferSchema=True,
        delimiter=",",
    )
    .load()
)
# Shorthand alternative using the csv() reader:
# dataframe = spark.read.options(header=True, inferSchema="True").csv(filepath)

In [64]:
# Preview the first two rows of the inferred-schema DataFrame.
dataframe.show(n=2)

+---+---------+----------+------+---------------+------------+------+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  India|     2023-11-17|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
only showing top 2 rows



In [65]:
# dataframe.printSchema()

# Create Schema for DataFrame

In [38]:
from pyspark.sql.types import StructType, StructField,StringType,IntegerType,DateType

In [40]:
# Explicit schema for the users CSV, one `add` call per column in file order.
# The third argument is `nullable`: True means the field may hold null values.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("user_name", StringType(), True)
    .add("mobile", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("user_created_at", DateType(), True)
    .add("addr", StringType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
    .add("address_created", DateType(), True)
)

In [41]:
# Read the same CSV file(s) again, this time applying the explicit schema
# instead of asking Spark to infer the column types.
dataframe2 = (
    spark.read.format("csv")
    .options(path=filepath, header=True, delimiter=",")
    .schema(schema)
    .load()
)

In [66]:
# dataframe2.printSchema()

In [67]:
# Preview the first two rows of the explicit-schema DataFrame.
dataframe2.show(n=2)

+---+---------+----------+------+---------------+------------+------+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  India|     2023-11-17|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
only showing top 2 rows



# Select DataFrame Columns

In [51]:
from pyspark.sql import functions

In [70]:
# select() accepts several interchangeable ways of referring to a column.
dataframe.select(
    "user_name",                       # plain column-name string
    dataframe.mobile,                  # attribute access on the DataFrame
    dataframe["salary"],               # item access on the DataFrame
    functions.col("user_created_at"),  # column expression built with functions.col
).show(2)
# Other selection forms:
# dataframe[['salary','mobile']].show(2)
# dataframe.select("*").show(2)
# dataframe.select(dataframe.columns[0:3]).show(2)

+---------+----------+------+---------------+
|user_name|    mobile|salary|user_created_at|
+---------+----------+------+---------------+
|   taukir|8010339935| 50000|     2023-11-17|
|   taukir|8010339935| 50000|     2023-11-17|
+---------+----------+------+---------------+
only showing top 2 rows



# withColumn

In [80]:
# Chain of withColumn calls; the result is only displayed, not assigned, so
# `dataframe` keeps its original columns afterwards.
(
    dataframe
    # add a copy of an existing column under a new name
    .withColumn("user_name2", functions.col("user_name"))
    # replace an existing column with a cast version of itself
    .withColumn("mobile", functions.col("mobile").cast("long"))
    # add a column derived by arithmetic on another column
    .withColumn("salary_inc", functions.col("salary") * 10)
    # overwrite an existing column with a constant literal
    .withColumn("country", functions.lit("INDIA"))
    .select("*")
    .show(3)
)

+---+---------+----------+------+---------------+------------+------+-------+---------------+----------+----------+
| id|user_name|    mobile|salary|user_created_at|        addr|  city|country|address_created|user_name2|salary_inc|
+---+---------+----------+------+---------------+------------+------+-------+---------------+----------+----------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C|  INDIA|     2023-11-17|    taukir|    500000|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  INDIA|     2023-11-17|    taukir|    500000|
|  2|     khan|8010223369| 40000|     2023-11-17|Khhan Market| Delhi|  INDIA|     2023-11-17|      khan|    400000|
+---+---------+----------+------+---------------+------------+------+-------+---------------+----------+----------+
only showing top 3 rows



# withColumnRenamed

In [89]:
# Chain two renames and preview the result.
# NOTE(review): `dataframe` has no `user_name2` column (the displayed header
# still shows `user_name` unchanged), and withColumnRenamed ignores names that
# are not in the schema, so the second rename has no effect here. Confirm
# which rename was intended.
(
    dataframe
    .withColumnRenamed("mobile", "cell_phone")
    .withColumnRenamed("user_name2", "user_name")
    .show(2)
)

+---+---------+----------+------+---------------+------------+------+-------+---------------+
| id|user_name|cell_phone|salary|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  India|     2023-11-17|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
only showing top 2 rows



In [91]:
# Select every column, then rename `mobile` to `cell_phone` on the result.
(
    dataframe
    .select("*")
    .withColumnRenamed("mobile", "cell_phone")
    .show(2)
)

+---+---------+----------+------+---------------+------------+------+-------+---------------+
| id|user_name|cell_phone|salary|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  India|     2023-11-17|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
only showing top 2 rows



In [93]:
# Rename while selecting: alias() gives the projected column a new name.
dataframe.select(
    "user_name",
    functions.col("mobile").alias("cell_phone"),
).show(2)

+---------+----------+
|user_name|cell_phone|
+---------+----------+
|   taukir|8010339935|
|   taukir|8010339935|
+---------+----------+
only showing top 2 rows



# DataFrame with Filter

In [118]:
# Alternative filter predicates that can be swapped in for the live one below:
# dataframe.filter(dataframe.country == 'India').show()
# dataframe.filter((dataframe.country == 'India') & (dataframe.salary > 35000)).show(3)
# dataframe.filter(dataframe.country.isin(['India','US'])).show(3)
# dataframe.filter(dataframe.country.startswith('Ca')).show(2)
# dataframe.filter(dataframe.country.endswith('ia')).show(2)
# dataframe.filter(dataframe.country.contains('na')).show(2)

# SQL LIKE pattern: keep rows whose country contains the substring "na".
dataframe.filter(
    dataframe.country.like("%na%")
).show(2)

+---+---------+----------+------+---------------+--------+----+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|    addr|city|country|address_created|
+---+---------+----------+------+---------------+--------+----+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|Richmond| B.C| Canada|     2023-11-17|
|  5|   zeenat|   1560489| 46000|     2023-11-17|Richmond| B.C| Canada|     2023-11-17|
+---+---------+----------+------+---------------+--------+----+-------+---------------+
only showing top 2 rows



In [112]:
# Keep rows whose country begins with "Ca".
dataframe.filter(
    dataframe.country.startswith("Ca")
).show(2)

+---+---------+----------+------+---------------+--------+----+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|    addr|city|country|address_created|
+---+---------+----------+------+---------------+--------+----+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|Richmond| B.C| Canada|     2023-11-17|
|  5|   zeenat|   1560489| 46000|     2023-11-17|Richmond| B.C| Canada|     2023-11-17|
+---+---------+----------+------+---------------+--------+----+-------+---------------+
only showing top 2 rows

