In [9]:
import os
import sys
from pyspark.sql import SparkSession

In [10]:
# Start (or reuse) a local Spark session restricted to a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("spark-app-version-x")
    .getOrCreate()
)

In [11]:
# Location of the trip-data parquet file. Set the YELLOW_TRIPDATA_PATH
# environment variable to point at your own copy; the original
# machine-specific path is kept only as the fallback, so existing runs
# behave exactly as before. An empty variable also falls back to it.
local_file = (
    os.environ.get('YELLOW_TRIPDATA_PATH')
    or r'C:\Users\Yunis\Desktop\yellow_tripdata_2024-01.parquet'
)
df = spark.read.parquet(local_file)
# Print column names and types to confirm how the file was parsed.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [12]:
# Preview ten trips whose total_amount is greater than 1, showing three columns only.
(
    df
    .select('VendorID', 'tpep_pickup_datetime', 'total_amount')
    .where('total_amount>1')
    .show(n=10)
)

+--------+--------------------+------------+
|VendorID|tpep_pickup_datetime|total_amount|
+--------+--------------------+------------+
|       2| 2024-01-01 00:57:55|        22.7|
|       1| 2024-01-01 00:03:00|       18.75|
|       1| 2024-01-01 00:17:06|        31.3|
|       1| 2024-01-01 00:36:38|        17.0|
|       1| 2024-01-01 00:46:51|        16.1|
|       1| 2024-01-01 00:54:08|        41.5|
|       2| 2024-01-01 00:49:44|       64.95|
|       1| 2024-01-01 00:30:40|        30.4|
|       2| 2024-01-01 00:26:01|        36.0|
|       2| 2024-01-01 00:28:08|         8.0|
+--------+--------------------+------------+
only showing top 10 rows



In [13]:
# Register the trips DataFrame as the temporary view 'yun' so it can be queried through spark.sql.
df.createOrReplaceTempView('yun')

In [14]:
# Same kind of filter as the DataFrame API above, this time expressed in SQL against the 'yun' view.
(
    spark
    .sql("select * from yun where total_amount>3")
    .show(n=5)
)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|           2|       17.7|  1.0|    0.5|       0.

In [15]:
# Stop the Spark session started earlier in this notebook.
spark.stop()

In [1]:
import logging
from pyspark.sql import SparkSession

In [2]:
def rdd_to_dataframe(data, schema, spark=None):
    """
    Build a Spark DataFrame from local Python data by way of an RDD.

    Args:
        data: Local collection of rows (e.g. a list of tuples) to distribute.
        schema: Schema passed straight to ``createDataFrame`` (the examples in
            this notebook pass a list of column names).
        spark: Optional session to use. When omitted, the session is fetched
            (or started) with ``SparkSession.builder.getOrCreate()``.

    Returns:
        The DataFrame created from ``data``. The session is left running so
        the returned DataFrame stays usable.

    Raises:
        Exception: Whatever ``parallelize`` / ``createDataFrame`` raised. The
            error is logged with its traceback and then re-raised.
    """
    if spark is None:
        spark = SparkSession.builder.appName("RDDToDataFrame").getOrCreate()

    try:
        # RDDs are created through the SparkContext, not the SparkSession.
        rdd = spark.sparkContext.parallelize(data)
        return spark.createDataFrame(rdd, schema)
    except Exception as e:
        # Log with traceback and propagate. Returning None silently would only
        # move the failure to the caller's first use of the result, and the
        # session is deliberately NOT stopped here: getOrCreate() may have
        # handed back a session that other DataFrames still depend on.
        logging.exception('Error while transforming RDD to DF: %s', e)
        raise


In [3]:
# Sample department rows; each tuple follows the column order in dept_schema.
dept_schema = ["department_id", "department_name"]
dept_data = [
    (1, "Big Data"),
    (2, "Finance"),
    (3, "Marketing"),
]

In [4]:
# Sample employee rows; each tuple follows the column order in emp_schema.
emp_schema = ["department_id", "employee_name", "age"]
emp_data = [
    (1, "Carlos", 17),
    (1, "Bob", 30),
    (2, "Jasmin", 26),
]

In [5]:
# Turn both sample tables into DataFrames (employees first, then departments).
df_emp = rdd_to_dataframe(data=emp_data, schema=emp_schema)
df_dept = rdd_to_dataframe(data=dept_data, schema=dept_schema)

In [6]:
# Display the departments DataFrame to confirm it was built from dept_data.
df_dept.show()

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            1|       Big Data|
|            2|        Finance|
|            3|      Marketing|
+-------------+---------------+



In [7]:
# Bind a notebook-level `spark` handle for the SQL cells below, using the same
# app name as rdd_to_dataframe.
spark = (
    SparkSession.builder
    .appName("RDDToDataFrame")
    .getOrCreate()
)

In [8]:
# Expose each DataFrame to Spark SQL under its temporary view name.
for frame, view_name in ((df_emp, 'employees'), (df_dept, 'departments')):
    frame.createOrReplaceTempView(view_name)

In [9]:
# Employees aged 18 or over joined to their department. Both sides' columns are
# selected, so department_id appears twice in the result.
(
    spark
    .sql('''select emp.*,dp.* from employees as emp join departments as dp on(emp.department_id=dp.department_id) where age>=18''')
    .show()
)

+-------------+-------------+---+-------------+---------------+
|department_id|employee_name|age|department_id|department_name|
+-------------+-------------+---+-------------+---------------+
|            1|          Bob| 30|            1|       Big Data|
|            2|       Jasmin| 26|            2|        Finance|
+-------------+-------------+---+-------------+---------------+



In [10]:
# Store the employee/department join (no age filter) as the temporary view 'dp_employees'.
(
    spark
    .sql('''select emp.employee_name,emp.age,emp.department_id,dp.department_name from employees as emp join departments as dp on (emp.department_id=dp.department_id)''')
    .createOrReplaceTempView('dp_employees')
)

In [11]:
# Show every row of the joined 'dp_employees' view.
(
    spark
    .sql('''select * from dp_employees''')
    .show()
)

+-------------+---+-------------+---------------+
|employee_name|age|department_id|department_name|
+-------------+---+-------------+---------------+
|       Carlos| 17|            1|       Big Data|
|          Bob| 30|            1|       Big Data|
|       Jasmin| 26|            2|        Finance|
+-------------+---+-------------+---------------+

