In [30]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as sql

# Get (or create) the Spark session used by every cell below.
# Data from: https://www.kaggle.com/datasets/mohithsairamreddy/salary-data
spark = (
    SparkSession.builder
    .appName("Test Pyspark Session")
    .getOrCreate()
)

In [35]:
# Load the salary CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): absolute path — looks like a Colab upload location; confirm
# before running elsewhere.
df = spark.read.csv("/content/Salary_Data.csv", header=True, inferSchema=True)

# Rename the source headers to UPPER_SNAKE_CASE. Three of them contain spaces,
# which would otherwise need back-tick quoting in the SQL cells below.
df = (
    df.withColumnRenamed("Age", "AGE")
    .withColumnRenamed("Gender", "GENDER")
    .withColumnRenamed("Education Level", "EDUCATION_LEVEL")
    .withColumnRenamed("Job Title", "JOB_TITLE")
    .withColumnRenamed("Years of Experience", "YEARS_OF_EXPERIENCE")
    .withColumnRenamed("Salary", "SALARY")
)

# Register the frame as a temp view so it can be queried with spark.sql().
df.createOrReplaceTempView("salaries")

# Show schema and sample.
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that emitted a stray "None" line).
df.printSchema()
df.show(5)

root
 |-- AGE: integer (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- EDUCATION_LEVEL: string (nullable = true)
 |-- JOB_TITLE: string (nullable = true)
 |-- YEARS_OF_EXPERIENCE: double (nullable = true)
 |-- SALARY: integer (nullable = true)

None
+---+------+---------------+-----------------+-------------------+------+
|AGE|GENDER|EDUCATION_LEVEL|        JOB_TITLE|YEARS_OF_EXPERIENCE|SALARY|
+---+------+---------------+-----------------+-------------------+------+
| 32|  Male|     Bachelor's|Software Engineer|                5.0| 90000|
| 28|Female|       Master's|     Data Analyst|                3.0| 65000|
| 45|  Male|            PhD|   Senior Manager|               15.0|150000|
| 36|Female|     Bachelor's|  Sales Associate|                7.0| 60000|
| 52|  Male|       Master's|         Director|               20.0|200000|
+---+------+---------------+-----------------+-------------------+------+
only showing top 5 rows



In [36]:
sql_str = '''
SELECT
  *
FROM salaries
WHERE
  GENDER = 'Male'
  AND YEARS_OF_EXPERIENCE > 1
ORDER BY SALARY DESC
'''

In [37]:
# Run the query held in sql_str and print the first 10 rows of the result.
(
    spark.sql(sql_str)
    .show(n=10)
)

+---+------+-----------------+--------------------+-------------------+------+
|AGE|GENDER|  EDUCATION_LEVEL|           JOB_TITLE|YEARS_OF_EXPERIENCE|SALARY|
+---+------+-----------------+--------------------+-------------------+------+
| 50|  Male|       Bachelor's|                 CEO|               25.0|250000|
| 52|  Male|              PhD|Chief Technology ...|               24.0|250000|
| 45|  Male|Bachelor's Degree|   Financial Manager|               21.0|250000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist|               24.0|240000|
| 51|  Male|              PhD|      Data Scientist| 