In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
spark = SparkSession.builder.appName("Question3").getOrCreate()

In [5]:
df = spark.read.csv("Cleaned_DS_Jobs.csv", header=True)

In [6]:
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services|       VA|         52|     0|    0|      0|
|      Data Scientist|       137-171 |   3.8|

In [7]:
df = df.withColumn(
"min_salary", split(col("Salary Estimate"), "-").getItem(0).cast("int") # Extract min
).withColumn(
"max_salary", split(col("Salary Estimate"), "-").getItem(1).cast("int") # Extract max
)
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services| 

In [8]:
# Assuming your DataFrame is named 'df_split_salary'
df = df.withColumn(
"average_salary", (col("min_salary") + col("max_salary")) / 2
)

In [9]:
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [10]:
# Replace -1 and 0 in the Rating column with 1
df = df.withColumn(
"Rating",
when((col("Rating") == -1) | (col("Rating") == 0), 1).otherwise(col("Rating"))
)

In [11]:
df_transformed = df.fillna(-1)

In [16]:
df_grouped = df.groupBy("Job Title").agg(
avg("average_salary")
)
# Show the result
df_grouped.show()

+--------------------+-------------------+
|           Job Title|avg(average_salary)|
+--------------------+-------------------+
|Senior Data Scien...|  99.33333333333333|
|Clinical Data Ana...|              164.5|
|Senior Business I...|               90.0|
|Data Analyst/Engi...|              115.5|
|Staff BI and Data...|              107.0|
|Intelligence Data...|              90.75|
|Report Writer-Dat...|               92.5|
|Hydrogen/Tritium ...|              148.0|
|Business Intellig...|             109.25|
|        Data Modeler|              154.0|
|Scientist / Group...|              197.5|
|Senior Research S...|              105.0|
|Software Engineer...|              164.5|
|   Sr Data Scientist|             126.75|
|COMPUTER SCIENTIS...|              271.5|
|Data Scientist/Ma...|              125.5|
|Data Scientist - ...|              120.5|
|  Decision Scientist|               94.5|
|Data Scientist - ...|              97.75|
|Data Scientist / ...|              128.5|
+----------

In [18]:
df_avg_salary_by_size = df_transformed.groupBy("Size").agg(
avg("average_salary")
)
# Show the result
df_avg_salary_by_size.show()

+--------------------+-------------------+
|                Size|avg(average_salary)|
+--------------------+-------------------+
|5001 to 10000 emp...|  126.6639344262295|
|                NULL|  130.7962962962963|
|             Unknown| 143.38235294117646|
| 51 to 200 employees|       127.83203125|
|1001 to 5000 empl...|  121.7548076923077|
|501 to 1000 emplo...| 120.93506493506493|
|201 to 500 employees|  119.1987951807229|
|    10000+ employees| 122.80379746835443|
|   1 to 50 employees|              120.0|
+--------------------+-------------------+

