In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd


In [4]:
# Start (or reuse) the Spark session, then load the cleaned jobs CSV with
# column names taken from the header row and column types inferred from the data.
filepath = "Cleaned_DS_Jobs.csv"
spark = SparkSession.builder.appName("q3").getOrCreate()
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv(filepath)
)
df.printSchema()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)



In [5]:
# Count, per column, the rows whose value is null or NaN.
def _missing_count(name):
    """Build the aggregate that counts rows where column `name` is null or NaN.

    `when` without `otherwise` yields null for rows that do not match, and
    `count` skips nulls, so only the missing entries are counted.
    """
    return count(when(isnull(name) | isnan(name), name)).alias(name)


df.select([_missing_count(name) for name in df.columns]).show()

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|        0|              0|     0|       0|  27|               27|      71|    71|        0|          0|     0|    0|      0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+



In [6]:
#1 Derive integer minsalary / maxsalary columns from the "Salary Estimate" range.
# The estimate is a "<min>-<max>" string (e.g. "137-171 "), so splitting on "-"
# puts the lower bound at index 0 and the upper bound at index 1.
salary_bounds = split(col("Salary Estimate"), "-")
df = (
    df.withColumn("minsalary", salary_bounds.getItem(0).cast("int"))
      .withColumn("maxsalary", salary_bounds.getItem(1).cast("int"))
)
df.printSchema()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)
 |-- minsalary: integer (nullable = true)
 |-- maxsalary: integer (nullable = true)



In [21]:
#2 Average salary: the midpoint between the minimum and maximum estimates.
salary_total = col("minsalary") + col("maxsalary")
df = df.withColumn("average_salary", salary_total / 2)

In [19]:
#3 Replace Rating values of -1 or 0 with 1; every other rating is kept as is.
needs_default_rating = col("Rating").isin(-1, 0)
df = df.withColumn(
    "Rating",
    when(needs_default_rating, 1).otherwise(col("Rating")),
)

In [20]:
#4 Replace every null in every column with -1.
# All replacements are built up front and applied in one select: each
# withColumn call adds another projection to the query plan, so calling it
# once per column in a loop makes the plan grow with the number of columns.
df = df.select([
    when(col(name).isNull(), -1).otherwise(col(name)).alias(name)
    for name in df.columns
])

In [24]:
#5 Mean of average_salary per job title, grouping on the title with
# surrounding whitespace trimmed.
job_title = trim("Job Title")
ans = (
    df.groupBy(job_title)
      .agg(avg("average_salary").alias("avg"))
)
ans.show()

+--------------------+-----------------+
|     trim(Job Title)|              avg|
+--------------------+-----------------+
|Senior Data Scien...|99.33333333333333|
|Clinical Data Ana...|            164.5|
|Senior Business I...|             90.0|
|Data Analyst/Engi...|            115.5|
|Staff BI and Data...|            107.0|
|Intelligence Data...|            90.75|
|Report Writer-Dat...|             92.5|
|Hydrogen/Tritium ...|            148.0|
|Business Intellig...|           109.25|
|        Data Modeler|            154.0|
|Scientist / Group...|            197.5|
|Senior Research S...|            105.0|
|Software Engineer...|            164.5|
|   Sr Data Scientist|           126.75|
|COMPUTER SCIENTIS...|            271.5|
|Data Scientist/Ma...|            125.5|
|Data Scientist - ...|            120.5|
|  Decision Scientist|             94.5|
|Data Scientist - ...|            97.75|
|Data Scientist / ...|            128.5|
+--------------------+-----------------+
only showing top

In [25]:
#6 Replace the "Size" column with a randomly assigned size label.
import random
from pyspark.sql.types import StringType

values = ["Small", "Medium", "Large"]


def assign():
    """Return one of the labels in `values`, picked at random."""
    return random.choice(values)


# Spark treats UDFs as deterministic by default, which allows the optimizer to
# drop duplicate invocations or call the function more often than it appears
# in the query. A UDF that returns random values has to be flagged as
# non-deterministic so Spark does not make that assumption.
my_udf = udf(assign, StringType()).asNondeterministic()
df = df.withColumn("SizeValue", my_udf()).drop("Size")

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+---------+---------+--------------+---------+
|           Job Title|Salary Estimate|Rating|         Location|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|minsalary|maxsalary|average_salary|SizeValue|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+---------+---------+--------------+---------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|      137|      171|         154.0|   Medium|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|    Company - Public|Research & Develo...|   Business Services|       VA|    

In [27]:
# Mean of average_salary for each SizeValue label.
ans = df.groupBy("SizeValue").agg(avg("average_salary"))
ans.show()

+---------+-------------------+
|SizeValue|avg(average_salary)|
+---------+-------------------+
|   Medium| 122.22535211267606|
|    Small| 123.41463414634147|
|    Large|  125.5103305785124|
+---------+-------------------+

