In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [4]:
import pandas as pd

In [5]:
spark = SparkSession.builder.appName("m1").getOrCreate()
filepath = "Cleaned_DS_Jobs.csv"
df = spark.read.csv(filepath, header=True,inferSchema=True)
df.show()
df.printSchema()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services|       VA|         52|     0|    0|      0|
|      Data Scientist|       137-171 |   3.8|

In [6]:
df.select([count(when(isnull(c) | isnan(c),c)).alias(c) for c in df.columns]).show()
missing_values = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])
missing_values.show()


+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|        0|              0|     0|       0|  27|               27|      71|    71|        0|          0|     0|    0|      0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+

In [7]:
# 1. Split the salary estimate column into two separate fields (min_salary, max_salary)
# Assuming the salary column is named 'salary_estimate' and looks like "min-max"
df = df.withColumn("min_salary", split(col("Salary Estimate"), "-")[0].cast("int"))
df = df.withColumn("max_salary", split(col("Salary Estimate"), "-")[1].cast("int"))

In [8]:
# df.show()

In [9]:
#2
df = df.withColumn("average_salary", (col("min_salary") + col("max_salary")) / 2)

In [10]:
#3
df = df.withColumn("rating", when((col("rating") == -1) | (col("rating") == 0), 1).otherwise(col("rating")))

In [11]:
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [12]:
#4th
for column in df.columns:
    df = df.withColumn(column, when(col(column).isNull(), -1).otherwise(col(column)))

In [13]:
df.select([count(when(isnull(c) | isnan(c),c)).alias(c) for c in df.columns]).show()



+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Job Title|Salary Estimate|rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|        0|              0|     0|       0|   0|                0|       0|     0|        0|          0|     0|    0|      0|         0|         0|             0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+



In [14]:
df.printSchema()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)
 |-- min_salary: integer (nullable = true)
 |-- max_salary: integer (nullable = true)
 |-- average_salary: double (nullable = true)



In [15]:
# #5th
df1=df.groupBy('Job Title').agg(avg('average_salary'))
df1.show()

+--------------------+-------------------+
|           Job Title|avg(average_salary)|
+--------------------+-------------------+
|Senior Data Scien...|  99.33333333333333|
|Clinical Data Ana...|              164.5|
|Senior Business I...|               90.0|
|Data Analyst/Engi...|              115.5|
|Staff BI and Data...|              107.0|
|Intelligence Data...|              90.75|
|Report Writer-Dat...|               92.5|
|Hydrogen/Tritium ...|              148.0|
|Business Intellig...|             109.25|
|        Data Modeler|              154.0|
|Scientist / Group...|              197.5|
|Senior Research S...|              105.0|
|Software Engineer...|              164.5|
|   Sr Data Scientist|             126.75|
|COMPUTER SCIENTIS...|              271.5|
|Data Scientist/Ma...|              125.5|
|Data Scientist - ...|              120.5|
|  Decision Scientist|               94.5|
|Data Scientist - ...|              97.75|
|Data Scientist / ...|              128.5|
+----------

In [16]:
#6th
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import random
values = ["Small", "Medium", "Large"]

def assign():
    return random.choice(values)

my_udf = udf(assign, StringType())
df = df.withColumn("SizeValue", my_udf())
df = df.drop("Size")

df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+---------+
|           Job Title|Salary Estimate|rating|         Location|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|SizeValue|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+---------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|    Large|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|    Company - Public|Research & Develo...|   Business Services|      

In [17]:
df1=df.groupBy('SizeValue').agg(avg('average_salary'))
df1.show()

+---------+-------------------+
|SizeValue|avg(average_salary)|
+---------+-------------------+
|   Medium| 122.40375586854461|
|    Small| 124.71615720524018|
|    Large| 124.19954128440367|
+---------+-------------------+



In [18]:
pandas_df = df.toPandas()
pandas_df.to_csv(filepath, index=False)

In [19]:
# pandas_df.isnull().sum()

In [20]:
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+------+---------+-----------+------+-----+-------+----------+----------+--------------+---------+
|           Job Title|Salary Estimate|rating|         Location|   Type of ownership|            Industry|Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|SizeValue|
+--------------------+---------------+------+-----------------+--------------------+--------------------+------+---------+-----------+------+-----+-------+----------+----------+--------------+---------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|  Insurance Carriers|           Insurance|    NY|       27|          0|     0|    0|    137|       137|       171|         154.0|    Small|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|Research & Develo...|   Business Services|    VA|       52|          0|     0|    0|    137|       137|       171|         15