In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
import pandas as pd

In [4]:
# Start (or reuse) the Spark session for this notebook under the app name "mg".
spark = SparkSession.builder.appName("mg").getOrCreate()

# Read the CSV, treating the first row as the header and letting Spark infer
# each column's type from the data.
filepath = "Cleaned_DS_Jobs.csv"
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(filepath)
)

# Preview the leading rows, then list the column names and inferred types.
df.show()
df.printSchema()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services|       VA|         52|     0|    0|      0|
|      Data Scientist|       137-171 |   3.8|

In [9]:
# Per-column tally of entries that are null or NaN. `count` only counts rows
# where the `when(...)` produced a value, i.e. rows matching the condition.
null_or_nan_counts = [
    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
    for c in df.columns
]
df.select(null_or_nan_counts).show()

# Same tally restricted to nulls only; the one-row result is kept in
# `missing_values` and displayed.
null_only_counts = [
    count(when(col(c).isNull(), c)).alias(c)
    for c in df.columns
]
missing_values = df.select(null_only_counts)
missing_values.show()


+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|        0|              0|     0|       0|  27|               27|      71|    71|        0|          0|     0|    0|      0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+

In [11]:
# 1. Derive integer salary bounds from the "Salary Estimate" column, whose
#    values look like "min-max" (e.g. "137-171"): the piece before the dash
#    becomes min_salary and the piece after it becomes max_salary.
salary_parts = split(col("Salary Estimate"), "-")
df = (
    df
    .withColumn("min_salary", salary_parts.getItem(0).cast("int"))
    .withColumn("max_salary", salary_parts.getItem(1).cast("int"))
)

In [12]:
# Preview the frame with the new min_salary / max_salary columns
# (20 rows with truncated cell text are `show`'s defaults, spelled out here).
df.show(n=20, truncate=True)

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services| 

In [13]:
# 2. average_salary is the midpoint of the two salary bounds.
salary_total = col("min_salary") + col("max_salary")
df = df.withColumn("average_salary", salary_total / 2)

In [14]:
# 3. Replace ratings of -1 or 0 with 1; every other rating is kept as is.
#    NOTE: the column is loaded as "Rating", and after this step the displayed
#    header reads "rating" (the name passed to withColumn).
rating_col = col("rating")
is_minus_one_or_zero = (rating_col == -1) | (rating_col == 0)
df = df.withColumn("rating", when(is_minus_one_or_zero, 1).otherwise(rating_col))

In [15]:
# Preview the frame after adding average_salary and adjusting rating
# (20 rows with truncated cell text are `show`'s defaults, spelled out here).
df.show(n=20, truncate=True)

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [16]:
# Find the columns that contain at least one null using a single aggregation
# pass over the data, instead of launching one filter+count job per column.
# An aggregate without grouping always yields exactly one row, so `first()`
# returns the per-column null counts in the same order as `df.columns`.
null_count_row = df.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
).first()
null_columns = [
    col_name
    for col_name, n_nulls in zip(df.columns, null_count_row)
    if n_nulls > 0
]

# Replace every null in those columns with the sentinel -1; non-null values
# are kept unchanged.
# NOTE(review): the same integer sentinel is applied to every affected column
# regardless of its type; the schema printed later still lists the text
# columns as string, so there the sentinel is presumably stored as "-1".
for column in null_columns:
    df = df.withColumn(column, when(col(column).isNull(), -1).otherwise(col(column)))

In [17]:
# Re-run the missing-value report to confirm the fill step left no gaps.
# First: per-column count of entries that are null or NaN.
remaining_null_or_nan = [
    count(when(col(name).isNull() | isnan(col(name)), name)).alias(name)
    for name in df.columns
]
df.select(remaining_null_or_nan).show()

# Second: per-column count of nulls only, kept in `missing_values`.
remaining_nulls = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
missing_values = df.select(remaining_nulls)
missing_values.show()


+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Job Title|Salary Estimate|rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|        0|              0|     0|       0|   0|                0|       0|     0|        0|          0|     0|    0|      0|         0|         0|             0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Job Title|Salary Est

In [19]:
# Print the column names, types and nullability of the transformed frame to
# check the derived columns (min_salary, max_salary, average_salary).
df.printSchema()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)
 |-- min_salary: integer (nullable = true)
 |-- max_salary: integer (nullable = true)
 |-- average_salary: double (nullable = true)



In [21]:
# Collect the Spark DataFrame onto the driver as a pandas DataFrame.
pandas_df = df.toPandas()

# Write the processed data to its own file rather than back onto `filepath`.
# `filepath` is the input `df` was read from: overwriting it destroys the
# original data, makes a re-run of the notebook start from already-processed
# rows, and (because Spark evaluates lazily) can change what later actions on
# `df` read from disk.
output_stem = filepath[:-len(".csv")] if filepath.endswith(".csv") else filepath
output_path = output_stem + "_processed.csv"
pandas_df.to_csv(output_path, index=False)

In [24]:
# Count the missing values in each column of the pandas copy.
pandas_df.isna().sum()

Job Title                  0
Salary Estimate            0
rating                     0
Location                   0
Size                       0
Type of ownership          0
Industry                   0
Sector                     0
job_state                  0
company_age                0
python                     0
spark                      0
tableau                    0
min_salary                 0
max_salary                 0
average_salary             0
min_employees             96
max_employees            660
company_size_category      0
dtype: int64