In [4]:
import os
import sys

# Set JAVA_HOME and make both PySpark interpreter variables name the Python
# executable that is running this kernel.
# NOTE(review): the JAVA_HOME value looks like a relative path — confirm it
# resolves to a real JDK from the notebook's working directory.
os.environ.update(
    {
        "JAVA_HOME": "JDK 8/Contents/Home",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [5]:
# Start the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("Assessment").getOrCreate()

# Location of the attrition CSV. Set the HR_ATTRITION_CSV environment variable
# to run the notebook on another machine; the default keeps the original path.
path = os.environ.get(
    "HR_ATTRITION_CSV",
    "/Users/tomdursley/Downloads/HR-Employee-Attrition.csv",
)

# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
Emp_spark_df = spark.read.csv(path, header=True, inferSchema=True)

Emp_spark_df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/15 01:47:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/15 01:47:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalanc

In [6]:
# Remove four columns from the frame before any further processing.
columns_to_drop = ["EmployeeCount", "StandardHours", "Over18", "EmployeeNumber"]
Emp_spark_df = Emp_spark_df.drop(*columns_to_drop)

Emp_spark_df.show()

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+-------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurrentRole|YearsSinceLastPromotion|YearsWithCurrManager|
+---+---------+---

In [9]:
# Old column name -> new column name, applied in the order listed.
column_renames = {
    "DailyRate": "Daily_Rate",
    "MonthlyIncome": "Monthly_Income",
    "TotalWorkingYears": "Total_Working_Years",
    "YearsAtCompany": "Years_At_Company",
    "YearsWithCurrManager": "Years_With_Current_Manager",
}

for old_name, new_name in column_renames.items():
    Emp_spark_df = Emp_spark_df.withColumnRenamed(old_name, new_name)

Emp_spark_df.show()

+---+---------+-----------------+----------+--------------------+----------------+---------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+--------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-------------------+---------------------+---------------+----------------+------------------+-----------------------+--------------------------+
|Age|Attrition|   BusinessTravel|daily_rate|          Department|DistanceFromHome|Education|EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|monthly_income|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|total_working_years|TrainingTimesLastYear|WorkLifeBalance|years_at_company|YearsInCurrentRole|YearsSinceLastPromotion|years_with_current_man

In [10]:
from pyspark.sql.functions import col,isnan, when, count

In [11]:
# One aggregate per column: count the rows in which that column is null.
# Each result keeps the name of the column it describes.
null_counts = Emp_spark_df.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in Emp_spark_df.columns
    ]
)
null_counts.show()

+---+---------+--------------+----------+----------+----------------+---------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+--------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-------------------+---------------------+---------------+----------------+------------------+-----------------------+--------------------------+
|Age|Attrition|BusinessTravel|daily_rate|Department|DistanceFromHome|Education|EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|monthly_income|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|total_working_years|TrainingTimesLastYear|WorkLifeBalance|years_at_company|YearsInCurrentRole|YearsSinceLastPromotion|years_with_current_manager|
+---+---------+--------------+----------+-----

In [14]:
# Keep a single copy of any rows that are identical across every column.
Emp_spark_df = Emp_spark_df.drop_duplicates()

Emp_spark_df.show()

+---+---------+-----------------+----------+--------------------+----------------+---------+----------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+--------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-------------------+---------------------+---------------+----------------+------------------+-----------------------+--------------------------+
|Age|Attrition|   BusinessTravel|daily_rate|          Department|DistanceFromHome|Education|  EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|monthly_income|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|total_working_years|TrainingTimesLastYear|WorkLifeBalance|years_at_company|YearsInCurrentRole|YearsSinceLastPromotion|years_with_current

In [19]:
# Convert the cleaned Spark DataFrame into a pandas DataFrame so it can be
# written out with pandas in the next cell.
pandas_df = Emp_spark_df.toPandas()

In [20]:
# Write the pandas copy to a CSV file in the current working directory.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra first column — confirm that is wanted.
csv_output_path = "employee_data_csv.csv"
pandas_df.to_csv(csv_output_path)

In [30]:

# Write the cleaned employee data to the employee_data table over JDBC.
from getpass import getpass

db_url = "jdbc:postgresql://localhost:5432/hr_db"
db_properties = {
    "user": "postgres",
    # Credentials must not live in the notebook: take the password from the
    # HR_DB_PASSWORD environment variable, or prompt for it when it is unset.
    # NOTE(review): the password that was previously hard-coded here should be
    # rotated, since it has been stored in the notebook.
    "password": os.environ.get("HR_DB_PASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver",
}

# mode="overwrite" replaces the existing contents of employee_data on each run.
Emp_spark_df.write.jdbc(
    url=db_url,
    table="employee_data",
    mode="overwrite",
    properties=db_properties,
)


                                                                                

In [31]:
# Display the DataFrame again after the database write.
Emp_spark_df.show()

+---+---------+-----------------+----------+--------------------+----------------+---------+----------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+--------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-------------------+---------------------+---------------+----------------+------------------+-----------------------+--------------------------+
|Age|Attrition|   BusinessTravel|daily_rate|          Department|DistanceFromHome|Education|  EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|monthly_income|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|total_working_years|TrainingTimesLastYear|WorkLifeBalance|years_at_company|YearsInCurrentRole|YearsSinceLastPromotion|years_with_current

In [32]:
# Register the DataFrame as the temp view that the SQL cells query. No other
# visible cell creates employee_view, so without this the queries fail on a
# fresh kernel (Restart & Run All).
Emp_spark_df.createOrReplaceTempView("employee_view")

# Average of the Age column across all rows.
spark.sql("SELECT AVG(Age) AS avg_age FROM employee_view").show()


+------------------+
|           avg_age|
+------------------+
|36.923809523809524|
+------------------+



In [41]:
# Number of rows per Department, largest group first.
department_counts_sql = (
    "SELECT Department, COUNT(*) AS count "
    "    FROM employee_view "
    "    GROUP BY Department "
    "    ORDER BY count DESC "
)
spark.sql(department_counts_sql).show()


+--------------------+-----+
|          Department|count|
+--------------------+-----+
|Research & Develo...|  961|
|               Sales|  446|
|     Human Resources|   63|
+--------------------+-----+



In [37]:
# Approximate median (50th percentile) of DistanceFromHome.
median_distance_sql = (
    "SELECT PERCENTILE_APPROX(DistanceFromHome, 0.5) AS median_distance "
    "    FROM employee_view"
)
spark.sql(median_distance_sql).show()


+---------------+
|median_distance|
+---------------+
|              7|
+---------------+



In [38]:
# Put the unit into the distance column's name.
# NOTE(review): a name containing parentheses presumably needs backtick-quoting
# in later Spark SQL — confirm before querying this column.
distance_column = "DistanceFromHome"
distance_column_with_unit = "DistanceFromHome_(km)"
Emp_spark_df = Emp_spark_df.withColumnRenamed(distance_column, distance_column_with_unit)

Emp_spark_df.show()

+---+---------+-----------------+----------+--------------------+---------------------+---------+----------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+--------------+-----------+------------------+--------+-----------------+-----------------+------------------------+----------------+-------------------+---------------------+---------------+----------------+------------------+-----------------------+--------------------------+
|Age|Attrition|   BusinessTravel|daily_rate|          Department|DistanceFromHome_(km)|Education|  EducationField|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|monthly_income|MonthlyRate|NumCompaniesWorked|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StockOptionLevel|total_working_years|TrainingTimesLastYear|WorkLifeBalance|years_at_company|YearsInCurrentRole|YearsSinceLastPromotion|years_wi

In [40]:
# Number of rows per Education value, largest group first.
education_counts_sql = (
    "SELECT Education, COUNT(*) AS count "
    "    FROM employee_view "
    "    GROUP BY Education "
    "    ORDER BY count DESC"
)
spark.sql(education_counts_sql).show()


+---------+-----+
|Education|count|
+---------+-----+
|        3|  572|
|        4|  398|
|        2|  282|
|        1|  170|
|        5|   48|
+---------+-----+

