<a href="https://colab.research.google.com/github/vaasu14/pyspark/blob/main/LR_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.2//spark-3.1.2-bin-hadoop3.2.tgz  

In [3]:
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

In [4]:
cd /content/spark-3.1.2-bin-hadoop3.2

/content/spark-3.1.2-bin-hadoop3.2


In [5]:
!pip install -q findspark

In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

In [7]:
import findspark
findspark.init()

In [8]:
findspark.find()

'/content/spark-3.1.2-bin-hadoop3.2'

In [9]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark').getOrCreate()

In [10]:
cd /content/spark-3.1.2-bin-hadoop3.2

/content/spark-3.1.2-bin-hadoop3.2


In [11]:
from google.colab import files
df=files.upload()

Saving HR_comma_sep.csv to HR_comma_sep.csv


In [12]:
data=spark.read.csv('/content/spark-3.1.2-bin-hadoop3.2/HR_comma_sep.csv',inferSchema=True,header=True)

In [14]:
from pyspark.sql.types import FloatType
data = data.withColumn("left",data["left"].cast(FloatType()))

In [15]:
data.printSchema()

root
 |-- satisfaction_level: double (nullable = true)
 |-- last_evaluation: double (nullable = true)
 |-- number_project: integer (nullable = true)
 |-- average_montly_hours: integer (nullable = true)
 |-- time_spend_company: integer (nullable = true)
 |-- Work_accident: integer (nullable = true)
 |-- left: float (nullable = true)
 |-- promotion_last_5years: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- salary: string (nullable = true)



In [16]:
data.head(1)

[Row(satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1.0, promotion_last_5years=0, Department='sales', salary='low')]

In [17]:
data.describe().show()

+-------+-------------------+-------------------+------------------+--------------------+------------------+-------------------+-------------------+---------------------+----------+------+
|summary| satisfaction_level|    last_evaluation|    number_project|average_montly_hours|time_spend_company|      Work_accident|               left|promotion_last_5years|Department|salary|
+-------+-------------------+-------------------+------------------+--------------------+------------------+-------------------+-------------------+---------------------+----------+------+
|  count|              14999|              14999|             14999|               14999|             14999|              14999|              14999|                14999|     14999| 14999|
|   mean| 0.6128335222348166| 0.7161017401159978|  3.80305353690246|   201.0503366891126| 3.498233215547703| 0.1446096406427095| 0.2380825388359224| 0.021268084538969265|      null|  null|
| stddev|0.24863065106114257|0.17116911062327556|1.2325

In [18]:
data.columns

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'Department',
 'salary']

In [19]:
data.select(data['Department']).distinct().show()

+-----------+
| Department|
+-----------+
| management|
|product_mng|
|  marketing|
|      sales|
|         hr|
| accounting|
|    support|
|         IT|
|  technical|
|      RandD|
+-----------+



In [20]:
cols=data.select(['satisfaction_level',
 'last_evaluation','left',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years',
 'Department',
 'salary'])

In [21]:
final_data=cols.na.drop()

In [22]:
data.count()

14999

In [23]:
from pyspark.ml.feature import (VectorAssembler,VectorAssembler,StringIndexer,OneHotEncoder)

In [24]:
department_indexer=StringIndexer(inputCol='Department',outputCol='DepartmentIndex')

In [25]:
department_encoder=OneHotEncoder(inputCol='DepartmentIndex',outputCol='DeparmentEncoder')

In [26]:
salary_indexer=StringIndexer(inputCol='salary',outputCol='salaryIndex')
salary_encoder=OneHotEncoder(inputCol='salaryIndex',outputCol='salaryEncoder')

In [27]:
assembler=VectorAssembler(inputCols=['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident',
                                     'promotion_last_5years','DeparmentEncoder','salaryEncoder'],outputCol='features')

In [28]:
from pyspark.ml.classification import LogisticRegression

In [29]:
from pyspark.ml import Pipeline

In [30]:
lr_model=LogisticRegression(featuresCol='features',labelCol='left')

In [31]:
pipeline= Pipeline(stages=[department_indexer,department_encoder,salary_indexer,salary_encoder,assembler,lr_model])

In [32]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [33]:
fit_model= pipeline.fit(train_data)

In [34]:
results=fit_model.transform(test_data)

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
my_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='left')

In [37]:
data.select(data['left']).distinct().show()

+----+
|left|
+----+
| 1.0|
| 0.0|
+----+



In [38]:
results.select('left','prediction').distinct().show()

+----+----------+
|left|prediction|
+----+----------+
| 1.0|       0.0|
| 0.0|       0.0|
| 1.0|       1.0|
| 0.0|       1.0|
+----+----------+



In [39]:
AUC=my_eval.evaluate(results)

In [40]:
AUC

0.6389776430765983