In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=016d6ab7135be330cc89fb8f8a60b5b5aa1aa45f753352b14a9949a579ab1be1
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [4]:
import pyspark
from pyspark .sql import SparkSession

In [5]:
spark=SparkSession.builder.appName("classification").getOrCreate()

In [6]:
hrdata=spark.read.csv("/content/drive/MyDrive/train.csv",inferSchema=True,header=True)

In [7]:
hrdata.show(5)

+-----------+-----------------+---------+----------------+------+-------------------+---------------+---+--------------------+-----------------+-------------+-----------+------------------+-----------+
|employee_id|       department|   region|       education|gender|recruitment_channel|no_of_trainings|age|previous_year_rating|length_of_service|KPIs_met >80%|awards_won?|avg_training_score|is_promoted|
+-----------+-----------------+---------+----------------+------+-------------------+---------------+---+--------------------+-----------------+-------------+-----------+------------------+-----------+
|      65438|Sales & Marketing| region_7|Master's & above|     f|           sourcing|              1| 35|                   5|                8|            1|          0|                49|          0|
|      65141|       Operations|region_22|      Bachelor's|     m|              other|              1| 30|                   5|                4|            0|          0|                60|   

In [8]:
hrdata.groupBy('education').count().show()

+----------------+-----+
|       education|count|
+----------------+-----+
|            null| 2409|
| Below Secondary|  805|
|Master's & above|14925|
|      Bachelor's|36669|
+----------------+-----+



In [9]:
hrdata=hrdata.na.fill(value="Bachelor's",subset=['education'])

In [10]:
hrdata.groupBy('previous_year_rating').count().show()

+--------------------+-----+
|previous_year_rating|count|
+--------------------+-----+
|                null| 4124|
|                   1| 6223|
|                   3|18618|
|                   5|11741|
|                   4| 9877|
|                   2| 4225|
+--------------------+-----+



In [11]:
hrdata=hrdata.na.fill(value="Bachelor's",subset=['education'])

In [12]:
hrdata.groupBy('previous_year_rating').count().show()


+--------------------+-----+
|previous_year_rating|count|
+--------------------+-----+
|                null| 4124|
|                   1| 6223|
|                   3|18618|
|                   5|11741|
|                   4| 9877|
|                   2| 4225|
+--------------------+-----+



In [13]:
hrdata=hrdata.na.fill(value=3,subset=['previous_year_rating'])

In [14]:
hrdata.columns

['employee_id',
 'department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'is_promoted']

In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [18]:
indexer=[StringIndexer(inputCol=col,outputCol=col+"index").fit(hrdata)
for col in list(set(hrdata.columns)-set(['no_of_trainings','age',
                                         'length_of_service',
                                         'avg_training_score','employee_id']))]

In [19]:
pipeline=Pipeline(stages=indexer)

In [20]:
hrdatadf=pipeline.fit(hrdata).transform(hrdata)

In [21]:
hrdatadf.columns


['employee_id',
 'department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'is_promoted',
 'is_promotedindex',
 'KPIs_met >80%index',
 'departmentindex',
 'previous_year_ratingindex',
 'genderindex',
 'educationindex',
 'recruitment_channelindex',
 'regionindex',
 'awards_won?index']

In [22]:
columnstodrop=['employee_id','deperment','region','education','gender',
               'recruitment_channel','previous_year_rating','KPIs_met >80%',
               'awards_won?','is_promoted']

In [23]:
hrdatadf=hrdatadf.drop(*columnstodrop)

In [24]:
from pyspark.ml.feature import RFormula

In [25]:
formula=RFormula(formula="is_promotedindex~.",featuresCol='features',
                 labelCol='label')

In [26]:
hrdatadf=formula.fit(hrdatadf).transform(hrdatadf)

In [27]:
from pyspark.ml.classification import LogisticRegression

In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator

In [29]:
logit=LogisticRegression()

In [30]:

logitmodel=logit.fit(hrdatadf)

In [31]:
logitmodel.summary.accuracy

0.9325098525762663

In [32]:
logitpredict=logitmodel.transform(hrdatadf)

In [33]:
logitmodel.summary.areaUnderROC

0.8683153323222822

In [34]:
accuracy=MulticlassClassificationEvaluator(metricName="accuracy")


In [39]:
auc=BinaryClassificationEvaluator()

In [40]:
from pyspark.ml.classification import DecisionTreeClassifier

In [41]:
tree=DecisionTreeClassifier(maxBins=35)

In [42]:
treemodel=tree.fit(hrdatadf)

In [43]:
treepredict=treemodel.transform(hrdatadf)

In [44]:
accuracy.evaluate(treepredict)

0.926944971537002

In [45]:
auc.evaluate(treepredict)

0.5866679338981956

In [46]:
from pyspark.ml.classification import RandomForestClassifier

In [47]:
RF=RandomForestClassifier(maxBins=35)

In [48]:
RFmodel=RF.fit(hrdatadf)

In [49]:
RFpredict=RFmodel.transform(hrdatadf)

In [50]:
accuracy.evaluate(RFpredict)


0.9246095460516713

In [52]:
auc.evaluate(RFpredict)

0.8655182605243456

In [53]:
from pyspark.ml.classification import GBTClassifier

In [54]:
gbm=GBTClassifier(maxBins=35)

In [55]:
gbmmodel=gbm.fit(hrdatadf)

In [56]:
gbmpredict=gbmmodel.transform(hrdatadf)

In [57]:
auc.evaluate(gbmpredict)

0.9159323218040043

In [58]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [59]:
nn=MultilayerPerceptronClassifier(layers=[11,100,2])
# 11 - input , hidden layer 1 with 100 neurons and output layer 2

In [62]:
import os
import sys



In [64]:
os.environ['PYSPARK_PYTHON']=sys.executable
os.environ['PYSPARK_DRIVER_PYTHON']=sys.executable

In [65]:
nnmodel=nn.fit(hrdatadf)

In [66]:
nnpredict=nnmodel.transform(hrdatadf)