In [1]:
import findspark
findspark.init()

import pyspark 
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Classification").getOrCreate()

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
df = spark.read.csv('f:\\datasets\\titanic.csv',inferSchema=True,header=True)
df.limit(6).toPandas()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SiblingsSpouses Aboard,parentschildren,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05
5,0,3,Mr. James Moran,male,27.0,0,0,8.4583


In [4]:
df.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SiblingsSpouses Aboard: integer (nullable = true)
 |-- parentschildren: integer (nullable = true)
 |-- Fare: double (nullable = true)



In [5]:
df.groupBy("Survived").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  545|
+--------+-----+



In [6]:
input_columns=df.columns
input_columns=input_columns[1:4:2]
dependent_var='Survived'
print(input_columns)
print(dependent_var)

['Pclass', 'Sex']
Survived


In [7]:
renamed=df.withColumn("label_str",df[dependent_var].cast('string'))
indexer=StringIndexer(inputCol='label_str',outputCol="label")
indexed=indexer.fit(renamed).transform(renamed)
indexed.show()

+--------+------+--------------------+------+----+----------------------+---------------+-------+---------+-----+
|Survived|Pclass|                Name|   Sex| Age|SiblingsSpouses Aboard|parentschildren|   Fare|label_str|label|
+--------+------+--------------------+------+----+----------------------+---------------+-------+---------+-----+
|       0|     3|Mr. Owen Harris B...|  male|22.0|                     1|              0|   7.25|        0|  0.0|
|       1|     1|Mrs. John Bradley...|female|38.0|                     1|              0|71.2833|        1|  1.0|
|       1|     3|Miss. Laina Heikk...|female|26.0|                     0|              0|  7.925|        1|  1.0|
|       1|     1|Mrs. Jacques Heat...|female|35.0|                     1|              0|   53.1|        1|  1.0|
|       0|     3|Mr. William Henry...|  male|35.0|                     0|              0|   8.05|        0|  0.0|
|       0|     3|     Mr. James Moran|  male|27.0|                     0|              0

In [8]:
indexed.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SiblingsSpouses Aboard: integer (nullable = true)
 |-- parentschildren: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- label_str: string (nullable = true)
 |-- label: double (nullable = false)



In [9]:
numeric_inputs=[]
string_inputs=[]
for column in input_columns:
    if str(indexed.schema[column].dataType)=='StringType':
        indexer=StringIndexer(inputCol=column,outputCol=column+"_num")
        indexed=indexer.fit(indexed).transform(indexed)
        new_col_name=column+"_num"
        string_inputs.append(new_col_name)
    else:
        numeric_inputs.append(column)
print('numeric_inputs' , numeric_inputs)
print('String_inputs' , string_inputs)

numeric_inputs ['Pclass']
String_inputs ['Sex_num']


In [10]:
d = {}

for col in numeric_inputs: 
    d[col] = indexed.approxQuantile(col,[0.01,0.99],0.25) 

for col in numeric_inputs:
    skew = indexed.agg(skewness(indexed[col])).collect() 
    skew = skew[0][0]
   
    if skew > 1:
        indexed = indexed.withColumn(col, \
        log(when(df[col] < d[col][0],d[col][0])\
        .when(indexed[col] > d[col][1], d[col][1])\
        .otherwise(indexed[col] ) +1).alias(col))
        print(col+" has been treated for positive (right) skewness. (skew =)",skew,")")
    elif skew < -1:
        indexed = indexed.withColumn(col, \
        exp(when(df[col] < d[col][0],d[col][0])\
        .when(indexed[col] > d[col][1], d[col][1])\
        .otherwise(indexed[col] )).alias(col))
        print(col+" has been treated for negative (left) skewness. (skew =",skew,")")
        
print(skew)        

-0.6223541098616062


In [11]:
minimums = df.select([min(c).alias(c) for c in df.columns if c in numeric_inputs]) 

min_array = minimums.select(array(numeric_inputs).alias("mins")) 

df_minimum = min_array.select(array_min(min_array.mins)).collect() 

df_minimum = df_minimum[0][0] 


if df_minimum < 0:
    print("WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contains negative values")
else:
    print("No negative values were found in your dataframe.")

No negative values were found in your dataframe.


In [12]:
features_list = numeric_inputs + string_inputs
assembler = VectorAssembler(inputCols=features_list,outputCol='features')

output = assembler.transform(indexed).select('features','label')
output.show(5,False)

+---------+-----+
|features |label|
+---------+-----+
|[3.0,0.0]|0.0  |
|[1.0,1.0]|1.0  |
|[3.0,1.0]|1.0  |
|[1.0,1.0]|1.0  |
|[3.0,0.0]|0.0  |
+---------+-----+
only showing top 5 rows



In [13]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures",min=0,max=1000)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))


scalerModel = scaler.fit(output)


scaled_data = scalerModel.transform(output)
final_data = scaled_data.select('label','scaledFeatures')

final_data = final_data.withColumnRenamed("scaledFeatures","features")
final_data.show()

Features scaled to range: [0.000000, 1000.000000]
+-----+---------------+
|label|       features|
+-----+---------------+
|  0.0|   [1000.0,0.0]|
|  1.0|   [0.0,1000.0]|
|  1.0|[1000.0,1000.0]|
|  1.0|   [0.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|      (2,[],[])|
|  0.0|   [1000.0,0.0]|
|  1.0|[1000.0,1000.0]|
|  1.0| [500.0,1000.0]|
|  1.0|[1000.0,1000.0]|
|  1.0|   [0.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|   [1000.0,0.0]|
|  0.0|[1000.0,1000.0]|
|  1.0| [500.0,1000.0]|
|  0.0|   [1000.0,0.0]|
|  1.0|    [500.0,0.0]|
|  0.0|[1000.0,1000.0]|
|  1.0|[1000.0,1000.0]|
+-----+---------------+
only showing top 20 rows



In [14]:
train,test = final_data.randomSplit([0.70,0.30])

In [15]:
naiveclassifier=NaiveBayes()
navieModel = naiveclassifier.fit(train)


In [16]:
predictions = navieModel.transform(test)

In [17]:
predictions.printSchema()
predictions.select('label','rawPrediction','probability','prediction').show()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|       0.0|
|  0.0|[-0.4696805796548...|[0.62520193861066...|    

In [18]:
evaluator=BinaryClassificationEvaluator();
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Model :" , accuracy)
print("Test Error of Model :" , 1-accuracy)

Accuracy of Model : 0.4055470564904527
Test Error of Model : 0.5944529435095474
