In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz
!tar -xvf spark-3.0.0-preview2-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-preview2-bin-hadoop2.7"
import findspark
findspark.init()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Reuse the active Spark session if there is one; otherwise start a new one
# with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Load both splits from CSV: the first row is the header and column types are
# inferred by Spark.
train_data = spark.read.csv("Planet_Training.csv", header=True, inferSchema=True)
test_data = spark.read.csv("Planet_Testing.csv", header=True, inferSchema=True)

In [None]:
# Preview the first rows of the test split. show() prints the table itself and
# returns None, so wrapping it in print() only adds a stray "None" line.
test_data.show()

In [None]:
# Keep only the three predictors and the label, in the same order for both splits.
kept_columns = ["Temperature", "Water", "Atmosphere Color", "Habitable"]
train_data = train_data.select(*kept_columns)
test_data = test_data.select(*kept_columns)

In [None]:
# Discard every row that has a null in any remaining column
# (DataFrame.dropna() is the alias of DataFrame.na.drop()).
train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
# Preview the cleaned training split. show() already prints the table and
# returns None, so it must not be wrapped in print().
train_data.show()

+-----------+------+----------------+---------+
|Temperature| Water|Atmosphere Color|Habitable|
+-----------+------+----------------+---------+
|     323488|Medium|          Yellow|        1|
|     319279|   Low|          Yellow|        1|
|     315375|   Low|          Yellow|        1|
|     302312|Medium|          Yellow|        1|
|     329687|   Low|          Yellow|        1|
|     265746|  High|             Red|        0|
|     305214|  High|          Yellow|        1|
|     299936|  High|          Yellow|        0|
|     269577|Medium|             Red|        1|
|     303631|  High|             Red|        0|
|     290051|  High|             Red|        0|
|     306122|   Low|          Yellow|        1|
|     300635|   Low|          Yellow|        1|
|     312152|  High|            Blue|        0|
|     265942|Medium|            Blue|        0|
|     307368|  High|             Red|        0|
|     276274|Medium|          Yellow|        1|
|     308531|Medium|          Yellow|   

In [None]:
# Sanity check after dropping nulls: count missing values per column.
# toPandas() collects the whole DataFrame onto the driver.
missing_per_column = train_data.toPandas().isna().sum()
print(missing_per_column)

Temperature         0
Water               0
Atmosphere Color    0
Habitable           0
dtype: int64


In [None]:
def transform_data(data, scaler_model=None):
  """Encode the categorical columns, assemble a feature vector and scale it.

  Steps:
    1. "Water" is encoded as Low -> 0, Medium -> 1, anything else -> 2.
    2. "Atmosphere Color" is encoded as Red -> 0, Blue -> 1, anything else -> 2.
    3. Every column except the label "Habitable" is packed into the vector
       column "Feature".
    4. "Feature" is scaled into "Scaled from Feature".

  Args:
    data: Spark DataFrame holding "Water", "Atmosphere Color" and the other
      predictor columns; the "Habitable" label is excluded from the features
      when present.
    scaler_model: optional, already fitted scaler model that reads "Feature"
      and writes "Scaled from Feature". Pass the scaler fitted on the training
      split when transforming any other split so that all splits share one
      scale. When omitted, a new StandardScaler is fitted on `data` itself,
      which is the original behaviour.

  Returns:
    The input DataFrame with the two encoded columns replaced in place and the
    "Feature" and "Scaled from Feature" columns appended.
  """
  # Ordinal encoding of the categorical columns.
  # Note: otherwise(2) also catches unexpected values, not only "High"/"Yellow".
  water = data["Water"]
  color = data["Atmosphere Color"]
  data = data.withColumn(
      "Water",
      when(water == "Low", 0).when(water == "Medium", 1).otherwise(2))
  data = data.withColumn(
      "Atmosphere Color",
      when(color == "Red", 0).when(color == "Blue", 1).otherwise(2))

  # All columns but the label become model inputs.
  feature_cols = [name for name in data.columns if name != "Habitable"]
  data = VectorAssembler(inputCols=feature_cols, outputCol="Feature").transform(data)

  # With its default settings StandardScaler divides each feature by its
  # standard deviation without centering, so the output is NOT confined to
  # the 0-1 range.
  if scaler_model is None:
    scaler = StandardScaler(inputCol="Feature", outputCol="Scaled from Feature")
    scaler_model = scaler.fit(data)

  return scaler_model.transform(data)


In [None]:
# Encode, assemble and scale the training split.
# NOTE: this reassigns train_data, so the cell is meant to run once per data
# load; a second run would feed the already encoded frame back into
# transform_data().
train_data = transform_data(train_data)
# show() prints the table and returns None, so it is not wrapped in print().
train_data.show(truncate=False)

+-----------+-----+----------------+---------+------------------+----------------------------------------------------------+
|Temperature|Water|Atmosphere Color|Habitable|Feature           |Scaled from Feature                                       |
+-----------+-----+----------------+---------+------------------+----------------------------------------------------------+
|323488     |1    |2               |1        |[323488.0,1.0,2.0]|[16.528785880945723,1.1149523301813882,2.1757737802526984]|
|319279     |0    |2               |1        |[319279.0,0.0,2.0]|[16.31372485929144,0.0,2.1757737802526984]                |
|315375     |0    |2               |1        |[315375.0,0.0,2.0]|[16.11424796964109,0.0,2.1757737802526984]                |
|302312     |1    |2               |1        |[302312.0,1.0,2.0]|[15.446787260239834,1.1149523301813882,2.1757737802526984]|
|329687     |0    |2               |1        |[329687.0,0.0,2.0]|[16.845526976986324,0.0,2.1757737802526984]               |


In [None]:
# Configure the classifier on the scaled feature vector, then fit it on the
# training split (at most 10 optimisation iterations).
classifier = LogisticRegression(
    featuresCol='Scaled from Feature',
    labelCol='Habitable',
    maxIter=10,
)
model = classifier.fit(train_data)

In [None]:
# Encode and assemble the test split with the same helper used for training.
# transform_data() fits a scaler on whatever frame it receives, so the scaled
# column it produces here would be based on test-set statistics. Drop that
# column and rescale with a scaler fitted on the training features, so the
# model sees test data on the same scale it was trained on.
test_data = transform_data(test_data).drop("Scaled from Feature")
train_scaler = StandardScaler(
    inputCol="Feature", outputCol="Scaled from Feature"
).fit(train_data.drop("Scaled from Feature"))
test_data = train_scaler.transform(test_data)

In [None]:
# Score the test split; transform() appends the rawPrediction, probability and
# prediction columns to the frame.
prediction = model.transform(test_data)

In [None]:
# Display the scored rows in full width. show() prints the table and returns
# None, so wrapping it in print() would only add a stray "None" line.
prediction.show(truncate=False)

+-----------+-----+----------------+---------+------------------+----------------------------------------------------------+------------------------------------------+-----------------------------------------+----------+
|Temperature|Water|Atmosphere Color|Habitable|Feature           |Scaled from Feature                                       |rawPrediction                             |probability                              |prediction|
+-----------+-----+----------------+---------+------------------+----------------------------------------------------------+------------------------------------------+-----------------------------------------+----------+
|325145     |2    |2               |1        |[325145.0,2.0,2.0]|[16.63133396133679,2.2249855521505073,2.182714540097035]  |[-0.5960950727923429,0.5960950727923429]  |[0.35523758686965035,0.6447624131303497] |1.0       |
|269079     |1    |0               |0        |[269079.0,1.0,0.0]|[13.763529228444362,1.1124927760752537,0.0]        

In [None]:
# Evaluator for the binary label. The metric is spelled out explicitly;
# areaUnderROC is also what the evaluator uses when no metric is given.
# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# because a later cell refers to it.
eval = BinaryClassificationEvaluator(
    labelCol="Habitable",
    metricName="areaUnderROC",
)
 

In [None]:
# evaluate() returns the evaluator's metric for the scored frame. The evaluator
# works with areaUnderROC (the default metric of BinaryClassificationEvaluator),
# so despite its name `acc` holds the AUC, not the classification accuracy.
acc = eval.evaluate(prediction)


In [None]:
# `acc` is produced by BinaryClassificationEvaluator, whose default metric is
# areaUnderROC, so the number is reported as AUC rather than as accuracy.
print("Area under ROC (%) : ", acc*100)

Acuraccy :  91.71043337232418
