In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import IntegerType

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df_train = spark.read.csv("BigFootTraining.csv", header=True, inferSchema=True)
df_test = spark.read.csv("BigFootTesting.csv", header=True, inferSchema=True)

In [4]:
df_train.show()

+---------+------+------+---------+-----------+
|fur_color|height|weight|eye_color|prehistoric|
+---------+------+------+---------+-----------+
|     grey|226 cm|122 kg|    green|       true|
|      red|294 cm|135 kg|     blue|       true|
|    white|191 cm| 94 kg|    black|       true|
|     grey|168 cm| 69 kg|    brown|      false|
|    brown|298 cm|126 kg|     blue|       true|
|    black|253 cm|111 kg|    black|       true|
|    brown|167 cm|104 kg|    green|      false|
|     grey|239 cm|162 kg|    brown|       true|
|      red|216 cm|210 kg|    brown|      false|
|      red|188 cm|105 kg|    black|      false|
|    brown|281 cm|212 kg|    black|       true|
|    black|264 cm|271 kg|    black|       true|
|    brown|252 cm|238 kg|     blue|       true|
|      red|243 cm|280 kg|    brown|       true|
|    black|282 cm| 80 kg|     blue|       true|
|   orange|262 cm| 79 kg|     blue|       true|
|    brown|258 cm|171 kg|    green|       true|
|dark grey|187 cm|216 kg|    green|     

In [5]:
df_train = df_train.select("height", "weight", "eye_color", "prehistoric")
df_test = df_test.select("height", "weight", "eye_color", "prehistoric")

In [6]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [7]:
df_train.show()

+------+------+---------+-----------+
|height|weight|eye_color|prehistoric|
+------+------+---------+-----------+
|226 cm|122 kg|    green|       true|
|294 cm|135 kg|     blue|       true|
|191 cm| 94 kg|    black|       true|
|168 cm| 69 kg|    brown|      false|
|298 cm|126 kg|     blue|       true|
|253 cm|111 kg|    black|       true|
|167 cm|104 kg|    green|      false|
|239 cm|162 kg|    brown|       true|
|216 cm|210 kg|    brown|      false|
|188 cm|105 kg|    black|      false|
|281 cm|212 kg|    black|       true|
|264 cm|271 kg|    black|       true|
|252 cm|238 kg|     blue|       true|
|243 cm|280 kg|    brown|       true|
|282 cm| 80 kg|     blue|       true|
|262 cm| 79 kg|     blue|       true|
|258 cm|171 kg|    green|       true|
|187 cm|216 kg|    green|      false|
|249 cm|205 kg|    black|       true|
|260 cm|231 kg|     blue|       true|
+------+------+---------+-----------+
only showing top 20 rows



In [8]:
df_train = df_train.withColumn("prehistoric", when(df_train["prehistoric"] == "true", 1).
                                              when(df_train["prehistoric"] == "false", 0))

df_test = df_test.withColumn("prehistoric", when(df_test["prehistoric"] == "true", 1).
                                              when(df_test["prehistoric"] == "false", 0))

In [9]:
df_train = df_train.withColumn("height", regexp_replace("height", "cm", "")).withColumn("weight", regexp_replace("weight", "kg", ""))


In [10]:
df_test = df_test.withColumn("height", regexp_replace("height", "cm", "")).withColumn("weight", regexp_replace("weight", "kg", ""))

In [11]:
df_train = df_train.withColumn("eye_color", 
                               when(df_train["eye_color"] == "green", 0).
                               when(df_train["eye_color"] == "blue", 1).
                               when(df_train["eye_color"] == "black", 2).
                               when(df_train["eye_color"] == "brown", 3))


df_test = df_test.withColumn("eye_color", 
                               when(df_test["eye_color"] == "green", 0).
                               when(df_test["eye_color"] == "blue", 1).
                               when(df_test["eye_color"] == "black", 2).
                               when(df_test["eye_color"] == "brown", 3))


In [12]:
df_train = df_train.withColumn("height", regexp_replace("height", " ", ""))
df_train = df_train.withColumn("weight", regexp_replace("weight", " ", ""))

df_test = df_test.withColumn("height", regexp_replace("height", " ", ""))
df_test = df_test.withColumn("weight", regexp_replace("weight", " ", ""))

In [13]:
df_train = df_train.withColumn("height", df_train["height"].cast(IntegerType()))
df_train = df_train.withColumn("weight", df_train["weight"].cast(IntegerType()))

df_test = df_test.withColumn("height", df_test["height"].cast(IntegerType()))
df_test = df_test.withColumn("weight", df_test["weight"].cast(IntegerType()))

In [14]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [15]:
cols = df_train.columns
cols.remove("prehistoric")

In [16]:
df_train = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_train)
df_test = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_test)

In [17]:
df_train = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_train).transform(df_train)
df_test = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_test).transform(df_test)

In [18]:
model = LogisticRegression(featuresCol="features", labelCol="prehistoric", maxIter=10).fit(df_train)
prediction = model.transform(df_test)


In [21]:
prediction.select("prediction", "prehistoric").show()

+----------+-----------+
|prediction|prehistoric|
+----------+-----------+
|       0.0|          0|
|       0.0|          1|
|       1.0|          1|
|       0.0|          0|
|       1.0|          1|
|       1.0|          0|
|       1.0|          1|
|       0.0|          0|
|       0.0|          0|
|       1.0|          0|
|       0.0|          0|
|       0.0|          0|
|       0.0|          1|
|       0.0|          0|
|       0.0|          0|
|       0.0|          0|
|       0.0|          0|
|       1.0|          1|
|       0.0|          1|
|       1.0|          1|
+----------+-----------+
only showing top 20 rows



In [23]:
evaluator = BinaryClassificationEvaluator(labelCol="prehistoric")

In [24]:
print(f'{evaluator.evaluate(prediction)}')

0.6828097129800195
