In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import IntegerType

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df_train = spark.read.csv("BigFootTraining.csv", header=True, inferSchema=True)
df_test = spark.read.csv("BigFootTesting.csv", header=True, inferSchema=True)

In [4]:
df_train.show()

+---------+------+------+---------+-----------+
|fur_color|height|weight|eye_color|prehistoric|
+---------+------+------+---------+-----------+
|     grey|226 cm|122 kg|    green|       true|
|      red|294 cm|135 kg|     blue|       true|
|    white|191 cm| 94 kg|    black|       true|
|     grey|168 cm| 69 kg|    brown|      false|
|    brown|298 cm|126 kg|     blue|       true|
|    black|253 cm|111 kg|    black|       true|
|    brown|167 cm|104 kg|    green|      false|
|     grey|239 cm|162 kg|    brown|       true|
|      red|216 cm|210 kg|    brown|      false|
|      red|188 cm|105 kg|    black|      false|
|    brown|281 cm|212 kg|    black|       true|
|    black|264 cm|271 kg|    black|       true|
|    brown|252 cm|238 kg|     blue|       true|
|      red|243 cm|280 kg|    brown|       true|
|    black|282 cm| 80 kg|     blue|       true|
|   orange|262 cm| 79 kg|     blue|       true|
|    brown|258 cm|171 kg|    green|       true|
|dark grey|187 cm|216 kg|    green|     

In [5]:
df_train = df_train.select("fur_color", "height", "eye_color", "prehistoric")
df_test = df_test.select("fur_color", "height", "eye_color", "prehistoric")

In [6]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [7]:
df_train = df_train.withColumn("fur_color", 
                               when(df_train["fur_color"] == "white", 0).
                               when(df_train["fur_color"] == "grey", 1).
                               when(df_train["fur_color"] == "red", 2).
                               when(df_train["fur_color"] == "orange", 3).
                               when(df_train["fur_color"] == "calico", 4).
                               when(df_train["fur_color"] == "brown", 5).
                               when(df_train["fur_color"] == "black", 6).
                               when(df_train["fur_color"] == "dark grey", 7))

df_train = df_train.withColumn("height", regexp_replace("height", "cm", ""))
df_train = df_train.withColumn("prehistoric", 
                               when(df_train["prehistoric"] == "true", 1).
                               when(df_train["prehistoric"] == "false", 0))
df_train = df_train.withColumn("eye_color", 
                               when(df_train["eye_color"] == "green", 0).
                               when(df_train["eye_color"] == "blue", 1).
                               when(df_train["eye_color"] == "black", 2).
                               when(df_train["eye_color"] == "brown", 3))


df_test = df_test.withColumn("fur_color", 
                               when(df_test["fur_color"] == "white", 0).
                               when(df_test["fur_color"] == "grey", 1).
                               when(df_test["fur_color"] == "red", 2).
                               when(df_test["fur_color"] == "orange", 3).
                               when(df_test["fur_color"] == "calico", 4).
                               when(df_test["fur_color"] == "brown", 5).
                               when(df_test["fur_color"] == "black", 6).
                               when(df_test["fur_color"] == "dark grey", 7))

df_test = df_test.withColumn("height", regexp_replace("height", "cm", ""))
df_test = df_test.withColumn("prehistoric", 
                               when(df_test["prehistoric"] == "true", 1).
                               when(df_test["prehistoric"] == "false", 0))
df_test = df_test.withColumn("eye_color", 
                               when(df_test["eye_color"] == "green", 0).
                               when(df_test["eye_color"] == "blue", 1).
                               when(df_test["eye_color"] == "black", 2).
                               when(df_test["eye_color"] == "brown", 3))


In [8]:
df_train.show()

+---------+------+---------+-----------+
|fur_color|height|eye_color|prehistoric|
+---------+------+---------+-----------+
|        1|  226 |        0|          1|
|        2|  294 |        1|          1|
|        0|  191 |        2|          1|
|        1|  168 |        3|          0|
|        5|  298 |        1|          1|
|        6|  253 |        2|          1|
|        5|  167 |        0|          0|
|        1|  239 |        3|          1|
|        2|  216 |        3|          0|
|        2|  188 |        2|          0|
|        5|  281 |        2|          1|
|        6|  264 |        2|          1|
|        5|  252 |        1|          1|
|        2|  243 |        3|          1|
|        6|  282 |        1|          1|
|        3|  262 |        1|          1|
|        5|  258 |        0|          1|
|        7|  187 |        0|          0|
|        5|  249 |        2|          1|
|        7|  227 |        3|          1|
+---------+------+---------+-----------+
only showing top

In [9]:
df_train = df_train.withColumn("height", regexp_replace("height", " ", ""))


df_test = df_test.withColumn("height", regexp_replace("height", " ", ""))


In [10]:
df_train.show()

+---------+------+---------+-----------+
|fur_color|height|eye_color|prehistoric|
+---------+------+---------+-----------+
|        1|   226|        0|          1|
|        2|   294|        1|          1|
|        0|   191|        2|          1|
|        1|   168|        3|          0|
|        5|   298|        1|          1|
|        6|   253|        2|          1|
|        5|   167|        0|          0|
|        1|   239|        3|          1|
|        2|   216|        3|          0|
|        2|   188|        2|          0|
|        5|   281|        2|          1|
|        6|   264|        2|          1|
|        5|   252|        1|          1|
|        2|   243|        3|          1|
|        6|   282|        1|          1|
|        3|   262|        1|          1|
|        5|   258|        0|          1|
|        7|   187|        0|          0|
|        5|   249|        2|          1|
|        7|   227|        3|          1|
+---------+------+---------+-----------+
only showing top

In [11]:
df_train = df_train.withColumn("height", df_train["height"].cast(IntegerType()))


df_test = df_test.withColumn("height", df_test["height"].cast(IntegerType()))


In [12]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [13]:
cols = df_train.columns
cols.remove("prehistoric")

df_train = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_train)
df_test = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_test)

In [14]:
df_train = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_train).transform(df_train)
df_test = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_test).transform(df_test)

In [15]:
model = LogisticRegression(featuresCol="features", labelCol="prehistoric", maxIter=10).fit(df_train)
predictionResult = model.transform(df_test)

predictionResult.select("prehistoric", "prediction").show()


+-----------+----------+
|prehistoric|prediction|
+-----------+----------+
|          0|       0.0|
|          1|       1.0|
|          1|       1.0|
|          0|       0.0|
|          1|       0.0|
|          0|       1.0|
|          1|       0.0|
|          0|       0.0|
|          0|       0.0|
|          0|       1.0|
|          0|       0.0|
|          0|       0.0|
|          1|       0.0|
|          0|       0.0|
|          0|       1.0|
|          0|       0.0|
|          0|       1.0|
|          1|       1.0|
|          1|       1.0|
|          1|       0.0|
+-----------+----------+
only showing top 20 rows



In [16]:
evaluator = BinaryClassificationEvaluator(labelCol="prehistoric")
print(f'{evaluator.evaluate(predictionResult)}')

0.655366116520672
