In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 61.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=9431b18073caef4491fbc92f10b57e30b42a356f4b0a5a9e465ce81eac0350e6
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [103]:
from pyspark.sql.types import *
file_data = "/content/weatherAUS.csv"
file_Schema = StructType([
      StructField("Date",StringType(),True),
      StructField("Location",StringType(),True),
      StructField("MinTemp",FloatType(),True),
      StructField("MaxTemp",FloatType(),True),
      StructField("Rainfall",FloatType(),True),
      StructField("Evaporation",StringType(),True),
      StructField("Sunshine",StringType(),True),
      StructField("WindGustDir",StringType(),True),
      StructField("WindGustSpeed",FloatType(),True),
      StructField("WindDir9am",StringType(),True),
      StructField("WindDir3pm",StringType(),True),
      StructField("WindSpeed9am",FloatType(),True),
      StructField("WindSpeed3pm",FloatType(),True),
      StructField("Humidity9am",FloatType(),True),
      StructField("Humidity3pm",FloatType(),True),
      StructField("Pressure9am",FloatType(),True),
      StructField("Pressure3pm",FloatType(),True),
      StructField("Cloud9am",FloatType(),True),
      StructField("Cloud3pm",FloatType(),True),
      StructField("Temp9am",FloatType(),True),
      StructField("Temp3pm",FloatType(),True),
     # StructField("RainToday",StringType(),True),
      StructField("RainTomorrow",StringType(),True)
])

In [19]:
df = spark.read.csv(file_data, header="true", schema=file_Schema,nullValue= 'NA')
df = df.drop("Date", "Evaporation","Sunshine","Cloud9am", "Cloud3pm", 'WindGustDir', 'WindGustSpeed')

In [20]:
df = df.dropna()

In [87]:
(train, test) = df.randomSplit([0.8, 0.2])

In [88]:
categoricalColumns = ["Location", "WindDir9am", "WindDir3pm" , "RainToday"]


In [90]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
   
    

In [91]:

stages = []
for categoricalCol in categoricalColumns:
      stages += [stringIndexer, encoder]

In [92]:
stages

[StringIndexer_cd6e18d76717,
 OneHotEncoder_b9f2c4c3db35,
 StringIndexer_cd6e18d76717,
 OneHotEncoder_b9f2c4c3db35,
 StringIndexer_cd6e18d76717,
 OneHotEncoder_b9f2c4c3db35,
 StringIndexer_cd6e18d76717,
 OneHotEncoder_b9f2c4c3db35]

In [93]:
numeric_Cols = ["MinTemp", "MaxTemp", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Temp9am", "Temp3pm", "Pressure9am", "Pressure3pm"]

In [94]:
label_stringIndex = StringIndexer(inputCol="RainTomorrow", outputCol="label")
stages += [label_stringIndex]

In [95]:
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numeric_Cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
dtree = DecisionTreeClassifier(labelCol="label", featuresCol=assembler.getOutputCol())

In [96]:
paramGrid = (ParamGridBuilder()
    .addGrid(dtree.impurity, ['gini', 'entropy'])
    .addGrid(dtree.maxBins, [5, 10, 15])
    .addGrid(dtree.minInfoGain, [0.0, 0.2, 0.4])
    .addGrid(dtree.maxDepth, [3, 5, 7])
    .build())

In [97]:
evaluator = BinaryClassificationEvaluator()

In [98]:
cv = CrossValidator(estimator=dtree, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=4)
stages += [cv]

In [105]:
from pyspark.ml import Pipeline

In [111]:
pipeline = Pipeline().setStages(stages)


In [117]:
pipeline_model = pipeline.fit(train)
prediction = pipeline_model.transform(test)

IllegalArgumentException: ignored