In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
!pip install -q pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
import findspark
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

findspark.init("spark-2.4.5-bin-hadoop2.7")
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName('abc').getOrCreate()

[K     |████████████████████████████████| 218.4MB 63kB/s 
[K     |████████████████████████████████| 204kB 48.0MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


# SPARK ML

Dwa moduly ML  
RDD API -> SPARK MLLib  
DF API -> SPARK ML

## Drzewa Decyzyjne

In [0]:
df = spark.read.csv('train.csv', header=True, inferSchema=True) #wczytuje dataset

żeby ML działał w sparku, należy stworzyć tabelę dwukolumnową w której pierwsza kolumna jest zmienną objaśnianą a w drugiej znajduje się lista miennych objaśniających dla danego wiersza.  
Robi się to w następujący sposób:

In [0]:
from pyspark.ml.feature import VectorAssembler # potrzebny moduł

In [0]:
cols = [col for col in df.columns if col not in ['Id', 'Cover_Type']] # wybieram kolumny zmiennych objaśniających

assembler = VectorAssembler(inputCols=cols, outputCol='features') #tworzę assembler
tdf = assembler.transform(df).select(['Cover_Type', 'features']) #transformuje df z posiadanej instancji obiektu assemblera i wybieram interesujące mnie kolumny

In [0]:
training_data, test_data = tdf.randomSplit([.7, .3]) # dziele dane na testowe i treningowe

In [0]:
 from pyspark.ml.feature import StringIndexer # moduł potrzebny do stworzenia modelu

In [0]:
label_indexer = StringIndexer(inputCol='Cover_Type', outputCol='label').fit(tdf) #fituje model

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier # moduł modelu

In [0]:
dt = DecisionTreeClassifier(labelCol='label',
                            featuresCol='features',
                            impurity='gini',
                            maxDepth=5,
                            maxBins=32) # szkielet modelu

In [0]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[label_indexer, dt]) #kolejkuje zadania, najpierw indeksuje, potem trenuje model

In [0]:
model = pipeline.fit(training_data) # trenuje model

In [0]:
tree_model = model.stages[1] #wybieram obiekt modelu drzewa decyzyjnego

robienie predykcji na danych treningowych.

In [0]:
predictions = model.transform(training_data) # transform == predict 

In [0]:
# ewaluatory, do oceny modelu
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')

In [31]:
accuracy = evaluator.evaluate(predictions)
print('Accuracy {}\nTrain Error {}'.format(accuracy, 1-accuracy))

Accuracy 0.6697195732225475
Train Error 0.3302804267774525


Predykcja na danych testowych

In [33]:
predictions = model.transform(test_data) # transform == predict 
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Accuracy {}\Test Error {}'.format(accuracy, 1-accuracy))

Accuracy 0.659085890925149
Train Error 0.340914109074851


Z powyzszych wyszedl nam underfitting, trzeba poprawic model

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder # potrzebne moduły

In [0]:
# tworze zestaw parametrow do dostrajania modelu

param_grid = (ParamGridBuilder()
.addGrid(dt.impurity, ['entropy', 'gini'])
.addGrid(dt.maxDepth, [10,30])
.addGrid(dt.maxBins, [30, 100, 300])
.build()
)

In [0]:
# dla wydajnosci, wrzucam dane do pamieci podrecznej

training_data = training_data.cache()
test_data = test_data.cache()

In [0]:
# tworze walidacje krzyzowa
evaluator = MulticlassClassificationEvaluator() # nowy ewaluator bez parametrow

crossv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator = evaluator,
    numFolds = 3
)

In [0]:
cv_model = crossv.fit(training_data)

In [40]:
# sprawdzam dopasowanie na danych testowych

prediction = cv_model.bestModel.transform(training_data)

accuracy = evaluator.evaluate(prediction)
print('Accuracy {}\nTrain Error {}'.format(accuracy, 1-accuracy))

1.0


In [41]:
# sprawdzam dopasowanie na danych testowych

prediction = cv_model.bestModel.transform(test_data)

accuracy = evaluator.evaluate(prediction)
print('Accuracy {}\nTrain Error {}'.format(accuracy, 1-accuracy))

Accuracy 0.7769927136233165
Train Error 0.22300728637668354


w powyzszym wyszedl overfitting, trzeba znalezc lepszy model

# Random Forest

## Data Engineering

In [43]:
df.limit(5).toPandas()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [0]:
 import pyspark.sql.types as t
 import pyspark.sql.functions as f
 

In [0]:
df = df.withColumn('Id', f.col('Id').cast(t.StringType())) # zmieniam id na string

In [0]:
# Ekstraktowanie kolumn Wilderness_Area

wild_columns = [
                'Id',
                'Wilderness_Area1',
                'Wilderness_Area2',
                'Wilderness_Area3',
                'Wilderness_Area4'
                ]

wild = df.select(wild_columns)

In [0]:
# ekstraktowanie kolumn soil_type

soil_type_cols = [x for x in df.columns if x.startswith('Soil_Type')]
soil_types = df.select(['Id'] + soil_type_cols)

In [0]:
from pyspark.sql import Row
wild_row = Row('Id', 'Wilderness_Area')
soil_row = Row('Id', 'Soil_Type')

# laczenie kolumn

wild_new = wild.rdd.map(lambda x: wild_row(x[0], x.index(1))).toDF()
soil_new = soil_types.rdd.map(lambda x: soil_row(x[0], x.index(1))).toDF()

In [0]:
# ekstraktowanie pozostalych kolumn

other_cols = [x for x in df.columns if (not x.startswith('Soil_Type') and not x.startswith('Wilder'))]
other = df.select(other_cols)

In [0]:
# join na tabelach po Id
ndf = (other
       .withColumn('Id', f.col('Id').cast(t.IntegerType()))
       .join(soil_new, 'Id')
       .join(wild_new, 'Id'))

## Budowanie Modelu

In [87]:
# przygotowanie danych do ML

from pyspark.ml.feature import VectorAssembler

ignore = ['Id', 'Cover_Type']
assembler = VectorAssembler(
    inputCols = [x for x in ndf.columns if x not in ignore],
    outputCol = 'features'
)

ntdf = assembler.transform(ndf).select(['Cover_Type', 'features'])
ntdf.limit(5).toPandas()

Unnamed: 0,Cover_Type,features
0,2,"[3142.0, 220.0, 11.0, 424.0, 69.0, 6216.0, 207..."
1,2,"[2792.0, 101.0, 6.0, 108.0, 29.0, 3180.0, 230...."
2,2,"[2773.0, 81.0, 14.0, 108.0, 24.0, 3019.0, 238...."
3,2,"[2909.0, 72.0, 5.0, 324.0, 5.0, 4554.0, 226.0,..."
4,1,"[3123.0, 94.0, 12.0, 342.0, 33.0, 3544.0, 238...."


In [77]:
# sprawdzenie typu wektora
ntdf.rdd.first()

Row(Cover_Type=2, Features=DenseVector([3142.0, 220.0, 11.0, 424.0, 69.0, 6216.0, 207.0, 251.0, 179.0, 1989.0, 29.0, 1.0]))

In [0]:
# podzial danych na testowe i treningowe
training_data, test_data = ntdf.randomSplit([.7, .3])

In [0]:
# indeksowanie danych

from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer


label_indexer = StringIndexer(inputCol='Cover_Type',
                              outputCol='indexed_label').fit(ntdf)

feature_indexer = VectorIndexer(inputCol='features',
                                outputCol='indexed_features',
                                maxCategories=41).fit(ntdf)


In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

rf = RandomForestClassifier(labelCol = 'indexed_label',
                            featuresCol='indexed_features',
                            numTrees = 10,
                            maxDepth = 30,
                            maxBins = 100,
                            impurity = 'entropy'
                            )

label_converter = IndexToString(inputCol='prediction',
                                outputCol='predictedLabel',
                                labels=label_indexer.labels)

pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf, label_converter])

In [0]:
training_data = training_data.cache()
test_data = test_data.cache()

In [0]:
# trenowanie modelu

model = pipeline.fit(training_data)

In [95]:
# ewaluacja modelu

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol='indexed_label',
                                         predictionCol='prediction',
                                         metricName='accuracy')

accuracy = evaluator.evaluate(predictions)

print("Test accuracy: {}\nTest error: {}".format(accuracy, 1-accuracy))

Test accuracy: 0.8294746215494212
Test error: 0.1705253784505788
