# Start a Spark Session

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pipeline').getOrCreate()

# Load data from HDFS

In [2]:
#create dataframe by loading data with sqlContext.read()
row_df = spark.read.format("csv")\
         .option("header", "true")\
         .option("delimiter", "\t")\
         .load("hdfs://mycluster/user/dtree/data/train.tsv")
print(row_df.count())

7395


In [3]:
type(row_df)

pyspark.sql.dataframe.DataFrame

In [4]:
row_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- urlid: string (nullable = true)
 |-- boilerplate: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: string (nullable = true)
 |-- avglinksize: string (nullable = true)
 |-- commonlinkratio_1: string (nullable = true)
 |-- commonlinkratio_2: string (nullable = true)
 |-- commonlinkratio_3: string (nullable = true)
 |-- commonlinkratio_4: string (nullable = true)
 |-- compression_ratio: string (nullable = true)
 |-- embed_ratio: string (nullable = true)
 |-- framebased: string (nullable = true)
 |-- frameTagRatio: string (nullable = true)
 |-- hasDomainLink: string (nullable = true)
 |-- html_ratio: string (nullable = true)
 |-- image_ratio: string (nullable = true)
 |-- is_news: string (nullable = true)
 |-- lengthyLinkDomain: string (nullable = true)
 |-- linkwordscore: string (nullable = true)
 |-- news_front_page: string (nullable = true)
 |-- non_markup_alphanum_characters: string (nulla

In [5]:
row_df.select('url', 'alchemy_category', 'alchemy_category_score', 'is_news', 'label').show(10)

+--------------------+------------------+----------------------+-------+-----+
|                 url|  alchemy_category|alchemy_category_score|is_news|label|
+--------------------+------------------+----------------------+-------+-----+
|http://www.bloomb...|          business|              0.789131|      1|    0|
|http://www.popsci...|        recreation|              0.574147|      1|    1|
|http://www.menshe...|            health|              0.996526|      1|    1|
|http://www.dumbli...|            health|              0.801248|      1|    1|
|http://bleacherre...|            sports|              0.719157|      1|    0|
|http://www.conven...|                 ?|                     ?|      ?|    0|
|http://gofashionl...|arts_entertainment|               0.22111|      1|    1|
|http://www.inside...|                 ?|                     ?|      ?|    0|
|http://www.valetm...|                 ?|                     ?|      1|    1|
|http://www.howswe...|                 ?|           

In [6]:
#set up UDF Function
from pyspark.sql.functions import udf

def replace_question(x):
    return ("0" if x=="?" else x)

#transform replace_question into DataFrames UDF 
replace_question=udf(replace_question)

In [9]:
# returns a column based on the given column name
from pyspark.sql.functions import col 

df = row_df.select(
    ['url', 'alchemy_category']+  #select columns without further transformation
    [replace_question(col(column)).cast("double").alias(column) 
    for column in row_df.columns[4:]])
     #transform the type of column from string to double 
     #perform UDF on columns starting from the forth


In [10]:
df.select('url', 'alchemy_category', 'alchemy_category_score', 'is_news', 'label').show(10)

+--------------------+------------------+----------------------+-------+-----+
|                 url|  alchemy_category|alchemy_category_score|is_news|label|
+--------------------+------------------+----------------------+-------+-----+
|http://www.bloomb...|          business|              0.789131|    1.0|  0.0|
|http://www.popsci...|        recreation|              0.574147|    1.0|  1.0|
|http://www.menshe...|            health|              0.996526|    1.0|  1.0|
|http://www.dumbli...|            health|              0.801248|    1.0|  1.0|
|http://bleacherre...|            sports|              0.719157|    1.0|  0.0|
|http://www.conven...|                 ?|                   0.0|    0.0|  0.0|
|http://gofashionl...|arts_entertainment|               0.22111|    1.0|  1.0|
|http://www.inside...|                 ?|                   0.0|    0.0|  0.0|
|http://www.valetm...|                 ?|                   0.0|    1.0|  1.0|
|http://www.howswe...|                 ?|           

In [11]:
print (df.printSchema())

root
 |-- url: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: double (nullable = true)
 |-- avglinksize: double (nullable = true)
 |-- commonlinkratio_1: double (nullable = true)
 |-- commonlinkratio_2: double (nullable = true)
 |-- commonlinkratio_3: double (nullable = true)
 |-- commonlinkratio_4: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- embed_ratio: double (nullable = true)
 |-- framebased: double (nullable = true)
 |-- frameTagRatio: double (nullable = true)
 |-- hasDomainLink: double (nullable = true)
 |-- html_ratio: double (nullable = true)
 |-- image_ratio: double (nullable = true)
 |-- is_news: double (nullable = true)
 |-- lengthyLinkDomain: double (nullable = true)
 |-- linkwordscore: double (nullable = true)
 |-- news_front_page: double (nullable = true)
 |-- non_markup_alphanum_characters: double (nullable = true)
 |-- numberOfLinks: double (nullable = true)
 |-- numwords_in_url: d

# Split data into training set & test set

In [12]:
train_df, test_df = df.randomSplit([0.7, 0.3])

In [14]:
train_df.count(), test_df.count()

(5152, 2243)

In [17]:
train_df.take(1)

[Row(url='http://1000awesomethings.com/2008/12/29/864-mastering-the-art-of-the-all-you-can-eat-buffet/', alchemy_category='arts_entertainment', alchemy_category_score=0.5811, avglinksize=2.526785714, commonlinkratio_1=0.680672269, commonlinkratio_2=0.327731092, commonlinkratio_3=0.268907563, commonlinkratio_4=0.260504202, compression_ratio=0.471047228, embed_ratio=0.0, framebased=0.0, frameTagRatio=0.027673897, hasDomainLink=0.0, html_ratio=0.218407242, image_ratio=0.105263158, is_news=1.0, lengthyLinkDomain=1.0, linkwordscore=13.0, news_front_page=0.0, non_markup_alphanum_characters=16382.0, numberOfLinks=238.0, numwords_in_url=8.0, parametrizedLinkRatio=0.336134454, spelling_errors_ratio=0.094298246, label=1.0)]

# Prepare data #1 - StringIndexer

In [18]:
from pyspark.ml.feature import StringIndexer
categoryIndexer = StringIndexer(
                    inputCol='alchemy_category',
                    outputCol='alchemy_category_Index')

In [19]:
categoryTransformer=categoryIndexer.fit(df)

In [20]:
type(categoryTransformer)

pyspark.ml.feature.StringIndexerModel

In [23]:
df_1=categoryTransformer.transform(train_df)

In [25]:
#index category of each record
df_1.select("alchemy_category", "alchemy_category_Index").show(15)

+------------------+----------------------+
|  alchemy_category|alchemy_category_Index|
+------------------+----------------------+
|arts_entertainment|                   2.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|          business|                   3.0|
|arts_entertainment|                   2.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|        recreation|                   1.0|
| computer_internet|                   7.0|
|          business|                   3.0|
|arts_entertainment|                   2.0|
|            sports|                   5.0|
|                 ?|                   0.0|
+------------------+----------------------+
only showing top 15 rows



# Prepare data #2 - OneHotEncoder

In [33]:
from pyspark.ml.feature import OneHotEncoderEstimator
encoder = OneHotEncoderEstimator(dropLast=False,
                       inputCols=["alchemy_category_Index"],
                       outputCols=["alchemy_category_IndexVec"])

In [35]:
#Fits a model to the input dataset with optional parameters.
df_2 = encoder.fit(df_1)

In [37]:
#Transforms the input dataset with optional parameters.
df_2 = df_2.transform(df_1)

In [39]:
df_2.select("alchemy_category", "alchemy_category_Index", "alchemy_category_IndexVec").show(15)

+------------------+----------------------+-------------------------+
|  alchemy_category|alchemy_category_Index|alchemy_category_IndexVec|
+------------------+----------------------+-------------------------+
|arts_entertainment|                   2.0|           (14,[2],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|          business|                   3.0|           (14,[3],[1.0])|
|arts_entertainment|                   2.0|           (14,[2],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|                 ?|                   0.0|           (14,[0],[1.0])|
|        recreation|                   1.0|           (14,[1],[1.0])|
| computer_internet|                   7.0|           (14,[7],[1.0])|
|          business|

# Prepare data#3 - VectorAssembler

In [44]:
from pyspark.ml.feature import VectorAssembler
#select all columns without "label" and the first three columns
assemblerInputs=['alchemy_category_IndexVec']+row_df.columns[4:-1]
print(assemblerInputs)

['alchemy_category_IndexVec', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio']


In [47]:
#VectorAssembler is a transformer that combines a given list of columns into a single vector column. 
assembler = VectorAssembler(
            inputCols=assemblerInputs,
            outputCol="features")

In [48]:
df_3 = assembler.transform(df_2)

In [58]:
print(df_3.columns)

['url', 'alchemy_category', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'alchemy_category_Index', 'alchemy_category_IndexVec', 'features']


In [53]:
df_3.select('features').show(2)

+--------------------+
|            features|
+--------------------+
|(36,[2,14,15,16,1...|
|(36,[0,15,16,20,2...|
+--------------------+
only showing top 2 rows



In [54]:
df_3.select('features').take(1)

[Row(features=SparseVector(36, {2: 1.0, 14: 0.5811, 15: 2.5268, 16: 0.6807, 17: 0.3277, 18: 0.2689, 19: 0.2605, 20: 0.471, 23: 0.0277, 25: 0.2184, 26: 0.1053, 27: 1.0, 28: 1.0, 29: 13.0, 31: 16382.0, 32: 238.0, 33: 8.0, 34: 0.3361, 35: 0.0943}))]

# Build a decision tree model

In [60]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                           impurity="gini", maxDepth=10, maxBins=14)

In [61]:
dt_model = dt.fit(df_3)

In [62]:
print(dt_model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4abe8d5a650c6501b3fc) of depth 10 with 721 nodes


In [64]:
df_4 = dt_model.transform(df_3)

In [65]:
df_3.count()

5152

# Build ML pipeline

In [69]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

StringIndexer = StringIndexer(
                    inputCol='alchemy_category',
                    outputCol='alchemy_category_Index')

encoder = OneHotEncoderEstimator(dropLast=False,
                       inputCols=["alchemy_category_Index"],
                       outputCols=["alchemy_category_IndexVec"])

assemblerInputs=['alchemy_category_IndexVec']+row_df.columns[4:-1]
assembler = VectorAssembler(
            inputCols=assemblerInputs,
            outputCol="features")

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                           impurity="gini", maxDepth=10, maxBins=14)

dt_pipeline = Pipeline(stages=[StringIndexer,encoder,assembler, dt])


In [71]:
dt_pipeline.getStages()

[StringIndexer_47929903b50f674f8a35,
 OneHotEncoderEstimator_4982a7da1890f6f173bc,
 VectorAssembler_485e86381d632fdb53b4,
 DecisionTreeClassifier_4ecb8eeab1eff1752f8e]

In [73]:
pipelineModel=dt_pipeline.fit(train_df)

In [74]:
print(pipelineModel.stages[3].toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ecb8eeab1eff1752f8e) of depth 10 with 721 nodes
  If (feature 31 <= 1798.0)
   If (feature 2 in {1.0})
    If (feature 26 <= 0.20197163950000002)
     If (feature 25 <= 0.166685731)
      Predict: 1.0
     Else (feature 25 > 0.166685731)
      If (feature 20 <= 0.485028182)
       If (feature 16 <= 0.36377084449999997)
        If (feature 14 <= 0.3662915)
         Predict: 1.0
        Else (feature 14 > 0.3662915)
         If (feature 33 <= 7.5)
          Predict: 0.0
         Else (feature 33 > 7.5)
          Predict: 1.0
       Else (feature 16 > 0.36377084449999997)
        If (feature 16 <= 0.5582905465)
         Predict: 1.0
        Else (feature 16 > 0.5582905465)
         If (feature 14 <= 0.7377425)
          Predict: 0.0
         Else (feature 14 > 0.7377425)
          Predict: 1.0
      Else (feature 20 > 0.485028182)
       If (feature 35 <= 0.078101539)
        If (feature 31 <= 1315.5)
         If (feature 29 <= 

# Predict the result with test set

In [75]:
predicted=pipelineModel.transform(test_df)

In [76]:
print(predicted.columns)

['url', 'alchemy_category', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'alchemy_category_Index', 'alchemy_category_IndexVec', 'features', 'rawPrediction', 'probability', 'prediction']


In [77]:
predicted.count()

2243

In [78]:
predicted.select('url', 'features', 'rawprediction', 'probability', 'label', 'prediction').show(10)

+--------------------+--------------------+-------------+--------------------+-----+----------+
|                 url|            features|rawprediction|         probability|label|prediction|
+--------------------+--------------------+-------------+--------------------+-----+----------+
|http://1000awesom...|(36,[0,15,16,17,1...| [15.0,128.0]|[0.10489510489510...|  1.0|       1.0|
|http://1000awesom...|(36,[1,14,15,16,1...|    [0.0,8.0]|           [0.0,1.0]|  1.0|       1.0|
|http://100miledie...|(36,[0,15,20,23,2...|    [0.0,9.0]|           [0.0,1.0]|  0.0|       1.0|
|http://17andbakin...|(36,[3,14,15,16,1...|    [1.0,0.0]|           [1.0,0.0]|  1.0|       0.0|
|http://17andbakin...|(36,[2,14,15,16,1...|[127.0,150.0]|[0.45848375451263...|  1.0|       1.0|
|http://1x.com/pho...|(36,[0,15,16,17,2...|[127.0,204.0]|[0.38368580060422...|  1.0|       1.0|
|http://2oddities....|(36,[8,14,15,16,1...|    [3.0,0.0]|           [1.0,0.0]|  1.0|       0.0|
|http://3kidsandus...|(36,[0,15,16,17,1.

In [81]:
#probability=[probability of 0, probability of 1]
predicted.select('probability', 'label', 'prediction').take(10)

[Row(probability=DenseVector([0.1049, 0.8951]), label=1.0, prediction=1.0),
 Row(probability=DenseVector([0.0, 1.0]), label=1.0, prediction=1.0),
 Row(probability=DenseVector([0.0, 1.0]), label=0.0, prediction=1.0),
 Row(probability=DenseVector([1.0, 0.0]), label=1.0, prediction=0.0),
 Row(probability=DenseVector([0.4585, 0.5415]), label=1.0, prediction=1.0),
 Row(probability=DenseVector([0.3837, 0.6163]), label=1.0, prediction=1.0),
 Row(probability=DenseVector([1.0, 0.0]), label=1.0, prediction=0.0),
 Row(probability=DenseVector([0.1049, 0.8951]), label=0.0, prediction=1.0),
 Row(probability=DenseVector([0.9796, 0.0204]), label=1.0, prediction=0.0),
 Row(probability=DenseVector([0.7857, 0.2143]), label=0.0, prediction=0.0)]

# Set up an evaluator

In [82]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator=BinaryClassificationEvaluator(
            rawPredictionCol = "rawPrediction",
            labelCol="label",
            metricName="areaUnderROC")

predictions=pipelineModel.transform(test_df)
auc = evaluator.evaluate(predictions)
auc

0.6196074452508276