In [1]:
# Imports
from pyspark.ml import Pipeline 

from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator 

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.feature import HashingTF, IDF, Tokenizer 

from pyspark.ml.feature import OneHotEncoder, IndexToString, StringIndexer, VectorIndexer, VectorAssembler

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark import SparkContext 
from pyspark.sql.functions import col
from pyspark import SQLContext 
from sklearn.metrics import confusion_matrix
from pyspark.ml.feature import StopWordsRemover
import matplotlib.pyplot as plt
import itertools
import numpy as np
import pickle
import random


In [2]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citiation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """


    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [3]:
# Import Pickle
with open('filename.pickle', 'rb') as handle: 
    b = pickle.load(handle) 

listData = b['DF']
random.shuffle(listData)
random.shuffle(listData)

In [4]:
# Create Spark Comtext"
sc = SparkContext.getOrCreate() 
spark = SQLContext(sc) 

In [5]:
# Load Data in spark data frame 
sentenceData = spark.createDataFrame(listData)



In [7]:
# Rename column label since label is used in further down pipeline
sentenceData = sentenceData.withColumnRenamed("label","category")
sentenceData.show()

+--------------------+--------+
|             article|category|
+--------------------+--------+
|When I was 7, my ...|  Sports|
|Pat DiNizio, the ...|   Music|
|Our guide to pop ...|   Music|
|EDUCATED A Memoir...|Politics|
|Patience may pay ...|  Sports|
|Maryland could be...|   Music|
|WASHINGTON —  The...|Politics|
|“I scream my lung...|   Music|
|Our guide to the ...|   Music|
|SAN FRANCISCO —  ...|Business|
|PALM HARBOR, Fla....|  Sports|
|In October, I wro...|Business|
|BOSTON — Look, I ...|  Sports|
|New York Universi...|  Sports|
|WASHINGTON — Robu...|Business|
|The New York Time...|Business|
|LIMA, Peru —  Pan...|Politics|
|The Library of Co...|   Music|
|CHICAGO —  The La...|Politics|
|NEW YORK —  The N...|  Sports|
+--------------------+--------+
only showing top 20 rows



In [8]:
# Train Test split
training, test = sentenceData.randomSplit([0.8, 0.2], seed=12345)

In [9]:

tokenizer = Tokenizer(inputCol="article",
                      outputCol="words") 

remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                           outputCol="filtered")

hashingTF = HashingTF(inputCol=remover.getOutputCol(), 
                      outputCol="features") 

label_stringIdx = StringIndexer(inputCol = "category", 
                                outputCol = "label").fit(training)

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVC

In [34]:
lsvc = LinearSVC(maxIter=10, regParam=0.1,featuresCol=hashingTF.getOutputCol())

In [35]:
pipeline = Pipeline(stages=[tokenizer,
                               remover, hashingTF, 
                               label_stringIdx,
                               lsvc]) 


In [36]:
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lsvc.maxIter,[10,20]) \
    .build()

In [37]:
crossval = CrossValidator(estimator=pipeline, 
                          estimatorParamMaps=paramGrid, 
                          evaluator=MulticlassClassificationEvaluator(), 
                          numFolds=2)

In [38]:
cvModel = crossval.fit(training) 

IllegalArgumentException: 'requirement failed: LinearSVC only supports binary classification. 4 classes detected in LinearSVC_450d90dec4868f7eeed0__labelCol'