In [146]:
# import libraries
import findspark
findspark.init()
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier, LinearSVC,OneVsRest,MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF
#import featureExtractor.tfidf as tfidf
from functools import reduce
from nltk.corpus import stopwords

try:
    sc.stop()
except NameError:
    print(":)")
sc = SparkContext()
sqlContext = SQLContext(sc)

def readData(dataPath):
    #input: file path 
    #output: returns a spark dataframe 
    data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(dataPath)
    print("Data Read successfully ...")
    drop_list = ['url']
    
    data = data.select([column for column in data.columns if column not in drop_list])
    return data.dropna()



def tfidf(regexTokenizer, stopwordsRemover, label_stringIdx):
    #returns pipeline using tfidf 
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=5000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
    return Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

def countVectorizer(regexTokenizer, stopwordsRemover, label_stringIdx):
    # bag of words count
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
    return Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])   


def get_stopwords():
    stop_words = set(stopwords.words('english'))
    special = ['ms','mr','http','https','amp', 'none', 'i’m', 'th','don’t','it’s','advertisement']
    with open('seostopwords.txt') as f:
        stopLines = f.read().splitlines()

    dump = [stop_words.add(i) for i in special]
    dump = [stop_words.add(i) for i in stopLines]
    return list(stop_words)
    
def classify(featureExtractor, classifier):
    #input : 
        #1)feature extractor (count vectorizer or tfidf)
        #2) classifier algorithm (random forest, naive bayes or linear reg)
        
    # training :70% , testing 15% , validation 15%
    
    
    #Part 1:  Combine data from all sources 
    data_merged = readData('politics.csv')
    
    fnames = ['sports.csv','business.csv','movies.csv','tourism.csv','technology.csv',\
              'assault.csv','students and education.csv', 'terrorism.csv', 'med_and_health.csv',]
    
    for i in fnames:
        temp = readData(i)
        data_merged = data_merged.union(temp)
       
    data = data_merged.withColumnRenamed("category", "Category").withColumnRenamed("content", "Descript")
    print("The total number of articles from ", len(fnames)+1 ," categories is " , data.count())
        
    # Part 2 -> build regex tokenizer 
    regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")
    
    # stop words ---------- (3)
    add_stopwords = get_stopwords()
    #print(add_stopwords)
    
    # Part 3 -> build stop word tokenizer 
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered",stopWords=add_stopwords)
    #print(stopwordsRemover.transform())
    label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

    # Part 4 -> build pipeline 
    if featureExtractor=="cv":
        pipeline = countVectorizer(regexTokenizer, stopwordsRemover, label_stringIdx)
    elif featureExtractor=="tfidf":
        pipeline = tfidf(regexTokenizer, stopwordsRemover, label_stringIdx)
    print("Pipeline constructed successfully..")

    # Fit the pipeline to the model.
    pipelineFit = pipeline.fit(data)
    
    dataset = pipelineFit.transform(data)
    #dataset.show(1)
    dataset_row = str(dataset.select('features').take(1))
    num_feat = int(re.findall('(\d+?), \{',dataset_row)[0])
    
    # Part 5 -> split data and classify
    (trainingData, validationData, testData) = dataset.randomSplit([0.7, 0.15, 0.15], seed = 100)
    
    
    print("Training model..")
    if classifier=="lr":
        selectedModel = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
    elif classifier=="nb":
        selectedModel = NaiveBayes(smoothing=1)
    elif classifier=="rf":
        selectedModel = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 300, \
                            maxDepth = 20, \
                            maxBins = 2)
    elif classifier == 'perc':
        selectedModel = MultilayerPerceptronClassifier(maxIter=100, layers=[num_feat,156,75,10],\
                                                       blockSize=256, seed=1234)
    #elif classifier == 'svm':
    #    selectedModel = OneVsRest(classifier = LinearSVC(maxIter=10, regParam=0.1))

    model = selectedModel.fit(trainingData)
    print("Model Trained..")
    
    # part 6 -> predict and test model
    print("Test model..")
    predictions = model.transform(validationData)
    predictions.filter(predictions['prediction'] == 0) \
        .select("Descript","Category","probability","label","prediction") \
        .orderBy("probability", ascending=False)

    print("Evaluate model..")
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy: "+str(accuracy))

    

In [147]:
def main():
    print("Please select method for feature extraction. Type\ncv for count vectorizer\ntfidf for term \
frequency inverse document frequency")
    fe_choice = 'cv'
    fe_choice = input()
    print("\nPlese enter algorithm for classification. Type\nlr for linear regression\nnb for naive bayes\n\
rf for random forest")
    algo_choice = 'nb'
    algo_choice = input()
    classify(fe_choice,algo_choice)
    
    

In [148]:
main()

Please select method for feature extraction. Type
cv for count vectorizer
tfidf for term frequency inverse document frequency
cv

Plese enter algorithm for classification. Type
lr for linear regression
nb for naive bayes
rf for random forest
svm
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
Data Read successfully ...
The total number of articles from  10  categories is  467
Pipeline constructed successfully..
Training model..
Model Trained..
Test model..


AnalysisException: "cannot resolve '`probability`' given input columns: [words, label, prediction, Category, filtered, Descript, features];;\n'Project [Descript#33626, Category#33623, 'probability, label#33737, prediction#34758]\n+- AnalysisBarrier\n      +- Filter (prediction#34758 = cast(0 as double))\n         +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, prediction#34758]\n            +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34750, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34750) AS prediction#34758]\n               +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpf8e418f5-b5a0-4f06-a127-7fa80a3a6dd8#34733 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34750]\n                  +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpf8e418f5-b5a0-4f06-a127-7fa80a3a6dd8#34733]\n                     +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34697, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689, rawPrediction#34697) AS mbc$tmpf8e418f5-b5a0-4f06-a127-7fa80a3a6dd8#34733]\n                        +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34697, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689]\n                           +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689, rawPrediction#34697, UDF(rawPrediction#34697) AS prediction#34706]\n                              +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689, UDF(features#33731) AS rawPrediction#34697]\n                                 +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpdb729cc7-34f4-46ea-92c0-01d0619b0282#34672 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34689]\n                                    +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpdb729cc7-34f4-46ea-92c0-01d0619b0282#34672]\n                                       +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34636, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628, rawPrediction#34636) AS mbc$tmpdb729cc7-34f4-46ea-92c0-01d0619b0282#34672]\n                                          +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34636, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628]\n                                             +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628, rawPrediction#34636, UDF(rawPrediction#34636) AS prediction#34645]\n                                                +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628, UDF(features#33731) AS rawPrediction#34636]\n                                                   +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp8016e523-eb2f-48fb-b058-60851a439699#34611 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34628]\n                                                      +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp8016e523-eb2f-48fb-b058-60851a439699#34611]\n                                                         +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34575, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567, rawPrediction#34575) AS mbc$tmp8016e523-eb2f-48fb-b058-60851a439699#34611]\n                                                            +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34575, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567]\n                                                               +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567, rawPrediction#34575, UDF(rawPrediction#34575) AS prediction#34584]\n                                                                  +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567, UDF(features#33731) AS rawPrediction#34575]\n                                                                     +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp0b2e8a11-995b-47c1-a5f2-28da21793b8e#34550 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34567]\n                                                                        +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp0b2e8a11-995b-47c1-a5f2-28da21793b8e#34550]\n                                                                           +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34514, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506, rawPrediction#34514) AS mbc$tmp0b2e8a11-995b-47c1-a5f2-28da21793b8e#34550]\n                                                                              +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34514, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506]\n                                                                                 +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506, rawPrediction#34514, UDF(rawPrediction#34514) AS prediction#34523]\n                                                                                    +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506, UDF(features#33731) AS rawPrediction#34514]\n                                                                                       +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp3d84902e-6c98-472b-97ee-62af87e3fa14#34489 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34506]\n                                                                                          +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp3d84902e-6c98-472b-97ee-62af87e3fa14#34489]\n                                                                                             +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34453, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445, rawPrediction#34453) AS mbc$tmp3d84902e-6c98-472b-97ee-62af87e3fa14#34489]\n                                                                                                +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34453, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445]\n                                                                                                   +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445, rawPrediction#34453, UDF(rawPrediction#34453) AS prediction#34462]\n                                                                                                      +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445, UDF(features#33731) AS rawPrediction#34453]\n                                                                                                         +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp136e2ade-4211-417c-87e5-5cd84c79927e#34428 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34445]\n                                                                                                            +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp136e2ade-4211-417c-87e5-5cd84c79927e#34428]\n                                                                                                               +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34392, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384, rawPrediction#34392) AS mbc$tmp136e2ade-4211-417c-87e5-5cd84c79927e#34428]\n                                                                                                                  +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34392, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384]\n                                                                                                                     +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384, rawPrediction#34392, UDF(rawPrediction#34392) AS prediction#34401]\n                                                                                                                        +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384, UDF(features#33731) AS rawPrediction#34392]\n                                                                                                                           +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpf513574a-1654-4b6a-941e-421dac47b62f#34367 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34384]\n                                                                                                                              +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpf513574a-1654-4b6a-941e-421dac47b62f#34367]\n                                                                                                                                 +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34331, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323, rawPrediction#34331) AS mbc$tmpf513574a-1654-4b6a-941e-421dac47b62f#34367]\n                                                                                                                                    +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34331, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323]\n                                                                                                                                       +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323, rawPrediction#34331, UDF(rawPrediction#34331) AS prediction#34340]\n                                                                                                                                          +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323, UDF(features#33731) AS rawPrediction#34331]\n                                                                                                                                             +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpe20820ba-4be6-4c77-8ea4-96d39141341e#34306 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34323]\n                                                                                                                                                +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmpe20820ba-4be6-4c77-8ea4-96d39141341e#34306]\n                                                                                                                                                   +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34270, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262, rawPrediction#34270) AS mbc$tmpe20820ba-4be6-4c77-8ea4-96d39141341e#34306]\n                                                                                                                                                      +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34270, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262]\n                                                                                                                                                         +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262, rawPrediction#34270, UDF(rawPrediction#34270) AS prediction#34279]\n                                                                                                                                                            +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262, UDF(features#33731) AS rawPrediction#34270]\n                                                                                                                                                               +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp983c218c-53d0-4fb7-9b32-c84767f3d62e#34245 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34262]\n                                                                                                                                                                  +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp983c218c-53d0-4fb7-9b32-c84767f3d62e#34245]\n                                                                                                                                                                     +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34209, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201, rawPrediction#34209) AS mbc$tmp983c218c-53d0-4fb7-9b32-c84767f3d62e#34245]\n                                                                                                                                                                        +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34209, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201]\n                                                                                                                                                                           +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201, rawPrediction#34209, UDF(rawPrediction#34209) AS prediction#34218]\n                                                                                                                                                                              +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201, UDF(features#33731) AS rawPrediction#34209]\n                                                                                                                                                                                 +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp2b47f7d2-8ec9-4915-85aa-3ea635dcb9c9#34184 AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34201]\n                                                                                                                                                                                    +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$tmp2b47f7d2-8ec9-4915-85aa-3ea635dcb9c9#34184]\n                                                                                                                                                                                       +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34148, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140, <lambda>(mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140, rawPrediction#34148) AS mbc$tmp2b47f7d2-8ec9-4915-85aa-3ea635dcb9c9#34184]\n                                                                                                                                                                                          +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, rawPrediction#34148, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140]\n                                                                                                                                                                                             +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140, rawPrediction#34148, UDF(rawPrediction#34148) AS prediction#34157]\n                                                                                                                                                                                                +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140, UDF(features#33731) AS rawPrediction#34148]\n                                                                                                                                                                                                   +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, label#33737, <lambda>(Descript#33626) AS mbc$acc668beb02-be22-4f0c-a813-d1cfa7d9b716#34140]\n                                                                                                                                                                                                      +- Sample 0.7, 0.85, false, 100\n                                                                                                                                                                                                         +- Sort [Descript#33626 ASC NULLS FIRST, Category#33623 ASC NULLS FIRST, words#33722 ASC NULLS FIRST, filtered#33726 ASC NULLS FIRST, features#33731 ASC NULLS FIRST, label#33737 ASC NULLS FIRST], false\n                                                                                                                                                                                                            +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, features#33731, UDF(cast(Category#33623 as string)) AS label#33737]\n                                                                                                                                                                                                               +- Project [Descript#33626, Category#33623, words#33722, filtered#33726, UDF(filtered#33726) AS features#33731]\n                                                                                                                                                                                                                  +- Project [Descript#33626, Category#33623, words#33722, UDF(words#33722) AS filtered#33726]\n                                                                                                                                                                                                                     +- Project [Descript#33626, Category#33623, UDF(Descript#33626) AS words#33722]\n                                                                                                                                                                                                                        +- Project [content#33406 AS Descript#33626, Category#33623]\n                                                                                                                                                                                                                           +- Project [content#33406, category#33407 AS Category#33623]\n                                                                                                                                                                                                                              +- Union\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33406,category#33407)\n                                                                                                                                                                                                                                 :  +- Project [content#33406, category#33407]\n                                                                                                                                                                                                                                 :     +- Relation[url#33405,content#33406,category#33407] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33427,category#33428)\n                                                                                                                                                                                                                                 :  +- Project [content#33427, category#33428]\n                                                                                                                                                                                                                                 :     +- Relation[url#33426,content#33427,category#33428] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33450,category#33451)\n                                                                                                                                                                                                                                 :  +- Project [content#33450, category#33451]\n                                                                                                                                                                                                                                 :     +- Relation[url#33449,content#33450,category#33451] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33473,category#33474)\n                                                                                                                                                                                                                                 :  +- Project [content#33473, category#33474]\n                                                                                                                                                                                                                                 :     +- Relation[url#33472,content#33473,category#33474] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33496,category#33497)\n                                                                                                                                                                                                                                 :  +- Project [content#33496, category#33497]\n                                                                                                                                                                                                                                 :     +- Relation[url#33495,content#33496,category#33497] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33519,category#33520)\n                                                                                                                                                                                                                                 :  +- Project [content#33519, category#33520]\n                                                                                                                                                                                                                                 :     +- Relation[url#33518,content#33519,category#33520] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33542,category#33543)\n                                                                                                                                                                                                                                 :  +- Project [content#33542, category#33543]\n                                                                                                                                                                                                                                 :     +- Relation[url#33541,content#33542,category#33543] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33565,category#33566)\n                                                                                                                                                                                                                                 :  +- Project [content#33565, category#33566]\n                                                                                                                                                                                                                                 :     +- Relation[url#33564,content#33565,category#33566] csv\n                                                                                                                                                                                                                                 :- Filter AtLeastNNulls(n, content#33588,category#33589)\n                                                                                                                                                                                                                                 :  +- Project [content#33588, category#33589]\n                                                                                                                                                                                                                                 :     +- Relation[url#33587,content#33588,category#33589] csv\n                                                                                                                                                                                                                                 +- Filter AtLeastNNulls(n, content#33611,category#33612)\n                                                                                                                                                                                                                                    +- Project [content#33611, category#33612]\n                                                                                                                                                                                                                                       +- Relation[url#33610,content#33611,category#33612] csv\n"

In [72]:
    cv    tfidf
lr 0.621  0.657
nb 0.822  0.782
rf 0.707  0.656
mp 0.796  0.786

SyntaxError: invalid syntax (<ipython-input-72-d1534a5b37fa>, line 1)