In [1]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark as spark
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark import SQLContext
import os
import csv
from io import StringIO
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from nltk.stem.snowball import SnowballStemmer


In [None]:
# Spark entry points for this notebook, configured for a single local worker.
# getOrCreate() reuses a context/session that is already alive in this kernel,
# so re-running the cell no longer fails with "Cannot run multiple
# SparkContexts at once" the way a bare SparkContext(conf=conf) does.
# Note: when a context already exists, `conf` is not re-applied to it.
conf = SparkConf().setAppName('app').setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
hc = SparkSession.builder.config(conf=conf).getOrCreate()
# Legacy SQLContext handle; the data-loading cells read their CSV files through it.
sqlContext = SQLContext(sc)

In [14]:
# Load the training split. The CSV is read with a header row and without schema
# inference, so the label and the price are cast to numeric types explicitly.
train = (
    sqlContext.read.csv("small/train.csv", header=True)
    .withColumn("group", F.col("group").cast("int"))
    .withColumn("price", F.col("price").cast("float"))
)

In [73]:
# Text features for the `description` column:
#   tokens -> hashed term frequencies (2500 buckets) -> IDF-weighted vector.
# The stages are only configured here; nothing is fitted or transformed until
# they run as part of the pipeline (instead of calling fit/transform by hand).
tokenizer = Tokenizer(inputCol="description", outputCol="words")
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=2500,
)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Combine the numeric price with the TF-IDF vector into one feature column.
# handleInvalid="skip" filters out rows whose inputs are invalid (null/NaN),
# so the assembled output can contain fewer rows than the input.
assembler = VectorAssembler(
    inputCols=["price", "features"],
    outputCol="all_features",
    handleInvalid="skip",
)

In [75]:
# Classifier stage: logistic regression on the assembled feature vector,
# predicting the `group` label with regularisation strength 0.1.
lr = LogisticRegression(
    featuresCol="all_features",
    labelCol='group',
    regParam=0.1,
)

In [76]:
# Chain feature extraction and the classifier into a single estimator, then fit
# every stage in order on the training data.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        assembler,
        lr,
    ]
)
model = pipeline.fit(train)

In [77]:
# Score the training split with the fitted pipeline.
# (Uncomment to peek at the output: res_train.show(5))
res_train = model.transform(train)

In [90]:
# Persist the fitted pipeline. overwrite() replaces an existing "lr.model"
# directory; a plain save() raises when the path already exists, which made the
# notebook fail on every run after the first.
model.write().overwrite().save("lr.model")

In [64]:
#res_train.select(['group','rawPrediction','probability','prediction']).show(5)

In [65]:
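# NOTE(review): disabled scratch code, kept for reference only. It serialised
# the scored training rows to CSV text and wrote them out via saveAsTextFile.
# def row2csv(row):
#     buffer = StringIO()
#     writer = csv.writer(buffer)
#     writer.writerow([str(s).encode("utf-8") for s in row])
#     buffer.seek(0)
#     return buffer.read().strip()

# res_train.select(['group','rawPrediction','probability','prediction']).rdd.map(row2csv).coalesce(1).saveAsTextFile("pred_sample.csv")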

In [84]:
def evaluate_results(res):
    """Print the overall classification accuracy of a scored DataFrame.

    Args:
        res: DataFrame holding the true label in `group` and the model output
            in `prediction` (e.g. the result of the fitted pipeline's
            ``transform``).

    Returns:
        None. The accuracy is written to stdout as ``Accuracy = <value>``.
    """
    # MulticlassMetrics consumes (prediction, label) pairs of doubles, in that
    # order. Passing (label, prediction) leaves accuracy unchanged but would
    # transpose the confusion matrix and swap precision/recall, so the columns
    # are selected in the documented order and cast explicitly.
    prediction_and_label = res.select(
        res["prediction"].cast("double").alias("prediction"),
        res["group"].cast("double").alias("label"),
    )
    metrics = MulticlassMetrics(prediction_and_label.rdd)
    print("Accuracy = %s" % metrics.accuracy)
evaluate_results(res_train)

Accuracy = 0.8723837999456374


In [88]:
# Load the held-out test split with the same header handling and numeric casts
# that were applied to the training data.
test = (
    sqlContext.read.csv("small/test.csv", header=True)
    .withColumn("group", F.col("group").cast("int"))
    .withColumn("price", F.col("price").cast("float"))
)

In [94]:
# Reload the persisted pipeline from disk to confirm the saved artefact is
# usable on its own, independent of the in-memory `model`.
from pyspark.ml import PipelineModel

model2 = PipelineModel.load('lr.model')

In [95]:
# Score the test split with the reloaded pipeline and report its accuracy.
res_test = model2.transform(test)
evaluate_results(res_test)

Accuracy = 0.6801258521237545
