### set up pyspark
#### source: https://spark.apache.org/docs/1.6.1/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression

In [1]:
import glob
import os
import sys

# Root of the local Spark distribution. A SPARK_HOME that is already exported
# takes precedence so the notebook is not tied to one machine's directory
# layout; the original location is kept as the fallback.
spark_path = os.environ.get("SPARK_HOME") or "C:/stack/spark-1.6.2-bin-hadoop2.6"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path
os.environ["HADOOP_USER_NAME"] = "weizhong"

# The bundled py4j archive is versioned per Spark release, so discover it
# instead of pinning the name; fall back to the 0.9 archive used originally.
py4j_archives = glob.glob(spark_path + "/python/lib/py4j-*-src.zip") or [
    spark_path + "/python/lib/py4j-0.9-src.zip"
]

# Make the PySpark sources importable. The membership check keeps sys.path
# free of duplicates when this cell is executed more than once.
for path_entry in [
    spark_path + "/bin",
    spark_path + "/python",
    spark_path + "/python/pyspark/",
    spark_path + "/python/lib",
    spark_path + "/python/lib/pyspark.zip",
] + py4j_archives:
    if path_entry not in sys.path:
        sys.path.append(path_entry)

# These imports must come after the sys.path additions above.
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail the way a second SparkContext() call would.
sc = SparkContext.getOrCreate()
# NOTE(review): in Spark 1.x creating the SQLContext is also what makes
# RDD.toDF() available to the later cells - confirm before removing it.
sqlCtx = SQLContext(sc)

### create training dataset

In [3]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

# Two weighted training examples: a positive one whose single feature is 1.0
# and a negative one whose 1-dimensional sparse vector has no stored entries.
training_rows = [
    Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
    Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], [])),
]
df = sc.parallelize(training_rows).toDF()

### logistic regression object

In [5]:
from pyspark.ml.classification import LogisticRegression

# Estimator configuration; "weight" names the per-row weight column of the
# training DataFrame.
lr = LogisticRegression(
    maxIter=5,
    regParam=0.01,
    weightCol="weight",
)

### fit the model

In [6]:
# Train the configured estimator on the training DataFrame; the fitted model
# is what the later cells inspect and use for predictions.
model = lr.fit(df)

### look at the coefficients

In [11]:
# Show the fitted coefficient vector first, then the intercept, one per line.
for fitted_value in (model.coefficients, model.intercept):
    print(fitted_value)

[5.53821880441]
-2.6858231547025317


### create a testing set

In [13]:
# Single unlabeled example whose only feature is -1.0.
test0_rows = [Row(features=Vectors.dense(-1.0))]
test0 = sc.parallelize(test0_rows).toDF()

### make prediction on testing set

In [17]:
# Score the test frame, take its first row, and print the three output
# columns of that row in a fixed order.
scored_test0 = model.transform(test0)
result = scored_test0.head()
for column_name in ("prediction", "probability", "rawPrediction"):
    print(getattr(result, column_name))

0.0
[0.999731942769,0.000268057230858]
[8.22404195911,-8.22404195911]


### create another testing set

In [18]:
# Single unlabeled example stored sparsely: size 1 with value 1.0 at index 0.
test1_rows = [Row(features=Vectors.sparse(1, [0], [1.0]))]
test1 = sc.parallelize(test1_rows).toDF()

### make prediction

In [20]:
# Score the second test frame and display the predicted label of its first row.
first_scored_row = model.transform(test1).head()
first_scored_row.prediction

1.0

In [21]:
# setParams rejects positional arguments. That rejection is what this cell
# demonstrates, so the TypeError is caught and its message printed instead of
# letting the exception abort a "Restart & Run All" before the cleanup cell.
try:
    lr.setParams("vector")
except TypeError as err:
    print("TypeError: " + str(err))

TypeError: Method setParams forces keyword arguments.

In [22]:
# Shut down the SparkContext created in the setup cell now that the
# walkthrough is finished.
sc.stop()