In [1]:
#!aws s3 cp s3://msan694-group/final_nba.csv final_nba.csv
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [2]:
sc

<pyspark.context.SparkContext at 0x10466d6d0>

In [3]:
#rdd_vec = sc.textFile("final_nba.csv")
rdd_vec = sc.textFile("NBA_Int2.csv")
header = rdd_vec.first() #extract header
rdd_vec.take(5)

[u'label,shot_number,dribbles,touch_time,shot_dist,pts_type,close_def_dist,home,time,matching_pos,consec_shots,shooter_c,shooter_pf,shooter_pg,shooter_pg_sg,shooter_sf,shooter_sg,shooter_sg_pg,defender_c,defender_pf,defender_pf_sf,defender_pg,defender_pg_sg,defender_sf,defender_sf_pf,defender_sf_sg,defender_sg,defender_sg_pg,defender_sg_sf,dps,shot_clock,shotnum_shotdist,dribble_touch,shotdist_defdist,touch_shotdist',
 u'1,1,2,1.9,7.7,2,1.3,0,10.85,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.052631579,10.8,7.7,3.8,10.01,14.63',
 u'0,2,0,0.8,28.2,3,6.1,0,11.76666667,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3.4,56.4,0,172.02,22.56',
 u'0,3,3,2.7,10.1,2,0.9,0,12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.111111111,0,30.3,8.1,9.09,27.27',
 u'0,4,2,1.9,17.2,2,3.4,0,12.21666667,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.052631579,10.3,68.8,3.8,58.48,32.68']

In [4]:
data = rdd_vec.filter(lambda row: row not in header) #filter out the header!
new_rdd = data.map(lambda line: line.split(',')) # split the wide vector by ","
new_rdd.count()

128069

In [5]:
#turn to Vectors.dense (with label out front)
split_rdd = new_rdd.map(lambda line: (float(line[0]), Vectors.dense([float(c) for c in line[1:len(line)]]))) 

# Create the DataFrame from the collected RDD
full_df = sqlContext.createDataFrame(split_rdd.collect(), ["label", "features"])
full_df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[1.0,2.0,1.9,7.7,...|
|  0.0|[2.0,0.0,0.8,28.2...|
|  0.0|[3.0,3.0,2.7,10.1...|
|  0.0|[4.0,2.0,1.9,17.2...|
|  0.0|[5.0,2.0,2.7,3.7,...|
|  0.0|[6.0,2.0,4.4,18.4...|
|  0.0|[7.0,11.0,9.0,20....|
|  1.0|[8.0,3.0,2.5,3.5,...|
|  0.0|[9.0,0.0,0.8,24.6...|
|  0.0|[1.0,0.0,1.1,22.4...|
|  0.0|[2.0,8.0,7.5,24.5...|
|  1.0|[3.0,14.0,11.9,14...|
|  1.0|[4.0,2.0,2.9,5.9,...|
|  0.0|[1.0,0.0,0.8,26.4...|
|  0.0|[1.0,0.0,0.5,22.8...|
|  1.0|[2.0,3.0,2.7,24.7...|
|  0.0|[3.0,6.0,5.1,25.0...|
|  0.0|[4.0,1.0,0.9,25.6...|
|  1.0|[5.0,0.0,1.2,24.2...|
|  0.0|[1.0,2.0,2.2,25.4...|
+-----+--------------------+
only showing top 20 rows



In [6]:
%%time
(trainingData, testData) = full_df.randomSplit([0.7, 0.3])
trainingData = trainingData.cache()
testData = testData.cache()

CPU times: user 2.05 ms, sys: 1.21 ms, total: 3.26 ms
Wall time: 144 ms


In [7]:
trainingData.first()

Row(label=0.0, features=DenseVector([1.0, 0.0, 0.0, 0.1, 2.0, 1.6, 0.0, 9.9333, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0, 0.16, 0.0]))

# GBT

In [8]:
gbt = GBTClassifier(maxIter=2, maxDepth=2, labelCol="label", maxBins=3)
pipeline = Pipeline(stages=[gbt])
model = pipeline.fit(trainingData)

In [9]:
# Make predictions.
predictions = model.transform(testData)
predictions.show(20)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[1.0,0.0,-0.3,18....|       0.0|
|  0.0|[1.0,0.0,0.0,2.2,...|       1.0|
|  0.0|[1.0,0.0,0.0,7.2,...|       0.0|
|  0.0|[1.0,0.0,0.0,13.7...|       0.0|
|  0.0|[1.0,0.0,0.0,24.5...|       0.0|
|  0.0|[1.0,0.0,0.1,26.6...|       0.0|
|  0.0|[1.0,0.0,0.3,3.2,...|       1.0|
|  0.0|[1.0,0.0,0.4,4.1,...|       1.0|
|  0.0|[1.0,0.0,0.4,22.2...|       0.0|
|  0.0|[1.0,0.0,0.5,3.3,...|       1.0|
|  0.0|[1.0,0.0,0.5,25.6...|       0.0|
|  0.0|[1.0,0.0,0.6,15.8...|       0.0|
|  0.0|[1.0,0.0,0.6,24.4...|       0.0|
|  0.0|[1.0,0.0,0.6,24.9...|       0.0|
|  0.0|[1.0,0.0,0.7,1.4,...|       1.0|
|  0.0|[1.0,0.0,0.7,1.6,...|       1.0|
|  0.0|[1.0,0.0,0.7,2.7,...|       1.0|
|  0.0|[1.0,0.0,0.7,6.7,...|       0.0|
|  0.0|[1.0,0.0,0.7,17.1...|       0.0|
|  0.0|[1.0,0.0,0.7,21.2...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [10]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy*100))

Accuracy = 59.8043


In [33]:
evaluator = MulticlassClassificationEvaluator(
labelCol="label", predictionCol="prediction", metricName="accuracy")

In [34]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth, [3,6]).addGrid(gbt.maxIter, [4,10]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(trainingData)

In [35]:
evaluator.evaluate(cvmodel.bestModel.transform(testData))

0.6168122553720936

In [15]:
cvmodel.bestModel

GBTClassificationModel (uid=GBTClassifier_448fb02ee2409dec56ba) with 20 trees

# Logistic Regression

In [12]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(trainingData)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(trainingData)

In [13]:
#Evaluate models using test dataset.
validpredicts = lrmodel.transform(testData)
validpredicts.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|[1.0,0.0,-0.3,18....|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.0,0.7,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.0,1.2,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.0,3.6,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.0,7.2,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.0,14.3...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.1,26.6...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.2,1.9,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.2,2.8,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.3,3.2,...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.4,14.8...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.0,0.0,0.5,22.8...|    [NaN,NaN]|  [NaN,NaN]|       0.0|
|  0.0|[1.

In [19]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(validpredicts)))

areaUnderROC:0.5


In [20]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(trainingData)

In [28]:
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(testData))

0.5

In [48]:
cvmodel.bestModel

GBTClassificationModel (uid=GBTClassifier_4b05842593e4a7f2e2ac) with 4 trees

In [57]:
from pyspark.mllib.evaluation import MulticlassMetrics
dtresrdd = validpredicts.select('prediction','label').rdd
dtmm =MulticlassMetrics(dtresrdd)
dtmm.precision()



0.6075466804979253

In [None]:
print(dtmm.confusionMatrix())

## Decision Tree

In [37]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=4)
dt_model = dt.fit(trainingData)
predictions = model.transform(testData)
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[1.0,0.0,-0.3,18....|       0.0|
|  0.0|[1.0,0.0,0.0,2.2,...|       1.0|
|  0.0|[1.0,0.0,0.0,7.2,...|       0.0|
|  0.0|[1.0,0.0,0.0,13.7...|       0.0|
|  0.0|[1.0,0.0,0.0,24.5...|       0.0|
|  0.0|[1.0,0.0,0.1,26.6...|       0.0|
|  0.0|[1.0,0.0,0.3,3.2,...|       1.0|
|  0.0|[1.0,0.0,0.4,4.1,...|       1.0|
|  0.0|[1.0,0.0,0.4,22.2...|       0.0|
|  0.0|[1.0,0.0,0.5,3.3,...|       1.0|
|  0.0|[1.0,0.0,0.5,25.6...|       0.0|
|  0.0|[1.0,0.0,0.6,15.8...|       0.0|
|  0.0|[1.0,0.0,0.6,24.4...|       0.0|
|  0.0|[1.0,0.0,0.6,24.9...|       0.0|
|  0.0|[1.0,0.0,0.7,1.4,...|       1.0|
|  0.0|[1.0,0.0,0.7,1.6,...|       1.0|
|  0.0|[1.0,0.0,0.7,2.7,...|       1.0|
|  0.0|[1.0,0.0,0.7,6.7,...|       0.0|
|  0.0|[1.0,0.0,0.7,17.1...|       0.0|
|  0.0|[1.0,0.0,0.7,21.2...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [47]:
from pyspark.sql.functions import *
predictions.select("prediction", "label", "features").show(10)
print predictions.select(avg("prediction")).show()
print predictions.select(avg("label")).show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|[1.0,0.0,-0.3,18....|
|       1.0|  0.0|[1.0,0.0,0.0,2.2,...|
|       0.0|  0.0|[1.0,0.0,0.0,7.2,...|
|       0.0|  0.0|[1.0,0.0,0.0,13.7...|
|       0.0|  0.0|[1.0,0.0,0.0,24.5...|
|       0.0|  0.0|[1.0,0.0,0.1,26.6...|
|       1.0|  0.0|[1.0,0.0,0.3,3.2,...|
|       1.0|  0.0|[1.0,0.0,0.4,4.1,...|
|       0.0|  0.0|[1.0,0.0,0.4,22.2...|
|       1.0|  0.0|[1.0,0.0,0.5,3.3,...|
+----------+-----+--------------------+
only showing top 10 rows

+-----------------+
|  avg(prediction)|
+-----------------+
|0.333549340314679|
+-----------------+

None
+-------------------+
|         avg(label)|
+-------------------+
|0.45042639778117627|
+-------------------+

None


In [24]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")

accuracy = evaluator.evaluate(predictions)


print("Accuracy = %g" % (accuracy*100))

Accuracy = 59.8043


In [15]:
dt_model.toDebugString

u'DecisionTreeClassificationModel (uid=DecisionTreeClassifier_45e68365f132a6343543) of depth 4 with 31 nodes\n  If (feature 3 <= 5.8)\n   If (feature 5 <= 2.9)\n    If (feature 2 <= 1.5)\n     If (feature 5 <= 2.1)\n      Predict: 1.0\n     Else (feature 5 > 2.1)\n      Predict: 1.0\n    Else (feature 2 > 1.5)\n     If (feature 5 <= 2.3)\n      Predict: 0.0\n     Else (feature 5 > 2.3)\n      Predict: 1.0\n   Else (feature 5 > 2.9)\n    If (feature 5 <= 4.6)\n     If (feature 33 <= 5.46)\n      Predict: 1.0\n     Else (feature 33 > 5.46)\n      Predict: 1.0\n    Else (feature 5 > 4.6)\n     If (feature 2 <= 4.1)\n      Predict: 1.0\n     Else (feature 2 > 4.1)\n      Predict: 1.0\n  Else (feature 3 > 5.8)\n   If (feature 29 <= 3.1)\n    If (feature 3 <= 25.2)\n     If (feature 3 <= 8.4)\n      Predict: 0.0\n     Else (feature 3 > 8.4)\n      Predict: 0.0\n    Else (feature 3 > 25.2)\n     If (feature 28 <= 0.5)\n      Predict: 0.0\n     Else (feature 28 > 0.5)\n      Predict: 0.0\n   E

In [4]:
print header.split(",")[3]
print header.split(",")[5]
print header.split(",")[2]
print header.split(",")[4]

TOUCH_TIME
PTS_TYPE
DRIBBLES
SHOT_DIST


+ If Touch_time <= 5.1
+ If player shoots for 2, not 3
+ If dribbles < 1.4
+ **predict 1**

In [26]:
dt_model.featureImportances

SparseVector(34, {2: 0.054, 3: 0.6169, 5: 0.255, 28: 0.0051, 29: 0.042, 33: 0.027})

In [25]:
print header.split(",")[2]
print header.split(",")[3]
print header.split(",")[5]
print header.split(",")[28]
print header.split(",")[29]
print header.split(",")[33]

dribbles
touch_time
pts_type
defender_sg_sf
dps
shotdist_defdist


In [29]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(maxDepth = 10,numTrees=10)
rfmodel = rf.fit(trainingData)
rfpredicts = rfmodel.transform(testData)

In [30]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(rfpredicts)
print("Accuracy = %g" % (accuracy*100))

Accuracy = 59.4074


In [31]:
from pyspark.mllib.evaluation import MulticlassMetrics
rfpredicts = rfmodel.transform(testData)
rfresrdd = rfpredicts.select('prediction','label').rdd
rfmm = MulticlassMetrics(rfresrdd)
rfmm.precision()



0.6198709142279478

In [32]:
print rfmm.confusionMatrix()

DenseMatrix([[ 17797.,   3405.],
             [ 11260.,   6117.]])
