# Fire up GraphLab Create

In [134]:

from __future__ import division
import graphlab
import math
import string
import random
import numpy
from matplotlib import pyplot as plt
%matplotlib inline

# Dataset for practicing classification: use NBA rookie stats to predict whether a player will last 5 years in the league

# ML Classification: Predict 5-Year Career Longevity for NBA Rookies

In [135]:

# Load the NBA rookie statistics CSV into an SFrame; column types are inferred from the file.
nba = graphlab.SFrame('nba_logreg1.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,int,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


### Let's perform a train/test split with 80% of the data in the training set and 20% of the data in the test set. 


In [136]:
# Random 80/20 split; the fixed seed keeps the partition the same on every run.
train_data, test_data = nba.random_split(0.8, seed=1)
print(len(train_data))
print(len(test_data))

1058
282


In [137]:
# Preview the first rows of the full dataset.
nba.head()

X1,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,ftm,fta,ft,oreb,dreb
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5
5,Tony Bennett,75,11.4,3.7,1.5,3.5,42.3,0.3,1.1,32.5,0.4,0.5,73.2,0.2,0.7
6,Don MacLean,62,10.9,6.6,2.5,5.8,43.5,0.0,0.1,50.0,1.5,1.8,81.1,0.5,1.4
7,Tracy Murray,48,10.3,5.7,2.3,5.4,41.5,0.4,1.5,30.0,0.7,0.8,87.5,0.8,0.9
8,Duane Cooper,65,9.9,2.4,1.0,2.4,39.2,0.1,0.5,23.3,0.4,0.5,71.4,0.2,0.6
9,Dave Johnson,42,8.5,3.7,1.4,3.5,38.3,0.1,0.3,21.4,1.0,1.4,67.8,0.4,0.7

reb,ast,stl,blk,tov,target_5yrs
4.1,1.9,0.4,0.4,1.3,0
2.4,3.7,1.1,0.5,1.6,0
2.2,1.0,0.5,0.3,1.0,0
1.9,0.8,0.6,0.1,1.0,1
2.5,0.3,0.3,0.4,0.8,1
0.8,1.8,0.4,0.0,0.7,0
2.0,0.6,0.2,0.1,0.7,1
1.7,0.2,0.2,0.1,0.7,1
0.8,2.3,0.3,0.0,1.1,0
1.1,0.3,0.2,0.0,0.7,0


In [138]:
# Fit a logistic-regression classifier on the 19 per-game stat columns.
# validation_set=None: no rows are held out for validation, so all of
# train_data is used for fitting.
nba_model = graphlab.logistic_classifier.create(
    train_data,
    target='target_5yrs',
    features=[
        'gp', 'min', 'pts', 'fgm', 'fga', 'fg',
        '3p_made', '3pa', '3p', 'ftm', 'fta', 'ft',
        'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov',
    ],
    validation_set=None)

In [139]:
# Display the fitted model's summary (schema, hyperparameters, training stats, extreme coefficients).
nba_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 20
Number of examples             : 1058
Number of classes              : 2
Number of feature columns      : 19
Number of unpacked features    : 19

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 4
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 0.0439

Settings
--------
Log-likelihood                 : 575.9823

Highest Positive Coefficients
-----------------------------
3p_made                        : 2.2927
reb                            : 0.7285
blk                            : 0.6814
ftm                            : 0.6717
ast                            : 0.2551

Lowest Negative Coefficients
----------------------------
(intercept)                    : -3.9697
dreb  

### Extract the weights (coefficients) 


In [140]:
# Pull the fitted coefficients table out of the model and list its columns.
weights = nba_model.coefficients
weights.column_names()

['name', 'index', 'class', 'value', 'stderr']

In [141]:
# Count coefficients by sign. Zero-valued weights are counted as positive,
# and the intercept row is part of the coefficients table, so it is counted too.
num_positive_weights = (weights['value'] >= 0).sum()
num_negative_weights = (weights['value'] < 0).sum()

print("Number of positive weights: %s " % num_positive_weights)
print("Number of negative weights: %s " % num_negative_weights)

Number of positive weights: 11 
Number of negative weights: 9 


### Making predictions with logistic regression based on the number of games played
### Explore this in the context of 3 examples in the test dataset

In [142]:
# Take three consecutive test rows (positions 11-13) as worked examples.
sample_test_data = test_data[11:14]
print(sample_test_data['gp'])
sample_test_data

[55, 70, 64]


X1,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,ftm,fta,ft,oreb,dreb,reb
64,Gary Harris,55,13.1,3.4,1.2,3.9,30.4,0.4,1.9,20.4,0.6,0.9,74.5,0.4,0.8,1.2
82,Gerald Paddio,70,16.9,7.2,3.0,7.2,41.9,0.1,0.3,25.0,1.1,1.3,79.6,0.5,1.1,1.7
102,Tony Smith,64,10.9,3.7,1.5,3.4,44.1,0.0,0.1,0.0,0.6,0.9,70.2,0.4,0.7,1.1

ast,stl,blk,tov,target_5yrs
0.5,0.7,0.1,0.7,0
1.3,0.3,0.1,1.0,0
2.1,0.4,0.2,1.1,1


In [143]:
# Name of the second sample player.
print(sample_test_data[1]['name'])

Gerald Paddio


In [144]:
# Name of the first sample player.
print(sample_test_data[0]['name'])

Gary Harris


#### Make a class prediction for the sample_test_data.
#### The nba_model should predict 1 if
#### the player is predicted to last 5 years in the league
#### and 0 otherwise.

In [146]:
# 'margin' asks the model for the raw score of each row rather than a class label.
scores = nba_model.predict(sample_test_data, output_type='margin')
print(scores)

[-1.1037849868003606, 0.9322432573109118, 0.32872649764073936]


### Predicting 5-Year Career Longevity

In [148]:
def class_predications(scores):
    """Threshold raw scores into class labels.

    Args:
        scores: iterable of numeric scores.

    Returns:
        list with 1 for every strictly positive score and 0 otherwise
        (a score of exactly 0 maps to 0), in input order.
    """
    return [1 if score > 0 else 0 for score in scores]

In [149]:
# Apply the manual thresholding to the sample scores.
class_predications(scores)

[0, 1, 1]

### Verify that the class predictions obtained by your calculations are the same 
### as that obtained from GraphLab Create.

In [150]:
# With no output_type given, predict() returns class labels for each row.
print("Class predictions according to GraphLab Create:")
print(nba_model.predict(sample_test_data))

Class predictions according to GraphLab Create:
[0, 1, 1]


## Probability predictions

In [152]:
def calculate_probability(scores):
    """Convert raw classifier scores into probabilities with the sigmoid.

    Each score s is mapped to 1 / (1 + exp(-s)).

    Args:
        scores: iterable of numeric scores.

    Returns:
        list of floats in [0, 1], one per score, in input order.
    """
    probability_predictions = []
    for score in scores:
        try:
            probability_prediction = 1 / (1 + math.exp(-score))
        except OverflowError:
            # math.exp overflows only for very negative scores, where the
            # sigmoid is indistinguishable from 0 in floating point.
            probability_prediction = 0.0
        probability_predictions.append(probability_prediction)
    return probability_predictions

# Apply the sigmoid to the sample scores.
calculate_probability(scores)

[0.24903137386263335, 0.7175301720681214, 0.581449481875801]

In [153]:
# This cell prints probabilities, not class labels, so the heading says so.
print("Probability predictions according to GraphLab Create:")
print(nba_model.predict(sample_test_data, output_type='probability'))

Class predictions according to GraphLab Create:
[0.24903137386263335, 0.7175301720681214, 0.581449481875801]


## Find the players with the highest (and lowest) predicted probability

In [155]:
# Attach the model's predicted probability to every test row as a new column
# (this modifies test_data in place), then display the frame.
test_data['probability_predictions'] = nba_model.predict(
    test_data, output_type='probability')
test_data

X1,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,ftm,fta,ft,oreb,dreb
8,Duane Cooper,65,9.9,2.4,1.0,2.4,39.2,0.1,0.5,23.3,0.4,0.5,71.4,0.2,0.6
9,Dave Johnson,42,8.5,3.7,1.4,3.5,38.3,0.1,0.3,21.4,1.0,1.4,67.8,0.4,0.7
14,Elmore Spencer,44,6.4,2.4,1.0,1.9,53.7,0.0,0.0,0.0,0.4,0.7,50.0,0.4,1.0
18,Larry Johnson,82,37.2,19.2,7.5,15.3,49.0,0.1,0.3,22.7,4.1,5.0,82.9,3.9,7.0
24,Mitch McGary,32,15.2,6.3,2.8,5.2,53.3,0.0,0.1,0.0,0.8,1.3,62.5,1.7,3.5
31,Stanley Roberts,55,20.3,10.4,4.3,8.1,52.9,0.0,0.0,0.0,1.8,3.6,51.5,2.1,4.1
32,Terrell Brandon,82,19.6,7.4,3.1,7.3,41.9,0.0,0.3,4.3,1.2,1.5,80.6,0.6,1.4
36,Greg Anthony,82,18.4,5.5,2.0,5.3,37.0,0.1,0.7,14.5,1.4,1.9,74.1,0.4,1.3
38,Victor Alexander,80,16.9,7.4,3.0,5.7,52.9,0.0,0.0,0.0,1.3,1.9,69.1,1.3,2.9
50,Greg Sutton,67,9.0,3.7,1.4,3.6,38.8,0.4,1.3,29.2,0.5,0.7,75.6,0.1,0.6

reb,ast,stl,blk,tov,target_5yrs,probability_predictions
0.8,2.3,0.3,0.0,1.1,0,0.525373288177
1.1,0.3,0.2,0.0,0.7,0,0.343556078329
1.4,0.2,0.2,0.4,0.6,1,0.385415889254
11.0,3.6,1.0,0.6,1.9,0,0.990362036179
5.2,0.4,0.5,0.5,1.0,0,0.439311216971
6.1,0.7,0.4,1.5,1.4,1,0.782622660642
2.0,3.9,1.0,0.3,1.7,1,0.809643159063
1.7,3.8,0.7,0.1,1.2,1,0.711180972401
4.2,0.4,0.6,0.8,1.1,1,0.828907016006
0.7,1.4,0.4,0.1,1.0,0,0.560987527026


In [156]:
# The 20 test players with the highest predicted probability.
(test_data['name', 'probability_predictions']
     .topk('probability_predictions', k=20)
     .print_rows(20))


+--------------------+-------------------------+
|        name        | probability_predictions |
+--------------------+-------------------------+
|    Elton Brand     |      0.990901541665     |
|   Larry Johnson    |      0.990362036179     |
|  Michael Jordan*   |      0.989205041864     |
|     Joe Smith      |      0.987778970778     |
|   Steve Francis    |      0.967252961868     |
| Russell Westbrook  |      0.96628789472      |
|   Bill Laimbeer    |      0.961548033104     |
|   Dwight Howard    |      0.960448870082     |
|   Kevin McHale*    |      0.953948175226     |
|     Lamar Odom     |      0.952591217854     |
|   Charles Smith    |      0.939466569628     |
|   Charles Smith    |      0.939466569628     |
|   Damian Lillard   |      0.938647086462     |
|    LeBron James    |      0.937892961976     |
|  Alvin Robertson   |      0.932638575871     |
|   Rodney McCray    |      0.92944136996      |
|    Sam Perkins     |      0.928951755094     |
|    Roy Tarpley    

In [115]:
# The 20 test players with the lowest predicted probability (reverse=True flips the ordering).
(test_data['name', 'probability_predictions']
     .topk('probability_predictions', k=20, reverse=True)
     .print_rows(20))


+------------------+-------------------------+
|       name       | probability_predictions |
+------------------+-------------------------+
|   Caris LeVert   |     0.0937293701132     |
| Bruce Kuczenski  |     0.0979092756404     |
|  Anthony Brown   |      0.106318138738     |
|   Billy Thomas   |      0.137736096844     |
|  Nazr Mohammed   |      0.21451587321      |
|   Otto Porter    |      0.221350145852     |
|  Patrick McCaw   |      0.224477500353     |
|   Chris Garner   |      0.229347994333     |
| Johnny O'Bryant  |      0.23872620103      |
|   Solomon Hill   |      0.246305319319     |
|   Gary Harris    |      0.249031373863     |
|   Cory Higgins   |      0.250937147231     |
|   Jeremy Pargo   |      0.258814022021     |
|  Adreian Payne   |      0.259099672966     |
| Terrence Rencher |      0.261528378048     |
|  Andre Barrett   |      0.265464775851     |
|   Damon Jones    |      0.270164195667     |
|   Steve Novak    |      0.27267373029      |
|   Toby Bail

## Accuracy of the classifier

In [117]:
# Element-wise comparison against the true labels: first for an all-ones guess,
# then for the model's own class predictions (1 marks a match).
print(graphlab.SArray([1, 1, 1]) == sample_test_data['target_5yrs'])
print(nba_model.predict(sample_test_data) == sample_test_data['target_5yrs'])

[0, 0, 1]
[1, 0, 1]


In [118]:
def get_classification_accuracy(model, data, true_labels):
    """Return the fraction of rows in `data` whose predicted class equals the true label.

    Args:
        model: object whose ``predict(data)`` returns one class label per row.
        data: the rows to classify; must support ``len()``.
        true_labels: labels aligned with ``data``; comparing them with the
            predictions via ``==`` must yield an iterable of 0/1 (or bool) flags.

    Returns:
        float accuracy, i.e. number of correct predictions / len(data).

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    predictions = model.predict(data)

    # Element-wise comparison gives one truthy flag per correct prediction.
    num_correct = sum(predictions == true_labels)

    # Explicit float division so the result does not depend on
    # `from __future__ import division` being active under Python 2.
    accuracy = float(num_correct) / len(data)

    return accuracy

In [119]:
# Accuracy of the fitted model on the test split.
get_classification_accuracy(nba_model, test_data, test_data['target_5yrs'])

0.6950354609929078

## Baseline: Majority class prediction


In [122]:
# Class balance of the training set: rows labelled 1 versus rows labelled 0.
num_positive = (train_data['target_5yrs'] == 1).sum()
num_negative = (train_data['target_5yrs'] == 0).sum()
print(num_positive)
print(num_negative)

655
403


In [123]:
# Class balance of the test set: rows labelled 1, then rows labelled 0.
print((test_data['target_5yrs'] == 1).sum())
print((test_data['target_5yrs'] == 0).sum())

176
106


In [124]:
# Baseline: accuracy of always predicting class 1 on the test set
# (true division comes from the `from __future__ import division` import).
print((test_data['target_5yrs'] == 1).sum() / len(test_data['target_5yrs']))


0.624113475177
