In [1]:
import math
from math import sqrt

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *
import pyspark.sql.functions as func

In [2]:
# Reuse the already-running SparkContext when there is one. A bare
# SparkContext() raises "ValueError: Cannot run multiple SparkContexts at once"
# if this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()

In [3]:
# Location of the input CSV; it is read as an RDD with one element per text line.
NBA_CSV_PATH = "file:/home/jupyter/notebooks/nba_all.csv"
nba_data = sc.textFile(NBA_CSV_PATH)

# Remember the first line of the file so it can be filtered out as the header.
first_lines = nba_data.take(1)
nba_data_header = first_lines[0]

In [4]:
def _is_not_header(line):
    """Return True for every line that differs from the saved header line."""
    return line != nba_data_header


def _split_fields(line):
    """Split one CSV line into its comma-separated fields (no quote handling)."""
    return line.split(",")


# Drop the header, parse each remaining line and pull the rows to the driver.
# After this statement nba_data is a plain Python list of field lists (the
# result of collect()), no longer an RDD.
nba_data = nba_data.filter(_is_not_header).map(_split_fields).collect()

In [5]:
def nba_mean(dim):
    """Return the mean of one statistic column over all rows of ``nba_data``.

    Args:
        dim: Name of the statistic; one of 'MIN', 'PTS', 'AST', 'TOV',
            'eFG' or 'TS'.

    Returns:
        The mean of the selected column, computed by Spark's ``RDD.mean()``
        after each cell has been converted with ``float``.

    Reads the module-level ``nba_data`` (rows indexable by column position)
    and the module-level SparkContext ``sc``.
    """
    # Position of each statistic inside a parsed row.
    column_of = {'MIN':1,'PTS':2,'AST':3,'TOV':4,'eFG':5,'TS':6}
    values = [float(row[column_of[dim]]) for row in nba_data]
    return sc.parallelize(values).mean()

def nba_std(dim):
    """Return the standard deviation of one statistic column of ``nba_data``.

    Args:
        dim: Name of the statistic; one of 'MIN', 'PTS', 'AST', 'TOV',
            'eFG' or 'TS'.

    Returns:
        The result of Spark's ``RDD.stdev()`` over the selected column, after
        each cell has been converted with ``float``. Note that ``stdev()`` is
        the population form (divides by N); ``sampleStdev()`` is the N-1 form.

    Reads the module-level ``nba_data`` (rows indexable by column position)
    and the module-level SparkContext ``sc``.
    """
    # Position of each statistic inside a parsed row.
    column_of = {'MIN':1,'PTS':2,'AST':3,'TOV':4,'eFG':5,'TS':6}
    values = [float(row[column_of[dim]]) for row in nba_data]
    return sc.parallelize(values).stdev()
  
# Mean and standard deviation of every statistic column, used to standardize
# (z-score) the raw values before measuring distances.
MIN_data_mean = nba_mean('MIN')
MIN_data_std = nba_std('MIN')
PTS_data_mean = nba_mean('PTS')
PTS_data_std = nba_std('PTS')
AST_data_mean = nba_mean('AST')
AST_data_std = nba_std('AST')
TOV_data_mean = nba_mean('TOV')
TOV_data_std = nba_std('TOV')
eFG_data_mean = nba_mean('eFG')
eFG_data_std = nba_std('eFG')
TS_data_mean = nba_mean('TS')
TS_data_std = nba_std('TS')

# (column index in a parsed row, column mean, column standard deviation),
# listed in the order in which the squared differences are accumulated.
_stat_params = [
    (1, MIN_data_mean, MIN_data_std),
    (2, PTS_data_mean, PTS_data_std),
    (3, AST_data_mean, AST_data_std),
    (4, TOV_data_mean, TOV_data_std),
    (5, eFG_data_mean, eFG_data_std),
    (6, TS_data_mean, TS_data_std),
]


def _standardize(row):
    """Return the z-score of every statistic in ``row``, in ``_stat_params`` order."""
    return [(float(row[col]) - mean) / std for col, mean, std in _stat_params]


# One "name,distance" string per row: the Euclidean distance, in standardized
# statistic space, between that row and the first row of nba_data.
D1 = []

if nba_data:
    # The reference row's z-scores are identical for every comparison, so they
    # are computed once instead of once per row and per statistic.
    reference_z = _standardize(nba_data[0])

    for x in nba_data:
        # Accumulate left to right (rather than with sum()) so the floating
        # point result matches a plain MIN + PTS + AST + TOV + eFG + TS chain.
        squared_sum = 0.0
        for z, ref_z in zip(_standardize(x), reference_z):
            squared_sum += (z - ref_z) ** 2
        distance = sqrt(squared_sum)
        # Column 0 of a row holds the name that labels the distance.
        D1.append(str(x[0]) + ',' + str(distance))


In [6]:
sqlContext = SQLContext(sc)


def _split_entry(entry):
    """Break one "name,distance" string from D1 into its comma-separated fields."""
    return entry.split(',')


def _as_row(fields):
    """Build a Row with the name kept as text and the distance parsed as float."""
    return Row(name=fields[0], distance=float(fields[1]))


# Distribute the distance strings, parse them, and expose them as a DataFrame
# with the columns `name` and `distance`.
Rank = sc.parallelize(D1)
splits = Rank.map(_split_entry)
Ranks = splits.map(_as_row)
schemaPeople = sqlContext.createDataFrame(Ranks)

In [7]:
# Print the DataFrame as a text table; show() prints the first 20 rows by default.
schemaPeople.show()

+------------------+-------------------+
|          distance|               name|
+------------------+-------------------+
|               0.0|     Michael Jordan|
| 2.930094812635345|   Wilt Chamberlain|
|3.2527939293064425|       Kevin Durant|
| 2.452747469988465|       Elgin Baylor|
| 2.393377221590684|       LeBron James|
|  2.45087960369704|         Jerry West|
|3.2314020864897763|      Allen Iverson|
|3.1476311918721507|         Bob Pettit|
|3.0837258294085528|      George Gervin|
|  3.59662892670863|    Oscar Robertson|
| 3.983627421640038|        Karl Malone|
| 4.005454854441199|        Kobe Bryant|
|3.7593964397052786|    Carmelo Anthony|
|3.4551935311516107|  Dominique Wilkins|
| 4.018363200323627|Kareem Abdul-Jabbar|
|3.3815943577991505|         Larry Bird|
| 4.891784788570858|     Adrian Dantley|
|  3.76769811668317|      Pete Maravich|
| 5.008927755946713|   Shaquille O'Neal|
| 4.539163991028245|        Dwyane Wade|
+------------------+-------------------+
only showing top 20 rows

In [8]:
# Order the rows by ascending distance, so the rows closest to the reference
# (distance 0.0) come first, and print the first 20 of them.
schemaPeople.sort("distance",ascending=True).show()

+------------------+-------------------+
|          distance|               name|
+------------------+-------------------+
|               0.0|     Michael Jordan|
| 2.393377221590684|       LeBron James|
|  2.45087960369704|         Jerry West|
| 2.452747469988465|       Elgin Baylor|
| 2.930094812635345|   Wilt Chamberlain|
|3.0837258294085528|      George Gervin|
|3.1476311918721507|         Bob Pettit|
|3.2314020864897763|      Allen Iverson|
|3.2527939293064425|       Kevin Durant|
|3.3815943577991505|         Larry Bird|
|3.4551935311516107|  Dominique Wilkins|
|  3.59662892670863|    Oscar Robertson|
|3.7593964397052786|    Carmelo Anthony|
|  3.76769811668317|      Pete Maravich|
| 3.983627421640038|        Karl Malone|
| 4.005454854441199|        Kobe Bryant|
| 4.018363200323627|Kareem Abdul-Jabbar|
| 4.108338577435676|         Rick Barry|
| 4.539163991028245|        Dwyane Wade|
| 4.891784788570858|     Adrian Dantley|
+------------------+-------------------+
only showing top 20 rows