# R vs Python: head to head data analysis - Python

## Read in a csv file

In [6]:
import pandas as pd
nba = pd.read_csv("nba_2013.csv")

## Find the number of players

In [2]:
nba.shape

(481, 31)

## Look at the first two row of the data

In [4]:
nba.head(2)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013


## Find the average of each statistic

In [5]:
nba.mean()

age             26.509356
g               53.253638
gs              25.571726
mp            1237.386694
fg             192.881497
fga            424.463617
fg.              0.436436
x3p             39.613306
x3pa           110.130977
x3p.             0.285111
x2p            153.268191
x2pa           314.332640
x2p.             0.466947
efg.             0.480752
ft              91.205821
fta            120.642412
ft.              0.722419
orb             55.810811
drb            162.817048
trb            218.627859
ast            112.536383
stl             39.280665
blk             24.103950
tov             71.862786
pf             105.869023
pts            516.582121
season_end    2013.000000
dtype: float64

## Make pairwise scatterplots

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.pairplot(nba[["ast", "fg", "trb"]])
plt.show()

  if self._edgecolors == str('face'):


## Make clusters of the players
One good way to explore this kind of data is to generate cluster plots. These will show which players are most similar.

In [15]:
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=5, random_state=1)
good_columns = nba._get_numeric_data().dropna(axis=1)
kmeans_model.fit(good_columns)
labels = kmeans_model.labels_

## Plot players by cluster

In [16]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=labels)
plt.show()

## Split into training and testing sets

In [17]:
trainRowCount <- floor(0.8 * nrow(nba))
set.seed(1)
trainIndex <- sample(1:nrow(nba), trainRowCount)
train <- nba[trainIndex,]
test <- nba[-trainIndex,]

SyntaxError: invalid syntax (<ipython-input-17-51795431ff72>, line 3)

## Univariate linear regression

In [17]:
fit <- lm(ast ~ fg, data=train)
predictions <- predict(fit, test)

## Calculate summary statistics for the model

In [18]:
summary(fit)


Call:
lm(formula = ast ~ fg, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-228.26  -35.38  -11.45   11.99  559.61 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   9.0585     7.1026   1.275    0.203    
fg            0.5307     0.0274  19.368   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 92.89 on 382 degrees of freedom
Multiple R-squared:  0.4954,	Adjusted R-squared:  0.4941 
F-statistic: 375.1 on 1 and 382 DF,  p-value: < 2.2e-16


## Fit a random forest model

In [13]:
library(randomForest)
predictorColumns <- c("age", "mp", "fg", "trb", "stl", "blk")
rf <- randomForest(train[predictorColumns], train$ast, ntree=100)
predictions <- predict(rf, test[predictorColumns])

SyntaxError: invalid syntax (<ipython-input-13-395b87b415c8>, line 3)

# Download a webpage

In [1]:
import requests
url = "http://www.basketball-reference.com/boxscores/201506140GSW.html"
data = requests.get(url).content

## Extract player box scores

In [10]:
from bs4 import BeautifulSoup
import re
soup = BeautifulSoup(data, 'html.parser')
box_scores = []
for tag in soup.find_all(id=re.compile("[A-Z]{3,}_basic")):
    rows = []
    for i, row in enumerate(tag.find_all("tr")):
        if i == 0:
            continue
        elif i == 1:
            tag = "th"
        else:
            tag = "td"
        row_data = [item.get_text() for item in row.find_all(tag)]
        rows.append(row_data)
    box_scores.append(rows)

In [11]:
type(box_scores)

list

In [12]:
box_scores

[[[u'Starters',
   u'MP',
   u'FG',
   u'FGA',
   u'FG%',
   u'3P',
   u'3PA',
   u'3P%',
   u'FT',
   u'FTA',
   u'FT%',
   u'ORB',
   u'DRB',
   u'TRB',
   u'AST',
   u'STL',
   u'BLK',
   u'TOV',
   u'PF',
   u'PTS',
   u'+/-'],
  [u'LeBron James',
   u'44:46',
   u'15',
   u'34',
   u'.441',
   u'3',
   u'8',
   u'.375',
   u'7',
   u'9',
   u'.778',
   u'1',
   u'13',
   u'14',
   u'11',
   u'1',
   u'0',
   u'2',
   u'5',
   u'40',
   u'-11'],
  [u'Matthew Dellavedova',
   u'41:41',
   u'2',
   u'9',
   u'.222',
   u'1',
   u'5',
   u'.200',
   u'0',
   u'0',
   u'',
   u'0',
   u'0',
   u'0',
   u'2',
   u'3',
   u'0',
   u'2',
   u'4',
   u'5',
   u'-19'],
  [u'Tristan Thompson',
   u'39:49',
   u'6',
   u'11',
   u'.545',
   u'0',
   u'0',
   u'',
   u'7',
   u'10',
   u'.700',
   u'5',
   u'5',
   u'10',
   u'0',
   u'2',
   u'2',
   u'1',
   u'4',
   u'19',
   u'-22'],
  [u'Iman Shumpert',
   u'37:18',
   u'3',
   u'9',
   u'.333',
   u'3',
   u'6',
   u'.500',
   u'1',
   u