<a href="https://colab.research.google.com/github/vothane/sabermatrix/blob/main/sabermatrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [7]:
from bs4 import BeautifulSoup
import requests

In [8]:
class Table:
    """Minimal in-memory table: an ordered collection of rows."""

    def __init__(self):
        # Rows are stored in insertion order; each entry is whatever object
        # was handed to insert().
        self.rows = list()

    @property
    def get_rows(self):
        """The stored rows.

        This is a property, so it is read as ``table.get_rows`` (no call).
        The live internal list is returned, not a copy.
        """
        return self.rows

    def insert(self, row_values):
        """Append ``row_values`` as the new last row of the table."""
        self.rows.append(row_values)

class TableBuilder:
    """Builds a ``Table`` from the ``<td>`` cells of an HTML page."""

    @staticmethod
    def _convert_cell(txt):
        """Convert one cell's text to ``float`` when it is a plain decimal number.

        A value counts as numeric when, ignoring surrounding whitespace and an
        optional leading sign, it consists of digits with at most one decimal
        point. Any other text is returned unchanged.
        """
        stripped = txt.strip()
        unsigned = stripped[1:] if stripped.startswith(('+', '-')) else stripped
        # isdecimal() (unlike isdigit()) rejects characters such as a
        # superscript two that float() cannot parse, so float() below is safe.
        if unsigned.replace('.', '', 1).isdecimal():
            return float(stripped)
        return txt

    @staticmethod
    def build_table(url, timeout=10):
        """Download ``url`` and collect its table rows into a ``Table``.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Returns:
            A ``Table`` with one list per ``<tr>`` found inside the page's
            first ``<thead>`` element, excluding the first ``<tr>``. Each list
            holds the ``<td>`` texts of that row, with numeric-looking texts
            converted to ``float``.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an error status.
            ValueError: If the page contains no ``<thead>`` element.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        html_content = BeautifulSoup(response.content, 'html.parser')

        raw = html_content.find('thead')
        if raw is None:
            raise ValueError("no <thead> element found at {}".format(url))

        # The first <tr> is skipped -- presumably the header row; the data
        # rows are looked up inside <thead> exactly as the page is parsed by
        # 'html.parser'. NOTE(review): confirm against the page markup.
        rows = raw.find_all('tr')[1:]

        table = Table()
        for row in rows:
            cells = row.find_all("td")
            table.insert([TableBuilder._convert_cell(cell.text) for cell in cells])

        return table

In [9]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Schema of the scraped rows: the pitch name as a string followed by nine
# float metrics; every field is declared non-nullable.
schema = StructType(
    [StructField("Pitch", StringType(), False)]
    + [
        StructField(metric, FloatType(), False)
        for metric in (
            "Count",
            "Foul/Swing",
            "Whiff/Swing",
            "GB/BIP",
            "LD/BIP",
            "FB/BIP",
            "PU/BIP",
            "GB/FB",
            "HR/(FB+LD)",
        )
    ]
)

# Scrape the page and keep the parsed rows (a list of lists) for Spark.
url = "http://www.brooksbaseball.net/tabs.php?player=456034&var=so"
table = TableBuilder.build_table(url)

data = table.get_rows
print(data)

[['Fourseam', 8724.0, 44.45, 24.19, 37.76, 21.86, 31.0, 9.38, 121.79, 7.07], ['Sinker', 11843.0, 43.11, 16.27, 45.82, 21.56, 25.91, 6.71, 176.84, 7.4], ['Change', 5694.0, 30.21, 31.14, 45.1, 24.92, 22.99, 6.99, 196.15, 6.71], ['Slider', 648.0, 41.28, 21.14, 58.04, 16.07, 16.96, 8.93, 342.11, 21.62], ['Curve', 2860.0, 35.36, 26.41, 48.33, 20.71, 24.72, 6.24, 195.5, 10.29], ['Cutter', 5055.0, 41.55, 21.24, 44.5, 24.31, 24.77, 6.42, 179.63, 8.64]]


In [10]:
from pyspark.sql.functions import col

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second Spark application.
spark = SparkSession.builder.appName("Big_Leagues").getOrCreate()

sabermetrics = spark.createDataFrame(data, schema)

# Show David Price's most-used pitches (thrown more than 5000 times), most
# frequent first. Filter and sort while "Count" is still a column, then
# project down to "Pitch", rather than referring to a column that the
# projection has already dropped.
(sabermetrics.where(col("Count") > 5000)
             .orderBy(desc("Count"))
             .select("Pitch")).show()

+--------+
|   Pitch|
+--------+
|  Sinker|
|Fourseam|
|  Change|
|  Cutter|
+--------+

