In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from pathlib import Path

In [2]:
# Data locations, resolved relative to the directory the notebook is run from.
MAIN_DIR = Path(".").absolute()
BASEBALL_DIR = MAIN_DIR.joinpath("baseball_data")
POLIT_DIR = MAIN_DIR.joinpath("political_data")

In [3]:
# Load the raw CSV tables: five from the core/ folder and two from advanced/.
core_dir = BASEBALL_DIR / "core"
advanced_dir = BASEBALL_DIR / 'advanced'

teams_df, players_df, batting_df, pitching_df, appearances_df = (
    pd.read_csv(core_dir / f"{table}.csv")
    for table in ("Teams", "People", "Batting", "Pitching", "Appearances")
)
batting_advanced_df, pitching_advanced_df = (
    pd.read_csv(advanced_dir / filename)
    for filename in ('batting_advanced.csv', 'pitching_advanced.csv')
)

In [4]:
# Restrict the batting table to seasons from 1899 onward.
# The name is rebound, so the unfiltered table is only recoverable by re-reading the CSV.
is_kept_season = batting_df["yearID"].ge(1899)
batting_df = batting_df.loc[is_kept_season]

In [5]:
# Display the batting table after the yearID >= 1899 filter.
batting_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
7566,anderjo01,1899,1,BRO,NL,117,439,65,118,18,...,92.0,25.0,,27,24.0,,4.0,2.0,,
7567,atherch01,1899,1,WAS,NL,65,242,28,60,5,...,23.0,2.0,,21,26.0,,2.0,4.0,,
7568,baileha01,1899,1,BSN,NL,12,34,3,8,2,...,2.0,0.0,,2,3.0,,1.0,0.0,,
7569,bakerki01,1899,1,WAS,NL,12,19,1,3,0,...,1.0,0.0,,1,6.0,,0.0,0.0,,
7570,barreji01,1899,1,CIN,NL,26,92,30,34,2,...,10.0,4.0,,18,7.0,,1.0,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110490,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
110491,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
110492,zimmery01,2021,1,WAS,NL,110,255,27,62,16,...,46.0,0.0,0.0,16,77.0,0.0,0.0,0.0,2.0,9.0
110493,zuberty01,2021,1,KCA,AL,31,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Per-team-season threshold: 3.1 times the team's G column.
# Despite the "AB" in the name, the later filter compares it against PA.
teams_df["minAB"] = teams_df["G"].mul(3.1)

In [7]:
# Attach each team-season's threshold to the batting rows (inner join on year and team).
team_thresholds = teams_df[['minAB', 'yearID', 'teamID']]
batting_heavy_min = batting_df.merge(team_thresholds, on=['yearID', 'teamID'])

In [8]:
# PA is the row-wise sum AB + HBP + BB + SF + SH; missing components are skipped.
pa_components = ["AB", "HBP", "BB", "SF", "SH"]
batting_heavy_min["PA"] = batting_heavy_min[pa_components].sum(axis="columns", skipna=True)

In [9]:
# Keep only rows whose PA strictly exceeds the team threshold.
# NOTE(review): rows exactly at the threshold are dropped -- confirm that is intended.
exceeds_threshold = batting_heavy_min["PA"].gt(batting_heavy_min["minAB"])
batting_heavy_min = batting_heavy_min[exceeds_threshold]

In [10]:
# Join the players table onto the filtered batting rows, then build a display name.
batting_heavy_data = batting_heavy_min.merge(players_df, on="playerID")
first_name = batting_heavy_data["nameFirst"]
last_name = batting_heavy_data["nameLast"]
batting_heavy_data["fullName"] = first_name + " " + last_name

In [11]:
# Join the advanced batting table on (yearID, bbrefID).
# Its "G" column is dropped first, presumably so it does not clash with the
# "G" already present on the batting rows.
advanced_batting_cols = batting_advanced_df.drop(columns="G")
batting_heavy_advanced = batting_heavy_data.merge(advanced_batting_cols, on=['yearID', 'bbrefID'])

In [12]:
# Project down to identifiers, birthplace, counting stats and the three WAR columns.
batting_final_columns = [
    'playerID', 'bbrefID', 'yearID', 'fullName', 'teamID', 'birthState', 'birthCountry',
    'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'HBP', 'SH', 'SF',
    'WAR', 'oWAR', 'dWAR',
]
batting_heavy_advanced_final = batting_heavy_advanced[batting_final_columns]

In [13]:
# Display the projected batting table (identifiers, counting stats, WAR columns).
batting_heavy_advanced_final

Unnamed: 0,playerID,bbrefID,yearID,fullName,teamID,birthState,birthCountry,G,AB,R,...,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR
0,anderjo01,anderjo01,1899,John Anderson,BRO,,Norway,117,439,65,...,25.0,,27,24.0,4.0,2.0,,0.6,0.4,-0.2
1,anderjo01,anderjo01,1901,John Anderson,MLA,,Norway,138,576,90,...,35.0,,24,21.0,3.0,4.0,,4.4,4.3,-0.2
2,anderjo01,anderjo01,1902,John Anderson,SLA,,Norway,126,524,60,...,15.0,,21,9.0,3.0,3.0,,0.9,1.3,-0.6
3,anderjo01,anderjo01,1903,John Anderson,SLA,,Norway,138,550,65,...,16.0,,23,39.0,0.0,4.0,,3.3,2.5,0.4
4,anderjo01,anderjo01,1904,John Anderson,NYA,,Norway,143,558,62,...,20.0,,23,37.0,6.0,11.0,,3.2,2.8,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12251,lowena01,lowena01,2021,Nathaniel Lowe,TEX,VA,USA,157,557,75,...,8.0,0.0,80,162.0,2.0,0.0,3.0,2.4,2.6,-1.1
12252,francty01,francty01,2021,Ty France,SEA,CA,USA,152,571,85,...,0.0,0.0,46,106.0,27.0,0.0,6.0,4.3,4.1,-0.5
12253,hernaen02,hernaen02,2021,Enrique Hernandez,BOS,,P.R.,134,508,84,...,1.0,0.0,61,110.0,9.0,0.0,7.0,4.9,3.2,2.1
12254,renfrhu01,renfrhu01,2021,Hunter Renfroe,BOS,MS,USA,144,521,89,...,1.0,2.0,44,130.0,1.0,0.0,6.0,2.3,2.3,-0.5


In [14]:
# Pairwise correlations between the numeric batting columns, shown as a heatmap.
# The frame also holds string columns (playerID, fullName, teamID, ...). Calling
# .corr() on it directly raises in pandas >= 2.0 and only worked before through
# silent column dropping, so the numeric subset is selected explicitly.
# Note that yearID is numeric and therefore stays in the matrix.
corrMatrix = batting_heavy_advanced_final.select_dtypes(include="number").corr()
px.imshow(corrMatrix, text_auto=True, aspect='auto', zmax=1, zmin=-1, color_continuous_scale=px.colors.diverging.Fall)

Pitching

In [15]:
# Same shape as the batting pipeline: keep seasons from 1899 on, attach a
# per-team threshold of three outs per team game, and keep rows whose IPouts
# strictly exceed it.
# NOTE(review): rows exactly at the threshold are dropped -- confirm that is intended.
pitching_df = pitching_df.loc[pitching_df["yearID"].ge(1899)]
teams_df["minOUT"] = teams_df["G"].mul(3)
out_thresholds = teams_df[['minOUT', 'yearID', 'teamID']]
pitching_min = pitching_df.merge(out_thresholds, on=['yearID', 'teamID'])
pitching_min = pitching_min[pitching_min["IPouts"].gt(pitching_min["minOUT"])]

In [16]:
# Join the players table onto the filtered pitching rows, then build a display name.
pitching_full = pitching_min.merge(players_df, on="playerID")
first_name = pitching_full["nameFirst"]
last_name = pitching_full["nameLast"]
pitching_full["fullName"] = first_name + " " + last_name

In [17]:
# Display the filtered pitching rows joined with the players table.
pitching_full

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID,fullName
0,lewiste01,1899,1,BSN,NL,17,11,29,25,23,...,Edward Morgan,158.0,70.0,R,R,1896-07-06,1901-09-26,lewit101,lewiste01,Ted Lewis
1,lewiste01,1900,1,BSN,NL,13,12,30,22,19,...,Edward Morgan,158.0,70.0,R,R,1896-07-06,1901-09-26,lewit101,lewiste01,Ted Lewis
2,lewiste01,1901,1,BOS,AL,16,17,39,34,31,...,Edward Morgan,158.0,70.0,R,R,1896-07-06,1901-09-26,lewit101,lewiste01,Ted Lewis
3,nichoki01,1899,1,BSN,NL,21,19,42,37,37,...,Charles Augustus,175.0,70.0,B,R,1890-04-23,1906-05-18,nichk101,nichoki01,Kid Nichols
4,nichoki01,1900,1,BSN,NL,13,16,29,27,25,...,Charles Augustus,175.0,70.0,B,R,1890-04-23,1906-05-18,nichk101,nichoki01,Kid Nichols
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8547,uriasju01,2021,1,LAN,NL,20,3,32,32,0,...,Julio Cesar,225.0,72.0,L,L,2016-05-27,2021-10-02,uriaj001,uriasju01,Julio Urias
8548,lylesjo01,2021,1,TEX,AL,10,13,32,30,0,...,Jordan Horton,230.0,77.0,R,R,2011-05-31,2021-10-02,lylej001,lylesjo01,Jordan Lyles
8549,burneco01,2021,1,MIL,NL,11,5,28,28,0,...,Corbin Brian,225.0,75.0,R,R,2018-07-10,2021-10-02,burnc002,burneco01,Corbin Burnes
8550,flexech01,2021,1,SEA,AL,14,6,31,31,0,...,Christopher John,250.0,75.0,R,R,2017-07-27,2021-10-02,flexc001,flexech01,Chris Flexen


In [18]:
# Join the advanced pitching table on (yearID, bbrefID); its G and GS columns
# are dropped first, presumably to avoid clashing with the core columns.
# NOTE(review): in the rendered outputs this inner join grows the frame (last
# index 8550 before, 8736 after), so (yearID, bbrefID) is not unique on at
# least one side and some player-seasons are duplicated -- verify the key.
advanced_pitching_cols = pitching_advanced_df.drop(columns=['G', 'GS'])
pitching_full_advanced = pitching_full.merge(advanced_pitching_cols, on=['yearID', 'bbrefID'])
pitching_full_advanced

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WAA,gmLI,WAAadj,WAR,RAR,waaWL%,162WL%,Salary,Awards,RA9extras
0,lewiste01,1899,1,BSN,NL,17,11,29,25,23,...,1.6,1.00,-0.2,3.6,37.0,0.555,0.510,,,
1,lewiste01,1900,1,BSN,NL,13,12,30,22,19,...,0.8,1.00,-0.1,2.7,28.0,0.527,0.505,,,
2,lewiste01,1901,1,BOS,AL,16,17,39,34,31,...,0.1,1.00,-0.2,3.2,40.0,0.504,0.501,,,
3,nichoki01,1899,1,BSN,NL,21,19,42,37,37,...,4.3,1.00,-0.4,7.3,74.0,0.603,0.527,"$3,000",,
4,nichoki01,1900,1,BSN,NL,13,16,29,27,25,...,2.8,1.00,-0.2,4.8,50.0,0.596,0.517,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8733,uriasju01,2021,1,LAN,NL,20,3,32,32,0,...,2.8,,-0.2,4.4,43.0,0.588,0.517,"$3,600,000",CYA-7,0.0
8734,lylesjo01,2021,1,TEX,AL,10,13,32,30,0,...,-1.6,0.55,-0.1,-0.1,2.0,0.449,0.490,"$8,000,000",,0.0
8735,burneco01,2021,1,MIL,NL,11,5,28,28,0,...,4.1,,-0.1,5.6,52.0,0.648,0.526,,"AS,CYA-1,MVP-15",0.0
8736,flexech01,2021,1,SEA,AL,14,6,31,31,0,...,1.9,,-0.1,3.5,37.0,0.562,0.512,"$1,700,000",,0.0


In [19]:
# Project down to identifiers, birthplace, the core pitching stats, gmLI and WAR.
pitching_final_columns = [
    'playerID', 'bbrefID', 'yearID', 'fullName', 'teamID', 'birthState', 'birthCountry',
    'W', 'GS', 'G', 'SV', 'ER', 'SO', 'BB', 'ERA', 'BAOpp', 'WP', 'HR',
    'gmLI', 'WAR',
]
pitching_full_advanced_final = pitching_full_advanced[pitching_final_columns]

In [20]:
# Display the projected pitching table (identifiers, core stats, gmLI, WAR).
pitching_full_advanced_final

Unnamed: 0,playerID,bbrefID,yearID,fullName,teamID,birthState,birthCountry,W,GS,G,SV,ER,SO,BB,ERA,BAOpp,WP,HR,gmLI,WAR
0,lewiste01,lewiste01,1899,Ted Lewis,BSN,Powys,United Kingdom,17,25,29,0,91,60,73,3.49,,3,10,1.00,3.6
1,lewiste01,lewiste01,1900,Ted Lewis,BSN,Powys,United Kingdom,13,22,30,0,96,66,86,4.13,,8,11,1.00,2.7
2,lewiste01,lewiste01,1901,Ted Lewis,BOS,Powys,United Kingdom,16,34,39,1,124,103,91,3.53,,2,14,1.00,3.2
3,nichoki01,nichoki01,1899,Kid Nichols,BSN,WI,USA,21,37,42,1,114,108,82,2.99,,8,11,1.00,7.3
4,nichoki01,nichoki01,1900,Kid Nichols,BSN,WI,USA,13,27,29,0,79,53,72,3.07,,6,11,1.00,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8733,uriasju01,uriasju01,2021,Julio Urias,LAN,Sinaloa,Mexico,20,32,32,0,61,195,38,2.96,0.219,2,19,,4.4
8734,lylesjo01,lylesjo01,2021,Jordan Lyles,TEX,SC,USA,10,30,32,0,103,146,56,5.15,0.278,9,38,0.55,-0.1
8735,burneco01,burneco01,2021,Corbin Burnes,MIL,CA,USA,11,28,28,0,45,234,34,2.43,0.201,5,7,,5.6
8736,flexech01,flexech01,2021,Chris Flexen,SEA,CA,USA,14,31,31,0,72,125,40,3.61,0.268,2,19,,3.5


In [21]:
# Pairwise correlations between the numeric pitching columns, shown as a heatmap.
# The frame also holds string columns (playerID, fullName, teamID, ...). Calling
# .corr() on it directly raises in pandas >= 2.0 and only worked before through
# silent column dropping, so the numeric subset is selected explicitly.
# Note that yearID is numeric and therefore stays in the matrix.
corrMatrix = pitching_full_advanced_final.select_dtypes(include="number").corr()
px.imshow(corrMatrix, text_auto=True, aspect='auto', zmax=1, zmin=-1, color_continuous_scale=px.colors.diverging.Fall)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Feature matrix for clustering: the counting stats plus the three WAR columns.
# Selected by name rather than by position (previously iloc[:, 7:25]) so that
# reordering or extending the projection above cannot silently shift which
# columns end up in the model.
batting_feature_columns = [
    'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
    'BB', 'SO', 'HBP', 'SH', 'SF', 'WAR', 'oWAR', 'dWAR',
]
batting_heavy_advanced_main = batting_heavy_advanced_final[batting_feature_columns]
batting_heavy_advanced_main

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR
0,117,439,65,118,18,7,4,92.0,25.0,,27,24.0,4.0,2.0,,0.6,0.4,-0.2
1,138,576,90,190,46,7,8,99.0,35.0,,24,21.0,3.0,4.0,,4.4,4.3,-0.2
2,126,524,60,149,29,6,4,85.0,15.0,,21,9.0,3.0,3.0,,0.9,1.3,-0.6
3,138,550,65,156,34,8,2,78.0,16.0,,23,39.0,0.0,4.0,,3.3,2.5,0.4
4,143,558,62,155,27,12,3,82.0,20.0,,23,37.0,6.0,11.0,,3.2,2.8,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12251,157,557,75,147,24,3,18,72.0,8.0,0.0,80,162.0,2.0,0.0,3.0,2.4,2.6,-1.1
12252,152,571,85,166,32,1,18,73.0,0.0,0.0,46,106.0,27.0,0.0,6.0,4.3,4.1,-0.5
12253,134,508,84,127,35,3,20,60.0,1.0,0.0,61,110.0,9.0,0.0,7.0,4.9,3.2,2.1
12254,144,521,89,135,33,0,31,96.0,1.0,2.0,44,130.0,1.0,0.0,6.0,2.3,2.3,-0.5


In [24]:
# Replace every missing value with 0 so the scaler and clusterers accept the frame.
# NOTE(review): in the rows displayed above, CS and SF are missing for the
# early seasons, so a 0 there looks like "not recorded" rather than a true
# zero -- this may pull those seasons together in the clustering.
batting_heavy_advanced_main = batting_heavy_advanced_main.fillna(0)
batting_heavy_advanced_main

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR
0,117,439,65,118,18,7,4,92.0,25.0,0.0,27,24.0,4.0,2.0,0.0,0.6,0.4,-0.2
1,138,576,90,190,46,7,8,99.0,35.0,0.0,24,21.0,3.0,4.0,0.0,4.4,4.3,-0.2
2,126,524,60,149,29,6,4,85.0,15.0,0.0,21,9.0,3.0,3.0,0.0,0.9,1.3,-0.6
3,138,550,65,156,34,8,2,78.0,16.0,0.0,23,39.0,0.0,4.0,0.0,3.3,2.5,0.4
4,143,558,62,155,27,12,3,82.0,20.0,0.0,23,37.0,6.0,11.0,0.0,3.2,2.8,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12251,157,557,75,147,24,3,18,72.0,8.0,0.0,80,162.0,2.0,0.0,3.0,2.4,2.6,-1.1
12252,152,571,85,166,32,1,18,73.0,0.0,0.0,46,106.0,27.0,0.0,6.0,4.3,4.1,-0.5
12253,134,508,84,127,35,3,20,60.0,1.0,0.0,61,110.0,9.0,0.0,7.0,4.9,3.2,2.1
12254,144,521,89,135,33,0,31,96.0,1.0,2.0,44,130.0,1.0,0.0,6.0,2.3,2.3,-0.5


In [25]:
# Standardise every feature to zero mean / unit variance before clustering.
# Column labels are taken from the input frame instead of a second hard-coded
# list, so the labels cannot drift out of sync with the selected features.
scaler = StandardScaler()
batting_heavy_advanced_main_scale = pd.DataFrame(
    scaler.fit_transform(batting_heavy_advanced_main),
    columns=batting_heavy_advanced_main.columns,
)
batting_heavy_advanced_main_scale.describe()

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR
count,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0,12256.0
mean,-6.09898e-16,5.600394e-16,-3.061085e-16,-6.957011e-18,5.565609e-17,3.7104060000000004e-17,5.565609e-17,1.391402e-16,-6.49321e-17,6.02941e-17,-1.669683e-16,-2.226244e-16,1.8552030000000002e-17,3.7104060000000004e-17,1.113122e-16,-1.321832e-16,-2.052318e-16,0.0
std,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041,1.000041
min,-6.195505,-5.361025,-2.815868,-4.097632,-2.713199,-1.274061,-1.237173,-2.389256,-0.9563396,-0.9079894,-2.119339,-1.906789,-1.095839,-0.8464502,-0.983651,-3.167905,-3.076999,-4.60533
25%,-0.4682473,-0.5725492,-0.7011651,-0.6360881,-0.742268,-0.7774752,-0.8840822,-0.7437895,-0.7259217,-0.9079894,-0.7347114,-0.7759837,-0.5518917,-0.7181469,-0.983651,-0.7075366,-0.6800278,-0.682819
50%,0.2317508,0.08107063,-0.07636654,-0.02942582,-0.04664538,-0.2808894,-0.1778997,-0.07776737,-0.3418918,-0.2800632,-0.1289368,-0.1416297,-0.279918,-0.3332369,-0.006194175,-0.0696633,-0.1190346,-0.072651
75%,0.6772042,0.6920631,0.6445549,0.6486085,0.6489772,0.4639892,0.6165557,0.6666103,0.4261678,0.5571718,0.563377,0.6306274,0.2640292,0.4365831,0.6454437,0.6137723,0.5949567,0.624684
max,1.440839,2.624504,4.777838,4.003094,4.706776,7.664483,5.206742,4.701921,9.028436,7.882978,7.702863,4.243687,12.50284,7.749873,5.206909,5.078885,4.827905,4.634362


In [26]:
from sklearn.cluster import KMeans

In [27]:
import plotly.graph_objects as go
# Elbow search: fit K-Means for k = 1..10 and record the inertia of each fit.
# A later cell adds a 'label' column to the scaled frame; it is excluded here
# so that re-running this cell afterwards does not feed previous cluster
# labels back in as a feature (errors="ignore" covers the first run, when the
# column does not exist yet).
elbow_features = batting_heavy_advanced_main_scale.drop(columns="label", errors="ignore")
inertia = []
for i in range(1,11):
    kmeans = KMeans(
        n_clusters=i, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
    kmeans.fit(elbow_features)
    inertia.append(kmeans.inertia_)

In [28]:
# Inertia curve for k = 1..10 with an arrow marking the chosen elbow.
# NOTE(review): the annotation points at k=3 (inertia[2]), but the model
# built in the next cell uses n_clusters=4 -- confirm which one is intended.
elbow_annotation = dict(
    x=3,
    y=inertia[2],
    xref="x",
    yref="y",
    text="Elbow!",
    showarrow=True,
    arrowhead=7,
    ax=20,
    ay=-40,
)
inertia_trace = go.Scatter(x=np.arange(1, 11), y=inertia)
fig = go.Figure(data=inertia_trace)
fig.update_layout(
    title="Inertia vs Cluster Number",
    xaxis={"range": [0, 11], "title": "Cluster Number"},
    yaxis=dict(title="Inertia"),
    annotations=[elbow_annotation],
)

In [29]:
# Final K-Means model with four clusters and a fixed seed.
# NOTE(review): the elbow plot above annotates k=3, while 4 is used here.
kmeans_params = dict(n_clusters=4, init="k-means++", n_init=10, tol=1e-04, random_state=42)
kmeans = KMeans(**kmeans_params)

In [30]:
# Fit the K-Means model and profile each cluster on a polar (radar) chart.
# The fit uses the feature columns only: this cell writes a 'label' column
# back into the scaled frame, so without the drop a second run would cluster
# on the previous labels as well (errors="ignore" covers the first run).
kmeans_features = batting_heavy_advanced_main_scale.drop(columns="label", errors="ignore")
kmeans.fit(kmeans_features)
batting_heavy_advanced_main_scale['label'] = kmeans.labels_

# Mean standardised value of every feature per cluster, reshaped to long form
# (label, variable, value) as expected by line_polar.
polar = batting_heavy_advanced_main_scale.groupby("label").mean().reset_index()
polar = pd.melt(polar, id_vars=["label"])
fig4 = px.line_polar(polar, r="value", theta="variable", color="label", line_close=True,height=800,width=1200)
fig4.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [31]:
# Share of player-seasons assigned to each cluster.
pie = (
    batting_heavy_advanced_main_scale
    .groupby('label')
    .size()
    .reset_index(name='value')
)
px.pie(pie, values='value', names='label')

In [32]:
# Display the scaled features without the cluster label (nothing is assigned).
batting_heavy_advanced_main_scale.drop(columns='label')

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR
0,-1.613699,-1.311424,-0.605042,-1.135692,-0.974142,0.463989,-0.884082,0.823321,0.963810,-0.907989,-1.167407,-1.244854,-0.007944,-0.589844,-0.983651,-1.117598,-1.292020,-0.246985
1,-0.277339,0.635227,0.596493,1.433701,2.272097,0.463989,-0.530991,1.097566,1.731869,-0.907989,-1.297216,-1.327596,-0.279918,-0.333237,-0.983651,0.613772,0.696955,-0.246985
2,-1.040973,-0.103648,-0.845349,-0.029426,0.301166,0.215696,-0.884082,0.549077,0.195750,-0.907989,-1.427025,-1.658563,-0.279918,-0.461540,-0.983651,-0.980911,-0.833026,-0.595652
3,-0.277339,0.265789,-0.605042,0.220376,0.880851,0.712282,-1.060628,0.274833,0.272556,-0.907989,-1.340486,-0.831145,-1.095839,-0.333237,-0.983651,0.112586,-0.221033,0.276017
4,0.040842,0.379462,-0.749227,0.184690,0.069292,1.705454,-0.972355,0.431544,0.579780,-0.907989,-1.340486,-0.886306,0.536003,0.564886,-0.983651,0.067024,-0.068035,-0.334152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12251,0.931749,0.365253,-0.124428,-0.100798,-0.278520,-0.529182,0.351737,0.039766,-0.341892,-0.907989,1.125882,2.561270,-0.551892,-0.846450,-0.006194,-0.297475,-0.170034,-1.031487
12252,0.613568,0.564181,0.356186,0.577236,0.648977,-1.025768,0.351737,0.078944,-0.956340,-0.907989,-0.345285,1.016756,6.247450,-0.846450,0.971263,0.568210,0.594957,-0.508486
12253,-0.531884,-0.330994,0.308125,-0.814518,0.996788,-0.529182,0.528283,-0.430367,-0.879534,-0.907989,0.303759,1.127078,1.351924,-0.846450,1.297082,0.841584,0.135962,1.757854
12254,0.104478,-0.146275,0.548432,-0.529030,0.764914,-1.274061,1.499284,0.980033,-0.879534,-0.489372,-0.431824,1.678691,-0.823865,-0.846450,0.971263,-0.343038,-0.323032,-0.508486


In [35]:
from sklearn.decomposition import PCA
pca_num_components = 2

# Project the scaled features (cluster label excluded) onto two principal
# components for plotting. random_state is pinned because scikit-learn's
# default solver choice can fall back to randomized SVD for a tall, narrow
# matrix like this one, which would make the projection vary between runs.
pca_features = batting_heavy_advanced_main_scale.drop(columns='label')
reduced_data = PCA(n_components=pca_num_components, random_state=42).fit_transform(pca_features)
results = pd.DataFrame(reduced_data, columns=['pca1', 'pca2'])

In [36]:
# Scatter of the two principal components, coloured by cluster.
# Cluster ids are categories, not quantities: casting them to str makes
# Plotly draw one discrete colour/legend entry per cluster instead of a
# continuous colour bar over the integer labels.
px.scatter(
    results,
    x="pca1",
    y="pca2",
    color=batting_heavy_advanced_main_scale['label'].astype(str),
    hover_name=batting_heavy_advanced_final['fullName'],
)

In [37]:
from sklearn.cluster import DBSCAN
# Density-based clustering on the scaled features.
# The scaled frame still carries the K-Means 'label' column at this point, so
# it is dropped before fitting; otherwise the earlier cluster ids would be
# treated as a feature in the distance computation.
dbscan_features = batting_heavy_advanced_main_scale.drop(columns='label', errors='ignore')
cluster = DBSCAN(eps=5, min_samples=2).fit(dbscan_features)
# Overwrites the K-Means labels stored in the same column.
batting_heavy_advanced_main_scale['label'] = cluster.labels_
batting_heavy_advanced_main_scale

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,HBP,SH,SF,WAR,oWAR,dWAR,label
0,-1.613699,-1.311424,-0.605042,-1.135692,-0.974142,0.463989,-0.884082,0.823321,0.963810,-0.907989,-1.167407,-1.244854,-0.007944,-0.589844,-0.983651,-1.117598,-1.292020,-0.246985,0
1,-0.277339,0.635227,0.596493,1.433701,2.272097,0.463989,-0.530991,1.097566,1.731869,-0.907989,-1.297216,-1.327596,-0.279918,-0.333237,-0.983651,0.613772,0.696955,-0.246985,0
2,-1.040973,-0.103648,-0.845349,-0.029426,0.301166,0.215696,-0.884082,0.549077,0.195750,-0.907989,-1.427025,-1.658563,-0.279918,-0.461540,-0.983651,-0.980911,-0.833026,-0.595652,0
3,-0.277339,0.265789,-0.605042,0.220376,0.880851,0.712282,-1.060628,0.274833,0.272556,-0.907989,-1.340486,-0.831145,-1.095839,-0.333237,-0.983651,0.112586,-0.221033,0.276017,0
4,0.040842,0.379462,-0.749227,0.184690,0.069292,1.705454,-0.972355,0.431544,0.579780,-0.907989,-1.340486,-0.886306,0.536003,0.564886,-0.983651,0.067024,-0.068035,-0.334152,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12251,0.931749,0.365253,-0.124428,-0.100798,-0.278520,-0.529182,0.351737,0.039766,-0.341892,-0.907989,1.125882,2.561270,-0.551892,-0.846450,-0.006194,-0.297475,-0.170034,-1.031487,0
12252,0.613568,0.564181,0.356186,0.577236,0.648977,-1.025768,0.351737,0.078944,-0.956340,-0.907989,-0.345285,1.016756,6.247450,-0.846450,0.971263,0.568210,0.594957,-0.508486,0
12253,-0.531884,-0.330994,0.308125,-0.814518,0.996788,-0.529182,0.528283,-0.430367,-0.879534,-0.907989,0.303759,1.127078,1.351924,-0.846450,1.297082,0.841584,0.135962,1.757854,0
12254,0.104478,-0.146275,0.548432,-0.529030,0.764914,-1.274061,1.499284,0.980033,-0.879534,-0.489372,-0.431824,1.678691,-0.823865,-0.846450,0.971263,-0.343038,-0.323032,-0.508486,0


In [38]:
pca_num_components = 2

# Recompute the 2-D projection of the scaled features (label excluded).
# random_state is pinned because scikit-learn's default solver choice can fall
# back to randomized SVD for this shape; with a fixed seed this projection
# matches any other PCA of the same features run with the same seed.
pca_features = batting_heavy_advanced_main_scale.drop(columns='label')
reduced_data = PCA(n_components=pca_num_components, random_state=42).fit_transform(pca_features)
results = pd.DataFrame(reduced_data, columns=['pca1', 'pca2'])

In [39]:
# Scatter of the two principal components, coloured by the current 'label'
# column (the DBSCAN assignment from the previous cells).
# Labels are cast to str so Plotly uses discrete colours per cluster rather
# than a continuous colour bar over the integer ids.
px.scatter(
    results,
    x="pca1",
    y="pca2",
    color=batting_heavy_advanced_main_scale['label'].astype(str),
    hover_name=batting_heavy_advanced_final['fullName'],
)

Pitching


In [40]:
pca_num_components = 2

# NOTE(review): this cell sits under the pitching heading but still projects
# the *batting* feature matrix, repeating the earlier PCA cell -- it looks
# like a copy-paste placeholder for a pitching analysis; confirm.
# random_state is pinned because scikit-learn's default solver choice can fall
# back to randomized SVD for this shape, which would vary between runs.
pca_features = batting_heavy_advanced_main_scale.drop(columns='label')
reduced_data = PCA(n_components=pca_num_components, random_state=42).fit_transform(pca_features)
results = pd.DataFrame(reduced_data, columns=['pca1', 'pca2'])

In [41]:
# NOTE(review): placed under the pitching heading but plots the batting
# projection and batting player names, repeating the earlier scatter -- confirm.
# Labels are cast to str so Plotly uses discrete colours per cluster rather
# than a continuous colour bar over the integer ids.
px.scatter(
    results,
    x="pca1",
    y="pca2",
    color=batting_heavy_advanced_main_scale['label'].astype(str),
    hover_name=batting_heavy_advanced_final['fullName'],
)

Pitching