In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import xgboost
from xgboost import XGBRegressor

In [15]:
import warnings 
warnings.filterwarnings('ignore')

In [16]:
# Load the Hitters table and keep only the rows that have no missing values.
df = pd.read_csv("Hitters.csv").dropna()

# Expand the three categorical columns into indicator columns
# (one column per category level, named "<column>_<level>").
dms = pd.get_dummies(
    df[["League", "Division", "NewLeague"]]
)

In [23]:
def compML(df, y, alg):
    """Fit one regressor class on a hold-out split of ``df`` and print its test RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the target column ``y`` and the categorical
        columns ``League``, ``Division`` and ``NewLeague``; those must include
        the levels ``N``, ``W`` and ``N`` respectively so the indicator
        columns selected below exist. Every remaining column is cast to
        ``float64``.
    y : str
        Name of the target column in ``df``.
    alg : type
        Estimator class (not an instance). It is instantiated with no
        arguments and must provide ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    None
        The result is reported by printing ``<class name> : <RMSE>``.
    """
    categorical_cols = ["League", "Division", "NewLeague"]
    target = df[y]

    # Drop the requested target rather than a hard-coded "Salary": otherwise a
    # different `y` would stay in the feature matrix and leak into the model.
    numeric_features = df.drop(columns=[y] + categorical_cols).astype("float64")

    # Build the indicator columns from the frame that was passed in, so the
    # function does not depend on a notebook-global `dms` and the rows always
    # line up with `numeric_features`. One indicator per categorical is kept.
    indicators = pd.get_dummies(df[categorical_cols])[
        ["League_N", "Division_W", "NewLeague_N"]
    ]
    X = pd.concat([numeric_features, indicators], axis=1)

    # Train-test split: 25% held out, fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        target,
                                                        test_size=0.25,
                                                        random_state=42)

    # Modelling: default-constructed estimator, scored on the held-out rows.
    model = alg().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    model_name = alg.__name__
    print(model_name, ":", RMSE)

In [24]:
# Score a single estimator class (LightGBM's regressor) on the "Salary" target;
# compML prints the class name followed by its test-set RMSE.
compML(df, "Salary", LGBMRegressor)

LGBMRegressor : 363.8712087611089


In [25]:
# Same evaluation for the support-vector regressor class, again on "Salary";
# compML prints the class name followed by its test-set RMSE.
compML(df, "Salary", SVR)

SVR : 460.0032657244849


In [26]:
# Estimator classes (not instances) to compare, in the order they are reported.
models = [
    LGBMRegressor,
    XGBRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    DecisionTreeRegressor,
    MLPRegressor,
    KNeighborsRegressor,
    SVR,
]

In [28]:
# Run compML once per estimator class in `models`, always with "Salary" as the
# target; each call prints one "<class name> : <RMSE>" line.
for i in models:
    compML(df, "Salary", i)

LGBMRegressor : 363.8712087611089
XGBRegressor : 355.46515176059927
GradientBoostingRegressor : 350.8837559811925
RandomForestRegressor : 346.7253071126708
DecisionTreeRegressor : 468.0624480038897
MLPRegressor : 371.97387148453714
KNeighborsRegressor : 426.6570764525201
SVR : 460.0032657244849
