In [1]:
#Import frameworks
import pandas as pd
import numpy as np
from sklearn import linear_model
from matplotlib import pyplot as plt

In [2]:
#Load the data
data = pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0,yearID,name,R,innings,BA,OBP,SLG,ISO,TA,LinWeights
0,1951,Boston Red Sox,804,1386,0.2655,0.356048,0.3916,0.1261,0.592546,128.78
1,1951,Brooklyn Dodgers,855,1422,0.2751,0.346842,0.4345,0.1593,0.560849,161.84
2,1951,Boston Braves,723,1395,0.2617,0.332878,0.3935,0.1319,0.551497,72.63
3,1951,Chicago White Sox,714,1395,0.2702,0.342986,0.3845,0.1144,0.587985,66.05
4,1951,Chicago Cubs,614,1395,0.25,0.311895,0.3637,0.1136,0.505736,-35.13


In [25]:
#Get averages for each year and place in new dataframe
league_avg = []
for year in data['yearID'].unique():
    year_avg = data[data['yearID']==year].mean()
    league_avg.append(year_avg)
data_avg = pd.concat(league_avg,axis=1)
data_avg = data_avg.transpose()
data_avg.head()

Unnamed: 0,yearID,R,innings,BA,OBP,SLG,ISO,TA,LinWeights
0,1951,704.25,1393.875,0.260819,0.333139,0.385394,0.124581,0.549212,50.603125
1,1952,646.8125,1393.875,0.252638,0.323097,0.369581,0.116944,0.523321,-4.025625
2,1953,714.125,1395.0,0.2642,0.332383,0.397156,0.132963,0.544413,65.166875
3,1954,676.6875,1391.625,0.260937,0.332634,0.389981,0.12905,0.546985,55.074375
4,1955,691.8125,1388.25,0.258481,0.330753,0.39395,0.13545,0.532121,56.62125


In [81]:
#Generate Y data and reshape to (NUM,1)
Y = list(data['R'])
Y = np.array(Y).reshape((len(Y),1))



    

In [128]:
def generate_X(col_name):
    X_STAT = []
    for i in range(0,len(data)):
        team_stat = data[col_name][i]
        league_stat = data_avg[data_avg['yearID']==data['yearID'][i]][col_name]
        innings = data['innings'][i]
        league_runs = data_avg[data_avg['yearID']==data['yearID'][i]]['R']
        league_innings = data_avg[data_avg['yearID']==data['yearID'][i]]['innings']
        league_runs_over_innings = league_runs/league_innings
        stat = float((team_stat/league_stat)*(innings*league_runs_over_innings))
        X_STAT.append(stat)
    X_STAT = np.array(X_STAT).reshape((len(X_STAT),1))
    return X_STAT
    

In [131]:
X_BA = generate_X('BA')
X_OBP = generate_X('OBP')
X_SLG = generate_X('SLG')
X_ISO = generate_X('ISO')
X_TA = generate_X('TA')
X_LW = generate_X('LinWeights')

In [146]:
def regression_errors(X,Y):
    reg = linear_model.LinearRegression()
    reg.fit(X,Y)
    errors = regression.predict(X)-Y
    return errors
    

In [147]:
errors_BA = regression_errors(X_BA,Y)
errors_OBP = regression_errors(X_OBP,Y)
errors_SLG = regression_errors(X_SLG,Y)
errors_ISO = regression_errors(X_ISO,Y)
errors_TA = regression_errors(X_TA,Y)
errors_LW = regression_errors(X_LW,Y)

In [154]:
#plt.hist(errors_LW,50,normed=1,facecolor='blue',alpha=.5)
plt.hist(errors_IS,50,normed=1,facecolor='red',alpha=.5)

plt.show()

In [151]:
print('Batting Average STD: {0}'.format(errors_BA.std()))
print('On Base Percentage STD: {0}'.format(errors_OBP.std()))
print('Slugging Percentage STD: {0}'.format(errors_SLG.std()))
print('Isolated Power STD: {0}'.format(errors_ISO.std()))
print('Total Average STD: {0}'.format(errors_TA.std()))
print('Linear Weights STD: {0}'.format(errors_LW.std()))

Batting Average STD: 50.5811115328
On Base Percentage STD: 48.3657765256
Slugging Percentage STD: 36.9237061258
Isolated Power STD: 68.1446100477
Total Average STD: 49.2590684697
Linear Weights STD: 7455.26039352
