In [82]:
from itertools import combinations
import pandas as pd
from pathlib import Path
from sklearn import linear_model, pipeline, compose, preprocessing, metrics
import time

In [2]:
# Folder holding the CSV inputs. The path is written relative to '~' and then
# expanded, so it resolves against the current user's home directory.
datafolder = Path('~/github/islr2/data')
datafolder = datafolder.expanduser()

In [3]:
# Load the untouched Hitters table; later cells derive cleaned frames from it
# rather than modifying it, so this raw copy stays available for inspection.
dfhitters_raw = pd.read_csv(datafolder.joinpath('Hitters.csv'))
dfhitters_raw

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [4]:
# Keep only the rows that have a Salary value: Salary is the regression target,
# so rows where it is missing cannot be used for fitting. The explicit copy
# makes the result independent of dfhitters_raw.
dfhitters = dfhitters_raw[dfhitters_raw['Salary'].notna()].copy()
dfhitters

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [50]:
# Right-hand-side candidates: every column except the target. The columns of
# this frame define the pool that the subset search draws predictors from.
dfhitters_rhs = dfhitters.drop('Salary', axis='columns')
dfhitters_rhs

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,A


In [51]:
def subsets_r2(columns):
    """Fit a linear regression of Salary on ``columns`` and return its R².

    Parameters
    ----------
    columns : list of str
        Names of columns of the notebook-level ``dfhitters`` frame to use as
        predictors.

    Returns
    -------
    float
        R² computed on the same rows the model was fitted on (in-sample, so it
        never decreases when a predictor is added).
    """
    X = dfhitters[columns]
    y = dfhitters['Salary']
    lm = linear_model.LinearRegression()
    ct = compose.ColumnTransformer([
        # One-hot encode everything that is NOT numeric. Selecting by
        # exclusion keeps the two selectors complementary: with the default
        # remainder='drop', a column matched by neither selector (e.g. a
        # category, bool or pandas string dtype column) would otherwise be
        # silently left out of the model.
        ('onehot', preprocessing.OneHotEncoder(), lambda df: df.select_dtypes(exclude='number').columns),
        # Numeric predictors are used as they are.
        ('passthrough', 'passthrough', lambda df: df.select_dtypes('number').columns)
    ])
    pipe = pipeline.Pipeline([('transformer', ct), ('regressor', lm)])
    pipe.fit(X, y)
    return metrics.r2_score(y, pipe.predict(X))

In [61]:
def subsets_formula(columns):
    """Return a formula-style label for a model of Salary on ``columns``.

    The predictor names are joined with ``' + '`` and placed after
    ``'Salary ~ '``, e.g. ``['Hits', 'CRBI']`` -> ``'Salary ~ Hits + CRBI'``.
    """
    right_hand_side = " + ".join(columns)
    return 'Salary ~ ' + right_hand_side

In [66]:
def subsets_all(num_vars):
    """Score every model that uses exactly ``num_vars`` predictors.

    Enumerates all ``num_vars``-sized combinations of the columns of the
    notebook-level ``dfhitters_rhs`` frame and returns one row per combination
    with three columns: ``num_vars`` (the subset size), ``formula`` (label from
    ``subsets_formula``) and ``r2`` (score from ``subsets_r2``).
    """
    # combinations() yields tuples; convert each to a list before it is used
    # to index the DataFrame inside subsets_r2.
    candidates = list(map(list, combinations(dfhitters_rhs.columns, num_vars)))
    formulas = list(map(subsets_formula, candidates))
    scores = list(map(subsets_r2, candidates))
    # The scalar num_vars is broadcast to every row.
    return pd.DataFrame({'num_vars': num_vars, 'formula': formulas, 'r2': scores})

In [94]:
# Exhaustive search over all predictor subsets of size 1 to 4, timed end to end.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the cell runs.
# NOTE(review): the saved output for this cell lists subset sizes 1-3 only, so
# it appears to predate range(1, 5) - re-run the cell to refresh it.
start = time.perf_counter()
dfsubsets = pd.concat([subsets_all(x) for x in range(1, 5)], ignore_index=True)
# For each subset size, show the row with the highest in-sample R².
display(dfsubsets.loc[dfsubsets.groupby('num_vars')['r2'].idxmax()])
end = time.perf_counter()
# Elapsed seconds (includes rendering the table above).
end - start

Unnamed: 0,num_vars,formula,r2
11,1,Salary ~ CRBI,0.32145
46,2,Salary ~ Hits + CRBI,0.425224
454,3,Salary ~ Hits + CRBI + PutOuts,0.451429


4.486625671386719