In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw data from the working directory and preview the first rows.
# NOTE(review): the file is called 'framingham.csv' while the columns used in
# later cells (Salary, CareerHmRuns, League, ...) look like baseball statistics
# -- confirm this is the intended file.
df = pd.read_csv(filepath_or_buffer='framingham.csv')
df.head(n=5)

In [None]:
# Scatter of Salary against CareerHmRuns. An explicit figure/axes pair is used
# and the axes are labelled so the plot can be read on its own.
fig, ax = plt.subplots()
ax.scatter(df['CareerHmRuns'], df['Salary'])
ax.set(xlabel='CareerHmRuns', ylabel='Salary', title='Salary vs. CareerHmRuns');

In [None]:
# Replace the categorical columns with indicator (dummy) columns; the first
# level of each variable is dropped and acts as the reference category.
categorical_cols = ['League', 'Division', 'NewLeague']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df.head()

In [None]:
# Predictor columns, in the order the models will see them
# (coefficient positions in later cells follow this order).
features = [
    # season statistics
    'AtBats', 'Hits', 'HmRuns', 'Runs', 'RBIs', 'Walks',
    # career statistics
    'Years', 'CareerAtBats', 'CareerHits', 'CareerHmRuns',
    'CareerRuns', 'CareerRBIs', 'CareerWalks',
    # fielding statistics
    'PutOuts', 'Assists', 'Errors',
    # indicator columns produced by the one-hot encoding
    'League_N', 'Division_E', 'Division_W', 'NewLeague_N',
]
X = df.loc[:, features]
y = df.loc[:, 'Salary']

# Hold out 30% of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

In [None]:
# Ordinary Linear Regression
reg = LinearRegression()
reg.fit(X_train, y_train)

# R^2 on the training set, then on the held-out test set
print(r2_score(y_train, reg.predict(X_train)))
print(r2_score(y_test, reg.predict(X_test)))

# Label each coefficient with its column name so the lookups below do not
# depend on the position of a feature in the list (previously hard-coded as
# positions 9, 17 and 18).
coef = pd.Series(reg.coef_, index=X_train.columns)
print(coef['CareerHmRuns'])
# Differences between the Division indicator coefficients
print(coef['Division_W'] - coef['Division_E'])
print(-coef['Division_W'])
print(coef['Division_E'])

In [None]:
# Ridge
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2
# (passing it now raises TypeError), so the predictors are standardised
# explicitly in a pipeline instead.
# normalize=True divided each centred column by its L2 norm (= std * sqrt(n));
# StandardScaler divides by the std only, so the equivalent ridge penalty is
# alpha * n_samples. Results match the old behaviour approximately, not
# exactly, because the scaler is fit once on the full training set rather
# than inside each CV fold.
n_train = len(X_train)
alphas = np.exp(np.arange(-3, 3, .1))
reg = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas * n_train, cv=10))
reg.fit(X_train, y_train)
print(r2_score(y_train, reg.predict(X_train)))
print(r2_score(y_test, reg.predict(X_test)))

scaler, ridge = reg[0], reg[-1]
# Selected penalty, reported on the original `alphas` grid
print(ridge.alpha_ / n_train)
# CareerHmRuns coefficient converted back to the original (unscaled) units
idx = X_train.columns.get_loc('CareerHmRuns')
print(ridge.coef_[idx] / scaler.scale_[idx])

In [None]:
# Lasso
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2
# (passing it now raises TypeError), so the predictors are standardised
# explicitly in a pipeline instead.
# normalize=True divided each centred column by its L2 norm (= std * sqrt(n));
# with StandardScaler the equivalent lasso penalty is alpha * sqrt(n_samples).
# Results match the old behaviour approximately, not exactly, because the
# scaler is fit once on the full training set rather than inside each CV fold.
root_n = np.sqrt(len(X_train))
alphas = np.exp(np.arange(-3, 3, .1))
reg = make_pipeline(
    StandardScaler(),
    LassoCV(alphas=alphas * root_n, cv=10, max_iter=10000),
)
reg.fit(X_train, y_train)
print(r2_score(y_train, reg.predict(X_train)))
print(r2_score(y_test, reg.predict(X_test)))

scaler, lasso = reg[0], reg[-1]
# Selected penalty, reported on the original `alphas` grid
print(lasso.alpha_ / root_n)
# Coefficients converted back to the original (unscaled) units;
# entries shrunk to exactly zero stay zero.
print(lasso.coef_ / scaler.scale_)

In [None]:
# Backward sequential feature selection (direction='backward'):
# keep 15 of the predictors, scored with an ordinary linear regression.
reg = LinearRegression()
sfs = SequentialFeatureSelector(
    estimator=reg,
    n_features_to_select=15,
    direction='backward',
)
sfs.fit(X_train, y_train)
# Boolean mask over the input columns: True for each feature that was kept
sfs.support_

In [None]:
# Refit ordinary least squares using only the columns kept by the selector,
# then report R^2 on the training and test sets.
X_train_sel = sfs.transform(X_train)
X_test_sel = sfs.transform(X_test)

reg = LinearRegression().fit(X_train_sel, y_train)
print(r2_score(y_train, reg.predict(X_train_sel)))
print(r2_score(y_test, reg.predict(X_test_sel)))