In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.impute import KNNImputer
import missingno as msno
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.linear_model import LinearRegression

## Data Understanding

In [None]:
# Load the raw Hitters dataset.
# NOTE(review): the relative "../input/..." path looks environment-specific
# (presumably a hosted-notebook input directory) — adjust when running elsewhere.
hitters = pd.read_csv("../input/regularization-of-hitters/Hitters.csv")

In [None]:
# Work on a copy so the frame loaded into `hitters` is never modified.
df = hitters.copy()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Rows with fewer than 70 hits. The comparison is strict, so rows with
# exactly 70 hits are not part of this view.
df[df["Hits"] < 70]

In [None]:
# Rows with more than 70 hits. The comparison is strict, so rows with
# exactly 70 hits are not part of this view.
df[df["Hits"] > 70]

In [None]:
# First ten rows whose Salary is missing.
df.loc[df["Salary"].isna()].head(10)

In [None]:
# Years against Salary, coloured by League and line-styled by Division.
sns.lineplot(
    data=df,
    x="Salary",
    y="Years",
    hue="League",
    style="Division",
)

In [None]:
# Summary statistics with tail percentiles, one row per column.
tail_percentiles = [0.01, 0.25, 0.75, 0.99]
df.describe(percentiles=tail_percentiles).T

In [None]:
# Correlation heatmap over the numeric columns only. League, Division and
# NewLeague are still categorical at this point (they are one-hot encoded
# further down), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently skipping them.
numeric_df = df.select_dtypes(include="number")
f, ax = plt.subplots(figsize=[20, 15])
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", ax=ax, cmap="coolwarm")
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
# Average salary per league.
df.groupby("League")["Salary"].mean()

In [None]:
df["Salary"].mean()

# Data Preparation

In [None]:
msno.matrix(df);

In [None]:
# Report, per numeric column, whether any value falls outside the Tukey
# fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) and, if so, how many rows do.
for i in ["Hits","HmRun","Runs","RBI","Walks","Years","CAtBat","CHits","CHmRun","CRuns","CRBI","CWalks","PutOuts","Assists","Errors","Salary","AtBat"]:

    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3-Q1
    upper = Q3 + 1.5*IQR
    lower = Q1 - 1.5*IQR

    # Build the mask once and test the mask itself: the question is whether
    # any row is flagged, not whether the flagged rows hold truthy values.
    outlier_mask = (df[i] > upper) | (df[i] < lower)

    if outlier_mask.any():
        print(i,"yes")
        print(outlier_mask.sum())
    else:
        print(i, "no")


# One-hot encoding with pandas

In [None]:
# Replace the three categorical columns with indicator columns.
# drop_first=True drops the first level of each, leaving the League_N,
# Division_W and NewLeague_N columns that later cells select by name.
df = pd.get_dummies(df, columns =["League","Division","NewLeague"], drop_first = True)

In [None]:
# Remember the encoded column names so they can be re-attached after the
# imputer output is turned back into a DataFrame.
cols = df.columns

# Missing values

In [None]:
# Fill missing values from the 6 nearest rows.
# NOTE(review): the frame passed in still contains Salary, so any missing
# target values are imputed too and later used as regression targets; the
# features are also not scaled before this distance-based step — confirm
# both are intended.
imputer = KNNImputer(n_neighbors=6)
df_filled = imputer.fit_transform(df)

# Outliers 

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Local Outlier Factor detector. `contamination` only affects the -1/1 labels
# returned by fit_predict; the notebook thresholds the raw
# negative_outlier_factor_ scores by hand in the following cells.
clf =LocalOutlierFactor(n_neighbors= 20,contamination= 0.1)

In [None]:
clf.fit_predict(df_filled)

In [None]:
# Per-row LOF scores from the fit above; more negative means more outlying.
df_scores = clf.negative_outlier_factor_

In [None]:
# The 30 most negative (most outlying) scores, ascending.
np.sort(df_scores)[:30]

In [None]:
# Outlier cut-off: the 9th most negative LOF score (index 8 of the ascending
# scores). The index is hard-coded — presumably chosen by eyeballing the
# sorted scores above; TODO confirm / document the rationale.
th = np.sort(df_scores)[8]
th

In [None]:
# Boolean mask that is True where the score is above the threshold, i.e. for
# the rows that are KEPT, despite the variable name. Not referenced by the
# later cells visible in this notebook.
outlier = df_scores > th

In [None]:
# Keep only rows scoring strictly above the threshold; the row that defines
# the threshold and every row below it are dropped.
dff = df_filled[df_scores > th]

In [None]:
# Wrap the filtered array back into a DataFrame (fresh 0..n-1 index).
# NOTE(review): `cols` is reassigned to the feature-only columns further
# down, so re-running this cell after that point would fail with a column
# count mismatch — the cell is only valid in top-to-bottom execution.
dff = pd.DataFrame(dff,columns = cols)

# PREDICTION

# Linear Regression

In [None]:

from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
# Separate the target column from the feature columns and create the
# (not yet fitted) scaler used on the numeric features below.
X = dff.drop(columns=["Salary"])
y = dff["Salary"]
scaler = StandardScaler()


In [None]:
# Copy of the cleaned frame with the full column set. Not referenced by any
# later cell visible in this notebook.
df_ = pd.DataFrame(dff, columns = cols)

In [None]:
y.head()

In [None]:
X.head()

In [None]:
# Set the indicator columns aside so they are excluded from standardization
# and can be concatenated back onto the scaled features afterwards.
dummies = dff[["League_N","Division_W","NewLeague_N"]]
dummies

In [None]:
# Leave only the continuous features in X; the indicators are held in `dummies`.
indicator_cols = ["League_N", "Division_W", "NewLeague_N"]
X = X.drop(columns=indicator_cols)

In [None]:
X.head()

In [None]:
# Reuses the name `cols`, now holding only the continuous feature names
# (needed to rebuild a DataFrame from the scaler's array output). This
# overwrites the full column list stored under the same name earlier.
cols = X.columns

In [None]:
# Standardize the continuous features.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split performed later, so test rows influence the scaling statistics.
X = scaler.fit_transform(X)

In [None]:
# Re-attach the feature names to the scaled array (default 0..n-1 index).
X = pd.DataFrame(X, columns = cols)

In [None]:
X.head()

In [None]:
# Final design matrix: scaled continuous features followed by the indicators.
X_ = pd.concat([X, dummies], axis="columns")

In [None]:
X_

In [None]:
y.head()

In [None]:
# 80/20 hold-out split with a fixed seed so every model below is compared on
# the same rows.
X_train, X_test, y_train, y_test = train_test_split(X_, 
                                                    y, 
                                                    test_size = 0.20, random_state = 46)

In [None]:
X_.head()

In [None]:
# Baseline: ordinary least squares on the training split.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the linear baseline.
y_pred = reg_model.predict(X_train)
linear_train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(linear_train_mse)

In [None]:
# Hold-out RMSE of the linear baseline.
y_pred = reg_model.predict(X_test)
linear_test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(linear_test_mse)

### Cross-validated Linear Regression

In [None]:
# 10-fold cross-validated RMSE on the training split (square root of the mean
# fold MSE; the scorer returns negated MSE, hence the minus sign). No
# hyperparameter is tuned here, and the value is not used by later cells
# visible in this notebook.
linear_tuned = np.sqrt(np.mean(-cross_val_score(reg_model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")))

In [None]:
# Hold-out predictions of the linear model, consumed by the score cell below.
y_pred = reg_model.predict(X_test)

In [None]:
# Test RMSE reported for the linear model in the final comparison table.
linear_sc = np.sqrt(mean_squared_error(y_test, y_pred))

## Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.linear_model import Ridge, Lasso

In [None]:
# Ridge baseline with the estimator's default settings.
ridge_model = Ridge()
ridge_model = ridge_model.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the default Ridge model.
y_pred = ridge_model.predict(X_train)
ridge_train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(ridge_train_mse)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LassoCV

In [None]:
# Hold-out RMSE of the default Ridge model.
y_pred = ridge_model.predict(X_test)
ridge_test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(ridge_test_mse)


In [None]:
# Candidate regularization strengths searched by RidgeCV below.
alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]

In [None]:
# Pick alpha by 10-fold CV on the training split.
# The `normalize` keyword is gone: it was deprecated in scikit-learn 1.0 and
# removed in 1.2 (passing it raises TypeError). The continuous features were
# already standardized upstream, and the final Ridge model is fitted without
# normalization, so the selected alpha now matches the model that uses it.
ridge_cv = RidgeCV(alphas=alphas, scoring="neg_mean_squared_error", cv=10).fit(X_train, y_train)

In [None]:
ridge_cv.alpha_

# Tuned Ridge

In [None]:
# Refit Ridge on the training split with the cross-validated alpha.
best_ridge_alpha = ridge_cv.alpha_
ridge_tuned = Ridge(alpha=best_ridge_alpha)
ridge_tuned = ridge_tuned.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the tuned Ridge model (reported in the final table).
y_pred = ridge_tuned.predict(X_test)
ridge_tuned_mse = mean_squared_error(y_test, y_pred)
ridge_sc = np.sqrt(ridge_tuned_mse)
ridge_sc

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Candidate regularization strengths searched by LassoCV below (same values
# as the list used for Ridge).
alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]

In [None]:
# Lasso baseline with the estimator's default settings.
lasso_model = Lasso()
lasso_model = lasso_model.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the default Lasso model.
y_pred = lasso_model.predict(X_train)
lasso_train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(lasso_train_mse)

In [None]:
# Hold-out RMSE of the default Lasso model.
y_pred = lasso_model.predict(X_test)
lasso_test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(lasso_test_mse)

In [None]:
# Pick the Lasso alpha by 10-fold CV over the candidate list; max_iter is
# raised to 10000 for the coordinate-descent solver.
lasso_cv_model = LassoCV(alphas = alphas, cv = 10,max_iter = 10000).fit(X_train, y_train)

In [None]:
lasso_cv_model.alpha_

# Tuned Lasso

In [None]:
# Refit Lasso on the training split with the cross-validated alpha.
best_lasso_alpha = lasso_cv_model.alpha_
lasso_tuned = Lasso(alpha=best_lasso_alpha)
lasso_tuned = lasso_tuned.fit(X_train, y_train)

In [None]:
# Hold-out predictions of the tuned Lasso model, consumed by the score cell below.
y_pred = lasso_tuned.predict(X_test)

In [None]:
# Test RMSE reported for Lasso in the final comparison table.
lasso_sc =np.sqrt(mean_squared_error(y_test,y_pred))
lasso_sc

# ElasticNet

In [None]:
from sklearn.linear_model import RidgeCV, LassoCV,ElasticNetCV
from sklearn.linear_model import ElasticNet

In [None]:
# ElasticNet baseline with default settings, scored on the hold-out split.
enet_model = ElasticNet()
enet_model.fit(X_train, y_train)
y_pred = enet_model.predict(X_test)
enet_default_mse = mean_squared_error(y_test, y_pred)
np.sqrt(enet_default_mse)

In [None]:
# Grid searched for ElasticNet: 8 l1_ratio values x 9 alpha values x 3
# max_iter values = 216 candidates. max_iter is a solver iteration cap rather
# than a regularization setting; including it triples the number of fits.
enet_params = {"l1_ratio": [0.001,0.01,0.1,0.4,0.5,0.6,0.8,1],
              "alpha":[0.1,0.01,0.001,0.2,0.3,0.5,0.8,0.9,1],
              "max_iter" :[1000,5000,10000]}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive 10-fold grid search on the training split, using all cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors), whereas the Ridge search above
# selects by negated MSE.
gs_cv_enet = GridSearchCV(enet_model, enet_params, cv = 10,n_jobs = -1).fit(X_train, y_train)

In [None]:
gs_cv_enet.best_params_

# Tuned ElasticNet

In [None]:
# Build ElasticNet from the grid-search winners. The `normalize` keyword is
# gone: it was deprecated in scikit-learn 1.0 and removed in 1.2 (passing it
# raises TypeError), and the grid search itself was run without it.
enet_tuned = ElasticNet(**gs_cv_enet.best_params_)

In [None]:
# Fit ElasticNet with the grid-search winners and score it on the hold-out
# split. Constructing a bare ElasticNet() here would discard the tuned
# hyperparameters and report the default model's RMSE as the "tuned" score.
enet_tuned = ElasticNet(**gs_cv_enet.best_params_).fit(X_train, y_train)
y_pred = enet_tuned.predict(X_test)
enet_sc = np.sqrt(mean_squared_error(y_test, y_pred))
enet_sc

# Reporting

In [None]:
# Comparison table of hold-out RMSE per model (lower is better).
# The "Ridge" label was previously misspelled as "Rigde".
models = pd.DataFrame({"Model" : ["Linear","Ridge","Lasso","ElasticNET"],
                     "Score" : [linear_sc,ridge_sc,lasso_sc,enet_sc]})

In [None]:
models

In [None]:
# One red dot per model showing its score from the comparison table.
_, score_ax = plt.subplots()
score_ax.plot(models["Model"], models["Score"], "ro")
score_ax.set(title="Regularization", xlabel="Models", ylabel="Scores")
plt.show()