## **Working with Linear Regression**

## Step 1: Data Preparation

- Import the required libraries
- Load the Boston housing data set
- Prepare the data set


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

In [None]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the 'boston' data set from OpenML, requesting pandas
# objects (as_frame=True).
boston = fetch_openml(
    name='boston',
    version=1,
    as_frame=True,
    parser='auto',
)
# Convenience aliases for the parts of the returned object used below.
data, target, feature_names = boston.data, boston.target, boston.feature_names

## Step 2: Create a DataFrame and Check for Missing Values

- Create a DataFrame using the Boston housing data
- Display basic statistics
- Check for missing values


In [None]:
# Print the description text that ships with the fetched data set.
print(boston.DESCR)

In [None]:
# Assemble the working DataFrame: feature columns first, target column last.
feature_frame = pd.DataFrame(data=boston.data, columns=boston.feature_names)
feature_frame['HousePrice'] = boston.target
df = feature_frame
# Preview the first rows.
df.head()

__Observation__:

- This is the head of the data set.

In [None]:
# Summary statistics for the columns of df.
df.describe()

__Observation__:
- Here, you can see the summary statistics (count, mean, standard deviation, minimum, quartiles, and maximum) of each numeric column in the data set.

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

__Observation__:

- Every column reports zero missing values, so there are no missing values in the data set.

## Step 3: Remove Outliers from the HousePrice Column


- Use a boxplot to visualize the outliers
- Cap outliers in the __HousePrice__ column at the 1% and 99% quantiles (values outside these limits are replaced by the limits; no rows are dropped)


In [None]:
# Box plot of HousePrice, used to eyeball potential outliers.
sns.boxplot(df['HousePrice'])

__Observation__:

- There are outliers in the data set.

In [None]:
# Quantile cut-offs for HousePrice: 1% (lower) and 99% (upper).
house_price = df['HousePrice']
lower_limit = house_price.quantile(q=0.01)
upper_limit = house_price.quantile(q=0.99)

In [None]:
# Demo input for np.where: a 5x2 array of random integers drawn from [0, 100).
x = np.random.randint(low=0, high=100, size=(5, 2))
x

In [None]:
# np.where(condition, a, b): entries of x below 30 become 99, the rest are kept.
np.where(x<30,99,x)

In [None]:
# Display the lower cut-off value.
lower_limit

In [None]:
# Display the upper cut-off value.
upper_limit

In [None]:
# Cap HousePrice to the [lower_limit, upper_limit] range: values below the
# lower limit are raised to it, values above the upper limit are lowered to it.
df['HousePrice'] = df['HousePrice'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Display the DataFrame after capping HousePrice.
df

# Implement the Linear Regression Model

In [None]:
# (rows, columns) of the full DataFrame.
df.shape

In [None]:
# separate features and target

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except HousePrice; HousePrice is the target.
features = df.drop(columns="HousePrice")
house_price = df['HousePrice']
# Hold out 20% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    features, house_price, test_size=0.2, random_state=0
)

In [None]:
# Shape of the training features.
X_train.shape

In [None]:
# Shape of the test features.
X_test.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test target.
y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default settings.
linear = LinearRegression()

In [None]:
# Fit the linear regression on the training split.
linear.fit(X_train,y_train)

In [None]:
# Fitted coefficients (paired with X_train.columns in the equation built below).
linear.coef_

In [None]:
# Fitted intercept term.
linear.intercept_

In [None]:
# Render the fitted model as text: "y=<feature>*<coef>+...+<intercept>".
terms = [
    feature + "*" + str(weight) + "+"
    for feature, weight in zip(X_train.columns, linear.coef_)
]
y = "y=" + "".join(terms) + str(linear.intercept_)

In [None]:
# Print the textual form of the fitted regression equation.
print(y)

## Step 4: Test for Linearity of the Model

- Fit the model and display the summary
- Define functions to calculate residuals and plot actual vs. predicted values
- Test for linearity


In [None]:
import statsmodels.api as sm
import numpy as np
import pandas as pd

# Convert every feature column to a numeric dtype via pd.to_numeric.
boston.data = boston.data.apply(pd.to_numeric)

# NOTE(review): X_constant (the features with a constant column added) is built
# here but never used; the model below is fitted on the raw features, i.e.
# without an intercept term. Switching the fit to X_constant would also require
# passing the same constant-augmented matrix wherever boston_model.predict is
# called -- confirm the intent before changing.
X_constant = sm.add_constant(np.asarray(boston.data))
boston_model = sm.OLS(boston.target, np.asarray(boston.data)).fit()
# Display the regression summary table.
boston_model.summary()

- Define a function to calculate residual values by taking the actual and predicted values
- The value of residuals is equal to the difference between the actual and  predicted values.  

In [None]:
def calculate_residuals(model, features, label):
    """Build a table of actual values, predictions, and residuals.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        features: Feature matrix, passed unchanged to ``model.predict``.
        label: Observed target values, one per prediction.

    Returns:
        pandas.DataFrame with columns ``Actual``, ``Predicted`` and
        ``Residuals``, where ``Residuals = Actual - Predicted``.
    """
    predictions = model.predict(features)
    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
    # Signed difference. Taking abs() of each operand first gives a wrong
    # residual whenever an actual or predicted value is negative.
    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']
    return df_results

- Next, define a function to plot the actual and predicted values using __lmplot__.
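- The dashed orange line is the reference line where the predicted value equals the actual value (y = x); the closer the points lie to it, the better the predictions. No regression line is fitted by __lmplot__ here because `fit_reg=False`.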

In [None]:
def linear_assumptions(model, features, label):
    """Plot actual vs. predicted values with a y = x reference line.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        features: Feature matrix passed to ``calculate_residuals``.
        label: Observed target values.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    df_results = calculate_residuals(model, features, label)

    # Scatter only; fit_reg=False disables seaborn's own regression line.
    sns.lmplot(x='Actual', y='Predicted', data=df_results, fit_reg=False, height=7)

    # Span the reference diagonal over the plotted quantities only. Including
    # the Residuals column would stretch the line (and the axes) beyond the
    # data, and np.arange would stop short of the maximum.
    plotted = df_results[['Actual', 'Predicted']]
    lower, upper = plotted.min().min(), plotted.max().max()
    plt.plot([lower, upper], [lower, upper], color='darkorange', linestyle='--')
    plt.title('Actual vs. Predicted')
    plt.show()

- Now, run the function __linear_assumptions__ to show the graph, passing __boston_model__ as the model, __boston.data__ as the features, and __boston.target__ as the label.

In [None]:
# Plot actual vs. predicted values for boston_model over the full data set.
linear_assumptions(boston_model, boston.data, boston.target)

__Observation__:

- We can observe that the line does not represent all the data points.

## Step 5: Check for Multicollinearity

- Let's check the correlation between the variables in the data set.



In [None]:
# Pairwise correlation matrix of the columns of df, displayed with a
# 'coolwarm' background gradient.
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

- Calculate the variance inflation factor (VIF) for each feature
- Import the __variance_inflation_factor__ function from the __statsmodels.stats.outliers_influence__ module
- Set the features as the DataFrame, except the target variable
- Store the feature names and their VIF values in the __vif_data__ DataFrame

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np

# Feature matrix for the VIF computation: every column except the target,
# cast to float.
x = df.drop(columns=['HousePrice']).astype(float)

# One VIF per feature column, in column order.
feature_matrix = x.values
vif_scores = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]

vif_data = pd.DataFrame({'Features': x.columns, 'vif': vif_scores})
print(vif_data)

__Observation__:
- From the above output, we can infer that the columns **NOX, RM, AGE,** and **PTRATIO** have higher multicollinearity. Hence, we can drop them. 

## Step 6: Remove Multicollinear Features and Split the Data set

- Remove highly multicollinear features from the data set


In [None]:
# Drop the four features singled out in the observation above.
collinear_features = ['NOX', 'RM', 'AGE', 'PTRATIO']
df1 = df.drop(columns=collinear_features)

- Now, set the feature and target variables.

In [None]:
# Target is HousePrice; every remaining column of df1 is a feature.
target_column = 'HousePrice'
y = df1[target_column]
x = df1.drop(columns=[target_column])

- Next, split the data into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)


## Step 7: Fit the Model and Evaluate Performance

- Fit the model using OLS and display the summary.


In [None]:
# Coerce the feature columns to numeric (entries that cannot be parsed become
# NaN) and convert features and training target to NumPy arrays.
X_train = np.asarray(pd.DataFrame(X_train).apply(pd.to_numeric, errors='coerce'))
X_test = np.asarray(pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce'))
y_train = np.asarray(pd.DataFrame(y_train))

# Sanity checks: dtypes first, then NaN counts, for features and target.
for arr in (X_train, y_train):
    print(arr.dtype)
for arr in (X_train, y_train):
    print(np.isnan(arr).sum())

# NOTE(review): no constant column is added to X_train, so this OLS fit has
# no intercept term -- confirm that this is intended.
model = sm.OLS(y_train, X_train).fit()

print(model.summary())

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import linear_model

- Now, fit the model using linear regression.


In [None]:
# Fit scikit-learn's LinearRegression on the training split and predict on
# both splits. X_test has already been coerced to a numeric NumPy array in the
# preceding cell, so it is used directly; repeating the conversion and the
# test-set prediction a second time produced the same result.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)

- Evaluate the model using various metrics such as the **r2_score, mean_absolute_error, and mean_squared_error**

In [None]:
# Training-set metrics for the linear regression predictions.
train_metrics = (
    ("R Square", r2_score),
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
)
for metric_name, metric_fn in train_metrics:
    print(f"{metric_name}: {metric_fn(y_train, y_pred_train)}")

__Observation__:

- From the above output, we can observe that the model is a moderate fit for the given data.

In [None]:
# Test-set metrics for the linear regression predictions.
test_metrics = (
    ("R Square", r2_score),
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
)
for metric_name, metric_fn in test_metrics:
    print(f"{metric_name}: {metric_fn(y_test, y_pred_test)}")

__Observation__:

- The model moderately explains the testing data, as indicated by the **R** **Square** value. 

# SGD

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

# Stochastic gradient descent is sensitive to feature scale: with raw,
# differently scaled columns the updates can diverge and give a meaningless
# score. Standardize using statistics learned from the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype("float"))
X_test_scaled = scaler.transform(X_test.astype("float"))

sgd = SGDRegressor()

# SGDRegressor expects a 1-D target; y_train may be a column vector here.
sgd.fit(X_train_scaled, np.ravel(y_train))

y_pred_test = sgd.predict(X_test_scaled)

# R^2 of the SGD model on the test split.
r2_score(y_test, y_pred_test)

## Regularization

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score

# Synthetic regression problem. random_state is fixed so the generated data,
# and therefore the scores printed below, are reproducible across runs.
X, y = make_regression(n_samples=100, n_features=20, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train models (adjust hyperparameters as needed)
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),  # Tune alpha for Lasso
    'Ridge': Ridge(alpha=1.0),  # Tune alpha for Ridge
    'Elastic Net': ElasticNet(alpha=0.5, l1_ratio=0.7),  # Tune alpha and l1_ratio for Elastic Net
}

# Fit each model and report its test-set R^2, labeled with the model name so
# the scores can be told apart.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: r2 score is {r2_score(y_test, y_pred)}")

In [None]:
import pandas as pd
# Summary statistics of the synthetic feature matrix X, one column per feature.
pd.DataFrame(X).describe()