In [None]:
# Bootstrap cell for running this notebook outside Kaggle.
# Run it first so the notebook's Kaggle data source is imported; afterwards
# the cell may be deleted.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
import kagglehub
# The value returned by the download call is kept under a dataset-specific
# name — presumably the local directory holding the downloaded files
# (TODO confirm against the kagglehub documentation).
quantbruce_real_estate_price_prediction_path = kagglehub.dataset_download('quantbruce/real-estate-price-prediction')

print('Data source import complete.')


# **Polynomial Regression**

# **Review**

**It is best to look at the results of Linear Regression on this dataset first. For that, please refer to my previous notebook on [Linear Regression](https://www.kaggle.com/aminizahra/linear-regression).**



* We just completed a Linear Regression task, allowing us to predict future label values given a set of features!


* How can we now improve on a Linear Regression model?


* One approach is to consider **higher order relationships** on the features.


* There are two issues Polynomial Regression will address for us:
  * Non-linear feature relationships to label

  * Interaction terms between features
  

* Let's first explore non-linear relationships and how considering polynomial orders could help address this.


* Imagine a feature that is not linear:
<a href="https://uupload.ir/" target="_blank"><img src="https://s4.uupload.ir/files/1_eqiy.jpg" border="0" alt="آپلود عکس" /></a>


* We know log(x) is not a linear relationship.

* It will be difficult to find a linear relationship:


<a href="https://uupload.ir/" target="_blank"><img src="https://s4.uupload.ir/files/3_qylp.jpg" border="0" alt="آپلود عکس" /></a>

* What about the square of this feature?


<a href="https://uupload.ir/" target="_blank"><img src="https://s4.uupload.ir/files/4_8ov.jpg" border="0" alt="آپلود عکس" /></a>

* Even more so for higher orders!


<a href="https://uupload.ir/" target="_blank"><img src="https://s4.uupload.ir/files/5_rop2.jpg" border="0" alt="آپلود عکس" /></a>

* Let's now also consider interaction terms.


* What if features are only significant when in sync with one another?


* For example:
  * perhaps newspaper advertising spend by itself is not effective, but greatly increases effectiveness if added to a TV advertising campaign.


* Consumers who only watch a TV ad will create some sales, but consumers who watch TV and are later "reminded" through a newspaper ad could contribute even more sales than TV or newspaper alone!


* How can we check for this?


* The simplest way is to create a new feature that multiplies two existing features together to create an interaction term.


* We can keep the original features, and add on this interaction term.


* Fortunately Scikit-Learn does this for us easily through a preprocessing call.


* Scikit-Learn's preprocessing library contains many useful tools to apply to the original data set before model training.


* One such tool is `PolynomialFeatures`, which automatically creates both higher-order polynomial features and the interaction terms between all feature combinations.


* Converting Two Features A and B:
  * $$1,A,B,A^2, AB, B^2$$


* Generalized terms of features X1 and X2:
  * $$1,X_1, X_2, X_1^2, X_1X_2, X_2^2$$
  

# Expanding Simple Linear Regression

<a href="https://uupload.ir/" target="_blank"><img src="https://s4.uupload.ir/files/6_0hty.jpg" border="0" alt="آپلود عکس" /></a>

# **📤 Import all necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn import metrics

from joblib import dump, load

%matplotlib inline

# **💾 Check out the Data**

In [None]:
# On Kaggle the dataset is mounted under ../input. Anywhere else that path
# does not exist, so fall back to the copy fetched through kagglehub in the
# first cell of this notebook.
CSV_NAME = 'Real estate.csv'
try:
    df = pd.read_csv(f'../input/real-estate-price-prediction/{CSV_NAME}')
except FileNotFoundError:
    df = pd.read_csv(f'{quantbruce_real_estate_price_prediction_path}/{CSV_NAME}')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Report the frame's dimensions in plain words.
n_rows, n_cols = df.shape
print(f"The Dataset has {n_rows} Rows")

print(f"The Dataset has {n_cols} Columns")

In [None]:
# Print pandas' built-in summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Pairwise correlation table of all columns (Pearson is pandas' default method).
df.corr(method='pearson')

In [None]:
# Render the correlation matrix as an annotated heatmap.
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, cmap="YlGnBu", annot=True)

**Hint:** You can explore the data further on your own.

# **📊 Exploratory Data Analysis (EDA)**

In [None]:
# Scatter/histogram grid over every pair of columns in the frame.
sns.pairplot(data=df)

## **Define X and y**

In [None]:
TARGET = 'Y house price of unit area'

# Features: every column except the label. A 'No' column, if the CSV has one,
# is assumed to be a running row number rather than a property of the house
# (TODO confirm against the data), so it is excluded too. errors='ignore'
# makes that second drop a no-op when the column is absent.
X = df.drop(columns=TARGET).drop(columns='No', errors='ignore')

# Label: the house price per unit area.
y = df[TARGET]

# **✔️ Preprocessing**

In [None]:
# Degree-2 expansion: original features, their squares and pairwise products.
# include_bias=False leaves out the constant (all-ones) term.
polynomial_converter=PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# fit() is called only for its effect on the converter. It returns the
# converter itself, not feature data, so the result is deliberately not bound
# to `poly_features`. The trailing ';' suppresses the repr in the cell output.
polynomial_converter.fit(X);

In [None]:
# Expand X into its polynomial/interaction feature matrix using the fitted converter.
poly_features=polynomial_converter.transform(X)

In [None]:
# Shape of the expanded feature matrix (compare with X.shape in the next cell).
poly_features.shape

In [None]:
# Shape of the original feature frame, for comparison with poly_features.shape.
X.shape

# **🧱 Train Test Split**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(poly_features, y, random_state=101, test_size=0.3)
X_train, X_test, y_train, y_test = split

# **📈 Training a Polynomial Regression Model**

In [None]:
# The "polynomial" model is an ordinary LinearRegression; the polynomial part
# comes from the expanded features it is trained on.
polymodel=LinearRegression()

In [None]:
# Fit on the training portion of the polynomial features.
polymodel.fit(X_train, y_train)

 # **✔️  Predicting Test Data**

In [None]:
# Predict the label for the held-out test rows.
y_pred=polymodel.predict(X_test)

In [None]:
# Side-by-side view of actual values, predictions and their difference.
residuals = y_test - y_pred
prediction_table = pd.DataFrame(
    {'Y_Test': y_test, 'Y_Pred': y_pred, 'Residuals': residuals}
)
prediction_table.head(5)

# **✔️ Evaluating the Model**

In [None]:
# Error metrics of the polynomial model on the test set.
MSE_Poly = metrics.mean_squared_error(y_test, y_pred)
MAE_Poly = metrics.mean_absolute_error(y_test, y_pred)
RMSE_Poly = np.sqrt(MSE_Poly)

# One-column table, rows labelled by metric name.
pd.Series({'MAE': MAE_Poly, 'MSE': MSE_Poly, 'RMSE': RMSE_Poly}).to_frame('metrics')

# **✔️ Compare to the simple linear regression:**

In [None]:
# Baseline: plain linear regression on the untransformed features, using the
# same test_size and random_state as the polynomial split.
XS_train, XS_test, ys_train, ys_test = train_test_split(
    X, y, random_state=101, test_size=0.3
)
simplemodel = LinearRegression().fit(XS_train, ys_train)
ys_pred = simplemodel.predict(XS_test)

# Same three error metrics as for the polynomial model.
MSE_simple = metrics.mean_squared_error(ys_test, ys_pred)
MAE_simple = metrics.mean_absolute_error(ys_test, ys_pred)
RMSE_simple = np.sqrt(MSE_simple)

In [None]:
# Polynomial vs. simple model, one row per metric.
metric_names = ['MAE', 'MSE', 'RMSE']
metrics_comparison = pd.DataFrame(
    {
        'Poly Metrics': [MAE_Poly, MSE_Poly, RMSE_Poly],
        'Simple Metrics': [MAE_simple, MSE_simple, RMSE_simple],
    },
    index=metric_names,
)
metrics_comparison

# **✔️ Adjusting Model Parameters**

In [None]:
# Sweep the polynomial degree and record the train/test RMSE for each one.
# Loop-local names (suffix _d / _sweep) are used on purpose: reusing
# polynomial_converter, poly_features, polymodel, X_train, ... here would
# silently replace the degree-2 objects built earlier with degree-9 ones.

# Train list of RMSE per degree
train_RMSE_list = []
# Test list of RMSE per degree
test_RMSE_list = []

# The split does not depend on the degree, so it is done once on the raw
# features, with the same test_size and random_state as the earlier splits.
X_train_sweep, X_test_sweep, y_train_sweep, y_test_sweep = train_test_split(
    X, y, test_size=0.3, random_state=101)

for d in range(1, 10):

    # Preprocessing: polynomial features of degree d
    converter_d = PolynomialFeatures(degree=d, include_bias=False)
    X_train_d = converter_d.fit_transform(X_train_sweep)
    X_test_d = converter_d.transform(X_test_sweep)

    # Train the model
    model_d = LinearRegression().fit(X_train_d, y_train_sweep)

    # Predict on both train and test data
    y_train_pred = model_d.predict(X_train_d)
    y_test_pred = model_d.predict(X_test_d)

    # Evaluate: RMSE on each partition
    train_RMSE = np.sqrt(metrics.mean_squared_error(y_train_sweep, y_train_pred))
    test_RMSE = np.sqrt(metrics.mean_squared_error(y_test_sweep, y_test_pred))

    # Append the RMSE to the train and test lists
    train_RMSE_list.append(train_RMSE)
    test_RMSE_list.append(test_RMSE)

In [None]:
# Index the rows by polynomial degree (1..N). With the default 0-based index,
# row k would actually hold the RMSE for degree k+1.
pd.DataFrame(
    {'Train RMSE List': train_RMSE_list},
    index=pd.RangeIndex(1, len(train_RMSE_list) + 1, name='Degree'),
)

In [None]:
# Column label fixed ('Ttest' -> 'Test'). Rows are indexed by polynomial
# degree (1..N) instead of the default 0-based position.
pd.DataFrame(
    {'Test RMSE List': test_RMSE_list},
    index=pd.RangeIndex(1, len(test_RMSE_list) + 1, name='Degree'),
)

# **✔️ Plot the Polynomial degree VS RMSE**

In [None]:
# Train vs. test RMSE for the first five polynomial degrees only.
shown_degrees = range(1, 6)

fig, ax = plt.subplots()
ax.plot(shown_degrees, train_RMSE_list[:5], label='Train RMSE')
ax.plot(shown_degrees, test_RMSE_list[:5], label='Test RMSE')

ax.set_xlabel('Polynomial Degree')
ax.set_ylabel('RMSE')
ax.legend()