In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()
dataset = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)

In [3]:
boston_dataset.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [4]:
type(boston_dataset.target)

numpy.ndarray

In [5]:
X = dataset
y = boston_dataset.target

In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [7]:
print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)

Shape of X_train:  (354, 13)
Shape of X_test:  (152, 13)
Shape of y_train:  (354,)
Shape of y_test (152,)


In [8]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [9]:
prepross_steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
]

In [10]:
prepross_pipe = Pipeline(prepross_steps)

In [11]:
X_train_processed = prepross_pipe.fit_transform(X_train)

In [12]:
X_train_processed.shape

(354, 105)

In [13]:
model = Ridge(alpha=3.8, fit_intercept=True)
model.fit(X_train_processed, y_train)

Ridge(alpha=3.8)

In [14]:
from sklearn.metrics import r2_score

In [15]:
# Predicting R2 Score the Test set results
X_test_processed = prepross_pipe.transform(X_test)
y_pred = model.predict(X_test_processed)
r2_score_ridge_test = r2_score(y_test, y_pred)

# Predicting RMSE the Test set results
rmse_ridge = (np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2_score (test): ', r2_score_ridge_test)
print("RMSE: ", rmse_ridge)

R2_score (test):  0.8919654552985888
RMSE:  2.853062166443758


In [16]:
X_test['target']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['target']=y_test


In [17]:
X_test.to_csv('test_data.csv', index=False)

In [18]:
import joblib

In [19]:
joblib.dump(prepross_pipe, 'input_transformer.pkl')
joblib.dump(model, 'model.pkl')

['model.pkl']

In [20]:
prepross_pipe.transform(X_test.iloc[:,:13])

array([[ 1.        , -0.40602392, -0.50338975, ...,  0.0931984 ,
         0.08275914,  0.07348919],
       [ 1.        ,  0.22231925, -0.50338975, ...,  0.15443881,
         0.11550182,  0.08638159],
       [ 1.        , -0.38644682, -0.50338975, ...,  0.19474902,
        -0.3951179 ,  0.80163768],
       ...,
       [ 1.        , -0.41481182, -0.50338975, ...,  0.18085842,
        -0.21408315,  0.25341144],
       [ 1.        , -0.42291595,  2.98028035, ...,  0.19474902,
        -0.41585008,  0.88797   ],
       [ 1.        ,  0.71957428, -0.50338975, ..., 10.06033954,
        -2.93745764,  0.85769047]])

In [21]:
import sklearn
sklearn.__version__

'0.24.1'

In [22]:
np.__version__

'1.19.1'

In [23]:
import scipy

In [24]:
scipy.__version__

'1.4.1'

In [25]:
loaded_model = joblib.load('model.pkl')

In [26]:
y_pred = loaded_model.predict(X_test_processed)