In [1]:
import pandas as pd
import os
# Every bare CSV name read below ("Boston.csv", "Housing.csv") is resolved
# relative to this directory.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this folder exists.
os.chdir("C:/Training/Academy/Statistics (Python)/Datasets")
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Read the Boston data from the working directory and report (rows, columns).
boston = pd.read_csv(filepath_or_buffer="Boston.csv")
boston.shape

(506, 14)

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# repeatable from run to run.
train, test = train_test_split(boston, random_state=23, test_size=0.3)
# Show the shape of each partition as a pair.
tuple(part.shape for part in (train, test))

((354, 14), (152, 14))

In [4]:
# Preview the first five training rows (n=5 is also the default).
train.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
481,5.70818,0.0,18.1,0,0.532,6.75,74.9,3.3317,24,666,20.2,393.07,7.74,23.7
97,0.12083,0.0,2.89,0,0.445,8.069,76.0,3.4952,2,276,18.0,396.9,4.21,38.7
51,0.04337,21.0,5.64,0,0.439,6.115,63.0,6.8147,4,243,16.8,393.97,9.43,20.5
293,0.08265,0.0,13.92,0,0.437,6.127,18.4,5.5027,4,289,16.0,396.9,8.58,23.9
27,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21.0,306.38,17.28,14.8


In [5]:
# Preview the first five hold-out rows (n=5 is also the default).
test.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
176,0.07022,0.0,4.05,0,0.51,6.02,47.2,3.5549,5,296,16.6,393.23,10.11,23.2
311,0.79041,0.0,9.9,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.9,5.98,22.1
94,0.04294,28.0,15.04,0,0.464,6.249,77.3,3.615,4,270,18.2,396.9,10.59,20.6
139,0.54452,0.0,21.89,0,0.624,6.151,97.9,1.6687,4,437,21.2,396.9,18.46,17.8
232,0.57529,0.0,6.2,0,0.507,8.337,73.3,3.8384,8,307,17.4,385.91,2.47,41.7


In [6]:
# Separate the response column `medv` from the predictors in each partition.
X_train, y_train = train.drop(columns='medv'), train['medv']
X_test, y_test = test.drop(columns='medv'), test['medv']

degree = 1

In [7]:
# Baseline: ordinary least squares on the untransformed predictors.
lr = LinearRegression().fit(X_train, y_train)  # fit() hands back the estimator
ycap = lr.predict(X_test)
# R^2 on the hold-out rows.
print(r2_score(y_true=y_test, y_pred=ycap))

0.6947991644651343


degree = 2

In [8]:
# Build the degree-2 polynomial expansion of the predictors. The expansion is
# learned from the training rows only and then applied to those same rows.
poly = PolynomialFeatures(degree=2)
X_poly_trn = poly.fit(X_train).transform(X_train)
# (rows, expanded columns)
X_poly_trn.shape

(354, 105)

In [9]:
# Fit a separate estimator on the degree-2 expansion instead of re-fitting the
# `lr` object created in the degree-1 cell. Re-fitting in place overwrote the
# baseline model, so `lr` no longer matched the cell that created it and the
# baseline could not be compared or reused without re-running that cell.
lr_deg2 = LinearRegression()
lr_deg2.fit(X_poly_trn, y_train)

# The hold-out rows are only transformed: `poly` was already fitted on the
# training rows in the previous cell.
X_poly_tst = poly.transform(X_test)
ycap = lr_deg2.predict(X_poly_tst)

# R^2 on the hold-out rows for the degree-2 model.
print(r2_score(y_test, ycap))

0.42226997558888557


### Using Pipelines

In [10]:
from sklearn.pipeline import Pipeline

In [11]:
# Degree-2 model again, this time with the expansion and the regression
# chained in a Pipeline so one fit/predict call drives both steps.
poly, lr = PolynomialFeatures(degree=2), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
# Hold-out R^2.
print(r2_score(y_true=y_test, y_pred=ycap))

0.42226997558888557


In [12]:
# Degree-3 expansion followed by least squares, evaluated on the hold-out rows.
# In the recorded run the printed R^2 is strongly negative, i.e. the model
# does worse than a constant prediction on unseen rows.
poly, lr = PolynomialFeatures(degree=3), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-1821.658004486805


## Housing Dataset

In [13]:
# Read the housing data and dummy-encode it; drop_first=True omits the first
# level of every encoded column.
housing = pd.read_csv(filepath_or_buffer="Housing.csv")
dum_hous = pd.get_dummies(data=housing, drop_first=True)

# Response series and predictor frame for the full (unsplit) data.
# NOTE(review): in the visible cells X and y are not read again before they
# are reassigned in the Concrete section — confirm they are needed here.
y = dum_hous['price']
X = dum_hous.drop(columns='price')

In [14]:
# 70/30 partition of the encoded housing frame with a fixed seed.
train, test = train_test_split(dum_hous, random_state=23, test_size=0.3)
# Show the shape of each partition as a pair.
tuple(part.shape for part in (train, test))

((382, 12), (164, 12))

In [15]:
# Separate the response column `price` from the predictors in each partition.
X_train, y_train = train.drop(columns='price'), train['price']
X_test, y_test = test.drop(columns='price'), test['price']

In [16]:
# Baseline for the housing data: least squares on the encoded predictors.
lr = LinearRegression().fit(X_train, y_train)  # fit() hands back the estimator
ycap = lr.predict(X_test)
# R^2 on the hold-out rows.
print(r2_score(y_true=y_test, y_pred=ycap))

0.6543071090954233


In [17]:
# Degree-2 expansion + least squares on the housing data, scored on the
# hold-out rows.
poly, lr = PolynomialFeatures(degree=2), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

0.4634636634889163


In [18]:
# Degree-3 expansion + least squares on the housing data. The recorded R^2 for
# this cell is negative, so the extra terms hurt hold-out performance here.
poly, lr = PolynomialFeatures(degree=3), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-5.351577818627877


## Concrete Strength

In [19]:
# Load the concrete data set from its absolute location and list its column
# labels.
concrete = pd.read_csv(
    filepath_or_buffer=r"C:\Training\Academy\Statistics (Python)\Cases\Concrete Strength\Concrete_Data.csv"
)
concrete.columns

Index(['Cement', 'Blast', 'Fly', 'Water', 'Superplasticizer', 'Coarse', 'Fine',
       'Age', 'Strength'],
      dtype='object')

In [20]:
# `Strength` is the response; every remaining column is a predictor.
y = concrete['Strength']
X = concrete.drop(columns='Strength')

In [21]:
# Split predictors and response in a single call so the rows stay paired;
# 30% is held out and the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=23, test_size=0.3
)
# Shapes of the four resulting pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((721, 8), (309, 8), (721,), (309,))

In [22]:
# Baseline for the concrete data: least squares on the raw predictors.
lr = LinearRegression().fit(X_train, y_train)  # fit() hands back the estimator
ycap = lr.predict(X_test)
# R^2 on the hold-out rows.
print(r2_score(y_true=y_test, y_pred=ycap))

0.6312960386440597


In [23]:
# Degree-2 expansion + least squares on the concrete data, scored on the
# hold-out rows.
poly, lr = PolynomialFeatures(degree=2), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

0.7855626359650658


In [24]:
# Degree-3 expansion + least squares on the concrete data. Of the degrees
# tried in this section, this one has the highest recorded hold-out R^2.
poly, lr = PolynomialFeatures(degree=3), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

0.836883959593107


In [25]:
# Degree-4 expansion + least squares on the concrete data. The recorded R^2
# turns negative at this degree, so the hold-out fit degrades past degree 3.
poly, lr = PolynomialFeatures(degree=4), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)]).fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-55.52023036179425


#### Unlabelled Data

In [26]:
# Load the unlabelled concrete rows that the final model will score.
tst = pd.read_csv(
    filepath_or_buffer=r"C:\Training\Academy\Statistics (Python)\Cases\Concrete Strength\testConcrete.csv"
)

###### Fit the best model on the whole data

In [27]:
# Rebuild the degree-3 pipeline and train it on every labelled row (X, y)
# rather than on the training partition only.
poly, lr = PolynomialFeatures(degree=3), LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
# Left as a bare expression so the fitted pipeline is still displayed.
pipe.fit(X, y)

## Inferencing

###### Generate the prediction with the best Model

In [28]:
# Select the training feature columns by name before predicting, so the frame
# handed to the model matches X in both column set and column order. A column
# missing from `tst` raises a KeyError here instead of yielding silently
# misaligned predictions, and any extra columns in `tst` are ignored.
# NOTE(review): the recorded output for this cell contains negative values and
# values above 1000 — confirm testConcrete.csv uses the same columns and units
# as the training data, and that the cubic model is not simply extrapolating.
predictions = pipe.predict(tst[X.columns])
predictions

array([  39.11042923,   -6.2155813 ,  107.15161394, -165.55714369,
        156.18355928, 1044.37605811,  110.18257279,  142.06318532,
        342.40079478,  879.38426123,   62.44514781,  448.7111933 ,
       -116.57260024,  477.85763322])