In [158]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

#### Data

The data is obtained from UCI's Machine Learning Repository and contains information on concrete compressive strength as a function of the mix components and the age of the concrete.

Link: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [153]:
# Load the concrete data set; the path is relative, so Concrete_Data.csv must sit
# in the notebook's working directory.
data = pd.read_csv("Concrete_Data.csv")

In [154]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1030, 9)

In [155]:
# Column labels exactly as read from the CSV, before they are replaced with short names.
data.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [14]:
# Preview the first five rows.
data.head()

Unnamed: 0,cement,Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [156]:
# Swap the long CSV headers for short, formula-friendly names.
# The assignment is positional: the list order must match the CSV column order,
# and the frame must still have exactly nine columns when this cell runs.
data.columns = [
    "cement",
    "slag",
    "ash",
    "water",
    "superplasticizer",
    "coarse",
    "fine",
    "age",
    "strength",
]

In [201]:
# Squared age, used as an extra predictor by the second KNN model and the OLS formula.
data["age_squared"] = data["age"].pow(2)

In [202]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,cement,slag,ash,water,superplasticizer,coarse,fine,age,strength,age_squared
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961,6071.594175
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742,20195.954706
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33,1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71,49.0
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445,784.0
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135,3136.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6,133225.0


In [200]:
# Baseline model: 15-nearest-neighbour regression with inverse-distance weighting,
# fitted on all eight raw predictors.
# NOTE(review): with no `scoring` argument, cross_val_score uses the estimator's own
# score method, which for a regressor is R^2 -- so the value printed below under the
# label "Weighted Accuracy" is a mean R^2 across folds, not a classification accuracy.
knn = neighbors.KNeighborsRegressor(
    n_neighbors=15,
    weights='distance',
)
X = data[[
    "cement", "slag", "ash", "water",
    "superplasticizer", "coarse", "fine", "age",
]]
Y = data.strength
# cross_val_score fits its own clones per fold; this call only leaves `knn` itself
# fitted on the full data set.
knn.fit(X, Y)
score = cross_val_score(knn, X, Y, cv=3)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), 2 * score.std()))

Weighted Accuracy: 0.41 (+/- 0.15)


In [244]:
# Second KNN model: same estimator settings, but the predictors drop the coarse and
# fine aggregate columns and add age_squared; evaluated with 10 folds instead of 3.
# NOTE(review): with no `scoring` argument, cross_val_score uses the estimator's own
# score method, which for a regressor is R^2 -- so the value printed below under the
# label "Weighted Accuracy" is a mean R^2 across folds, not a classification accuracy.
knn = neighbors.KNeighborsRegressor(
    n_neighbors=15,
    weights='distance',
)
X = data[[
    "cement", "slag", "ash", "water",
    "superplasticizer", "age", "age_squared",
]]
Y = data.strength
# cross_val_score fits its own clones per fold; this call only leaves `knn` itself
# fitted on the full data set.
knn.fit(X, Y)
score = cross_val_score(knn, X, Y, cv=10)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), 2 * score.std()))

Weighted Accuracy: 0.64 (+/- 0.52)


In [207]:
# OLS: ordinary least squares regression of strength on the same seven predictors
# used by the second KNN model (mix components, age and age_squared).
linear_formula = "strength ~ cement+slag+ash+water+superplasticizer+age+age_squared"
lm = smf.ols(
    formula=linear_formula,
    data=data,
).fit()


In [208]:
# Estimated coefficient for the intercept and each term of the fitted formula.
lm.params

Intercept           21.043748
cement               0.106594
slag                 0.087160
ash                  0.064582
water               -0.206101
superplasticizer     0.179027
age                  0.353197
age_squared         -0.000816
dtype: float64

In [209]:
# p-value associated with each estimated coefficient.
lm.pvalues

Intercept            1.620559e-09
cement              1.534229e-147
slag                 1.108314e-84
ash                  1.974587e-23
water                7.084681e-31
superplasticizer     9.690505e-03
age                 5.686029e-148
age_squared          2.005539e-92
dtype: float64

In [210]:
# R-squared of the fitted OLS model.
lm.rsquared

0.74308624170634974