In [6]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

#### Data

The data is obtained from UCI's machine learning repository and contains information on concrete strength based on various factors.  There are 1,030 rows and 9 columns.

Link: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [7]:
# Load the concrete dataset from a CSV file located relative to the working directory
data = pd.read_csv("Concrete_Data.csv")

In [8]:
# Confirm the dataset dimensions as a (rows, columns) tuple
data.shape

(1030, 9)

In [9]:
# View the original column names - they are long and embed units, so they
# are replaced with short names further down to make them easier to work with
data.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [10]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [11]:
# Rename columns to make them easier to work with.
# Names are assigned positionally, so the list must follow the original column order.
data.columns =["cement","slag","ash","water","superplasticizer","coarse","fine","age","strength"]

In [44]:
def print_OLS_output(formula=None, df=None):
    """Fit an OLS model with statsmodels and print its key statistics.

    Prints the fitted coefficients, their p-values and the model R-squared.

    Parameters
    ----------
    formula : str, optional
        Model formula passed to ``smf.ols``. When omitted, the notebook-level
        ``linear_formula`` variable is used, so existing zero-argument calls
        behave exactly as before.
    df : pandas.DataFrame, optional
        Frame containing the columns referenced by ``formula``. When omitted,
        the notebook-level ``data`` frame is used.

    Returns
    -------
    None
        Everything is reported via ``print``; nothing is returned so that a
        call at the end of a cell does not echo an extra value.
    """
    # Fall back to the notebook globals (looked up at call time) so callers
    # that set `linear_formula` and then call with no arguments keep working.
    if formula is None:
        formula = linear_formula
    if df is None:
        df = data

    lm = smf.ols(formula=formula, data=df).fit()
    print("Parameters:")
    print(lm.params)
    print("\n P-Values")
    print(lm.pvalues)
    print("\n R-Squared:")
    print(lm.rsquared)

In [45]:
# Derived features: squared age, and a 0/1 flag marking mixes that contain any fly ash
data["age_squared"] = data["age"].pow(2)
data["ash_modified"] = np.where(data["ash"].gt(0), 1, 0)

In [14]:
# Descriptive statistics for every numeric column, including the derived features
data.describe()

Unnamed: 0,cement,slag,ash,water,superplasticizer,coarse,fine,age,strength,age_squared,ash_modified
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961,6071.594175,0.450485
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742,20195.954706,0.497784
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33,1.0,0.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71,49.0,0.0
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445,784.0,0.0
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135,3136.0,1.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6,133225.0,1.0


In [35]:
def run_knn(features=None, target=None, n_neighbors=15, weights='distance', cv=3):
    """Cross-validate a KNN regressor and print the mean score and its spread.

    Parameters
    ----------
    features : array-like, optional
        Predictor matrix. When omitted, the notebook-level ``X`` is used, so
        existing zero-argument calls behave exactly as before.
    target : array-like, optional
        Response values. When omitted, the notebook-level ``Y`` is used.
    n_neighbors : int, default 15
        Number of neighbours passed to ``KNeighborsRegressor``.
    weights : str, default 'distance'
        Neighbour weighting scheme passed to ``KNeighborsRegressor``.
    cv : int, default 3
        Cross-validation setting passed to ``cross_val_score``.

    Returns
    -------
    None
        The summary line is printed.
    """
    # Fall back to the notebook globals (looked up at call time) so cells that
    # assign X / Y and then call run_knn() keep working unchanged.
    if features is None:
        features = X
    if target is None:
        target = Y

    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    # No explicit knn.fit() here: cross_val_score fits its own clone of the
    # estimator on each fold, so a separate fit on the full data is wasted work.
    score = cross_val_score(knn, features, target, cv=cv)
    # NOTE: with no `scoring` argument the estimator's own score is used, which
    # for a regressor is R^2 rather than a classification accuracy. The label is
    # kept as-is so the printed line matches previously recorded output.
    print("KNN Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
    

In [36]:
# KNN baseline: every original mix component plus age as predictors of strength
X = data.loc[:, ["cement", "slag", "ash", "water", "superplasticizer", "coarse", "fine", "age"]]
Y = data.loc[:, "strength"]
run_knn()

KNN Accuracy: 0.41 (+/- 0.15)


In [43]:
# KNN on the feature set suggested by the OLS work: coarse/fine dropped, age_squared added
# NOTE(review): this uses the raw `ash` column, whereas the modified OLS formula
# uses `ash_modified` - confirm which one was intended here.
X = data.loc[:, ["cement", "slag", "ash", "water", "superplasticizer", "age", "age_squared"]]
Y = data.loc[:, "strength"]
run_knn()

KNN Accuracy: 0.52 (+/- 0.15)


In [46]:
# OLS on the original model: strength regressed on all eight raw predictors.
# print_OLS_output() reads the formula from the `linear_formula` global set here.
linear_formula = "strength ~ cement+slag+ash+water+superplasticizer+coarse+fine+age"
print_OLS_output()

Parameters:
Intercept          -23.331214
cement               0.119804
slag                 0.103866
ash                  0.087934
water               -0.149918
superplasticizer     0.292225
coarse               0.018086
fine                 0.020190
age                  0.114222
dtype: float64

 P-Values
Intercept           3.803719e-01
cement              1.897989e-41
slag                1.598993e-23
ash                 5.019648e-12
water               2.008798e-04
superplasticizer    1.810241e-03
coarse              5.442481e-02
fine                5.949075e-02
age                 5.782491e-82
dtype: float64

 R-Squared:
0.615519870414


In [47]:
# OLS after modifying the model: coarse and fine are dropped, age_squared is added,
# and the raw ash amount is replaced by the 0/1 ash_modified flag.
linear_formula = "strength ~ cement+slag+water+superplasticizer+age+age_squared+ash_modified"
print_OLS_output()

Parameters:
Intercept           21.709873
cement               0.106857
slag                 0.089972
water               -0.213049
superplasticizer     0.120588
age                  0.353157
age_squared         -0.000811
ash_modified         9.202762
dtype: float64

 P-Values
Intercept            2.217806e-10
cement              5.258523e-161
slag                 2.564561e-92
water                1.825752e-33
superplasticizer     7.878037e-02
age                 3.624730e-151
age_squared          1.199693e-93
ash_modified         4.133177e-30
dtype: float64

 R-Squared:
0.750642742502


Between the two models I prefer OLS regression because of the ability to easily see p-values and which features are benefiting the model and which aren't.  Seeing the p-values for KNN may be possible, but the quick research I did made it seem like they were more complex to obtain, while with OLS in statsmodels they are easy to see quickly.  With extensive domain knowledge and experience I imagine KNN may become more useful than I currently see it.  The key challenges (which with experience may become benefits) lie in part in how much adjusting can be done to KNN.  Being able to choose the number of neighbors, as well as whether to use weighting or not, may be powerful tools I'll need to master long term, but at this point I'm still guessing and haven't fully gotten an understanding of how to get the most out of either adjustment.  One unfortunate piece of this exercise was that I didn't realize that this data set was submitted to the UCI database by someone who did research on why regression is not an appropriate modeling method for this specific set of data.  Instead they suggested neural networks, and I briefly glanced through their paper on how they were able to obtain much better results with them.  However, while I was never able to obtain a particularly good R-squared, I was able to make adjustments to the model to improve it versus my first version.