###  Import the Libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font',size=14)
sns.set(style='white')
sns.set(style='whitegrid',color_codes=True)
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor)
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import VotingRegressor
from scipy import stats
from scipy.stats import zscore
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.utils import resample

ModuleNotFoundError: No module named 'seaborn'

#  Load the dataset

In [None]:
#reading the CSV file into pandas dataframe
df=pd.read_csv('concrete (1).csv')

In [None]:
#basic eda
def eda(df):
    """Print a plain-text overview of a DataFrame.

    Writes, each under its own banner: the first and last rows, the shape,
    per-column null counts (via both ``isnull`` and ``isna``), the column
    index, the dtypes, ``describe()`` and ``info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    print('--------------------------------------------HEAD-----------------------------------------------')
    print(df.head())
    print('--------------------------------------------TAIL-----------------------------------------------')
    print(df.tail())
    print('--------------------------------------------SHAPE-----------------------------------------------')
    print(df.shape)
    print('--------------------------------------------IS_NULL_SUM-----------------------------------------------')
    print(df.isnull().sum())
    print('--------------------------------------------IS_NA_SUM-----------------------------------------------')
    print(df.isna().sum())
    print('--------------------------------------------COLUMNS-----------------------------------------------')
    print(df.columns)
    print('--------------------------------------------DTYPES-----------------------------------------------')
    print(df.dtypes)
    print('--------------------------------------------DESCRIBE-----------------------------------------------')
    print(df.describe())
    print('--------------------------------------------INFO-----------------------------------------------')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() appended a stray "None" line after the report.
    df.info()

In [None]:
eda(df)

# 1. Exploratory data quality report

## 1.1. Univariate analysis

### Description of independent attributes

### Cement

#### Range of values observed

In [None]:
# Spread (max - min) of each independent attribute, printed under an upper-cased banner.
for feature in ('cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age'):
    print('--------------------------------------------' + feature.upper() + '-----------------------------------------------')
    column = df[feature]
    print('Range of values: ', column.max() - column.min())

#### Central values

In [None]:
# Quartiles, IQR and Tukey-fence outlier counts for every column except the last one.
for i in df.columns[0:-1]:
    print('---------------------------'+i+' VALUES'+'--------------------------------------')
    Q1=df[i].quantile(q=0.25)
    Q3=df[i].quantile(q=0.75)
    # Computed from the quartiles directly, so this cell does not depend on the
    # `stats` name (which a later cell in this notebook rebinds to a list).
    IQR=Q3-Q1
    print('1st Quartile (Q1) is: ', Q1)
    print('3rd Quartile (Q3) is: ', Q3)
    print('Interquartile range (IQR) is ', IQR)

    # Tukey fences: values below Q1-1.5*IQR or above Q3+1.5*IQR count as outliers.
    L_outliers=Q1-1.5*IQR
    U_outliers=Q3+1.5*IQR
    # Label the fences with the current column (they used to say "cement" for every column).
    print('Lower outlier limit in '+i+': ', L_outliers)
    print('Upper outlier limit in '+i+': ', U_outliers)

    n_upper=df[df[i]>U_outliers][i].count()
    n_lower=df[df[i]<L_outliers][i].count()
    print('Number of outliers in '+i+' upper : ', n_upper)
    print('Number of outliers in '+i+' lower : ', n_lower)
    # Two decimals, so that a handful of outliers is not rounded down to "0 %".
    print('% of Outlier in '+i+' upper: ',round(n_upper*100/len(df), 2), '%')
    print('% of Outlier in '+i+' lower: ',round(n_lower*100/len(df), 2), '%')

In [None]:
# One figure per feature (every column except the last): box plot, density plot and histogram.
for i in df.columns[0:-1]:
    fig, (ax1,ax2,ax3)=plt.subplots(1,3,figsize=(13,5))

    #boxplot -- y=i draws the vertical box that the y-axis label below describes
    sns.boxplot(y=i,data=df,ax=ax1)
    ax1.set_ylabel(i, fontsize=15)
    ax1.set_title('Distribution of '+i, fontsize=15)
    ax1.tick_params(labelsize=15)

    #distplot -- univariate view of the feature itself, so the y axis is a
    #density, not the target. NOTE(review): seaborn documents distplot as
    #deprecated in favour of histplot/displot; kept here to avoid requiring a
    #newer seaborn than the notebook was written for.
    sns.distplot(df[i],ax=ax2)
    ax2.set_xlabel(i, fontsize=15)
    ax2.set_ylabel('Density', fontsize=15)
    ax2.set_title('Density of '+i, fontsize=15)
    ax2.tick_params(labelsize=15)

    #histogram -- bar heights are counts of observations
    ax3.hist(df[i])
    ax3.set_xlabel(i, fontsize=15)
    ax3.set_ylabel('Count', fontsize=15)
    ax3.set_title('Histogram of '+i, fontsize=15)
    ax3.tick_params(labelsize=15)

    # tight_layout recomputes the subplot spacing, so a preceding
    # subplots_adjust(wspace=...) had no lasting effect and is dropped.
    plt.tight_layout()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

From the box plots we can observe:
- the distribution of outliers
- the quartile range

From the distplots and histograms we can observe that:
- cement is almost normal.
- slag has three Gaussians and is right-skewed.
- ash has two Gaussians and is right-skewed.
- water has three Gaussians and is slightly left-skewed.
- superplastic has two Gaussians and is right-skewed.
- coarseagg has three Gaussians and is almost normal.
- fineagg has almost two Gaussians and looks normal.
- age has multiple Gaussians and is right-skewed.

#### Range of values observed

In [None]:
# Spread (max - min) of every column of the frame.
for name, series in df.items():
    print('---------------'+name+'-----------------')
    print('Range of values: ', series.max() - series.min())

## 1.2. Multivariate Analysis

In [None]:
# Histogram 
df.hist(figsize=(15,15))

* These histograms give the same information as the distribution plots (distplot) above.

In [None]:
# pairplot- plot density curve instead of histogram in diagonal
sns.pairplot(df, diag_kind='kde')  

#### Diagonals Analysis
The diagonal gives the same information we got using distplot.
* cement has an almost normal curve.
* slag has two Gaussians and is right-skewed. It shows the presence of outliers.
* ash has two Gaussians and is right-skewed. It shows the presence of outliers.
* water has several Gaussians and is slightly left-skewed. It shows the presence of outliers.
* superplastic has multiple Gaussians and is right-skewed. It shows the presence of outliers.
* coarseagg has three Gaussians and is almost normal.
* fineagg has almost two Gaussians and looks normal.
* age has multiple Gaussians and is right-skewed. It shows the presence of outliers.
* strength is close to a normal curve.

Besides checking for missing values, we also have an outlier problem to handle in the dataset.

#### Off Diagonal Analysis: Relationship between independent attributes
##### Scatter plots
- cement vs other independent attributes: This attribute does not have any significant relationship with slag, ash, water, superplastic, coarseagg, fineagg or age. The points are spread almost like a cloud. If we had calculated the r value, it would have come out close to 0.
- slag vs other independent attributes: This attribute also does not have any significant relationship with ash, water, superplastic, coarseagg, fineagg or age. The points are spread almost like a cloud. If we had calculated the r value, it would have come out close to 0.
- ash vs other independent attributes: This attribute also does not have any significant relationship with water, superplastic, coarseagg, fineagg or age. The points are spread almost like a cloud. If we had calculated the r value, it would have come out close to 0.
- water vs other independent attributes: This attribute has a negative linear relationship with superplastic and fineagg. It does not have any significant relationship with the other independent attributes. This makes sense, as superplasticizers allow the water in concrete to be reduced by up to 30% without reducing workability.
- superplastic vs other independent attributes: This attribute has a negative linear relationship with water only. It does not have any significant relationship with the other independent attributes.
- coarseagg vs other independent attributes: This attribute also does not have any significant relationship with any other attribute. The points are spread almost like a cloud. If we had calculated the r value, it would have come out close to 0.
- fineagg vs other independent attributes: It has a negative linear relationship with water. It does not have any significant relationship with any other attribute; the points are spread almost like a cloud. If we had calculated the r value, it would have come out close to 0.


#### strength attribute : Relationship between dependent and independent attributes
strength: Here the target column is compared with all the independent attributes, which gives us vital information.
- strength vs cement: Strength is linearly related to cement. The relationship is positive, but for a given value of cement we see multiple values of strength, and we do not know which one to pick. Hence, although cement has a positive relationship with strength, it is not a very good predictor. It is a weak predictor.
- strength vs slag: There is no particular trend.
- strength vs ash: There is also no particular trend.
- strength vs age: For a given value of age, we have different values of strength. Hence, it is not a good predictor.
- strength vs superplastic: For a given value of superplastic, we have different values of strength. Hence, it is not a good predictor.
- The other attributes do not show any strong relationship with strength.

Hence, we can see that none of the independent attributes are a good predictors of the strength attribute. There is a no linear relationship between them.

So, we will not use Linear model

In [None]:
# correlation matrix 
cor=df.corr()
cor

* Here, we can see the correlation value between the attributes.

In [None]:
# Annotated heat map of the correlation matrix `cor`.
sns.set(font_scale=1.15)
plt.figure(figsize=(14, 10))

heatmap_options = dict(
    vmax=.8,
    square=True,
    annot=True,
    cmap="BuPu",
    linewidths=0.01,
    linecolor="black",
)
sns.heatmap(cor, **heatmap_options)
plt.title('Correlation between features');

* It is also giving the same information we observed in pairplot analysis. 
* water shows a significant negative relationship with superplastic and fineagg. It also shows some positive relationship with slag and age.


In [None]:
# Regression plot for each unordered pair of columns: every column is paired
# with the columns that come before it, so no pair is drawn twice.
column_names = list(df.columns)
for position, x_col in enumerate(column_names):
    for y_col in column_names[:position]:
        sns.lmplot(x=x_col, y=y_col, data=df)
        plt.show()

## 1.3. Strategies to handle different data challenges

## Checking for Missing Values

In [None]:
#Checking for missing values
df.isnull().sum()

* We can see that there are no missing values.

## Checking for outliers

In [None]:
#Creating copy of original dataset
df1=df.copy()

In [None]:
# Box plot of every column of the working copy df1, to see which attributes
# have points beyond the whiskers before any outlier treatment is applied.
df1.boxplot(figsize=(35,15))

* The box plot also shows that slag, ash, water, superplastic and age contain outliers.

In [None]:
#Number of outliers present in the dataset
# A value is counted when it lies more than 3 standard deviations from its column mean.
for feature in ('cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age'):
    series = df1[feature]
    z_scores = (series - series.mean()) / series.std()
    print('Number of outliers in ' + feature + ': ', series[z_scores.abs() > 3].count())

* Here, we have used the standard-deviation method to detect outliers: a data point lying more than 3 standard deviations from the mean is very likely to be an outlier.
* We can see that slag, water, superplastic and age contain outliers.

In [None]:
# slag values lying more than 3 standard deviations from the slag mean
slag_z = (df1.slag - df1.slag.mean()) / df1.slag.std()
print('Records containing outliers in slag: \n', df1.loc[slag_z.abs() > 3, 'slag'])

In [None]:
# water values lying more than 3 standard deviations from the water mean
water_z = (df1.water - df1.water.mean()) / df1.water.std()
print('Records containing outliers in water: \n', df1.loc[water_z.abs() > 3, 'water'])

In [None]:
# superplastic values lying more than 3 standard deviations from the superplastic mean
superplastic_z = (df1.superplastic - df1.superplastic.mean()) / df1.superplastic.std()
print('Records containing outliers in superplastic: \n', df1.loc[superplastic_z.abs() > 3, 'superplastic'])

In [None]:
# age values lying more than 3 standard deviations from the age mean
age_z = (df1.age - df1.age.mean()) / df1.age.std()
print('Records containing outliers in age: \n', df1.loc[age_z.abs() > 3, 'age'])

## Handling the outliers

In [None]:
# Overwrite IQR-fence outliers with the column median, for every column except the last.
for col_name in df1.columns[:-1]:
    column = df1[col_name]
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    iqr = q3 - q1
    outside_fences = (column < q1 - 1.5 * iqr) | (column > q3 + 1.5 * iqr)
    # The median is taken from the column as it stands before this overwrite.
    df1.loc[outside_fences, col_name] = column.median()

In [None]:
# again check for outliers in dataset using boxplot
df1.boxplot(figsize=(35,15))

In [None]:
#Number of outliers present in the dataset
# Same 3-standard-deviation count as before, re-run on df1 after the median replacement.
for feature in ('cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age'):
    series = df1[feature]
    z_scores = (series - series.mean()) / series.std()
    print('Number of outliers in ' + feature + ': ', series[z_scores.abs() > 3].count())

# 2. Feature Engineering

### Scaling the features

In [None]:
# Standardise every column of df1 with scipy's zscore and keep the original column order.
df_z = pd.DataFrame(df1.apply(zscore), columns=df.columns)

* Here, all the attributes are on the same scale (unit) except the age attribute. Hence, we scale the attributes, using the z-score.

## Splitting the data into independent and dependent attributes

In [None]:
# First eight columns are the predictors, the ninth column is the target.
X, y = df_z.iloc[:, :8], df_z.iloc[:, 8]

## Splitting the data into training and test sets

In [None]:
# 70/30 train/test split with a fixed seed so the partition is repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
X_train.shape,X_test.shape

# 3. Model Building
# 4. Model Tuning

## DecisionTree Regression

In [None]:
# Unpruned regression tree on the training split. The seed is fixed (same value
# as the pruned tree later in the notebook) so feature importances and scores
# are identical on every Restart & Run All.
model = DecisionTreeRegressor(random_state=1)
model.fit(X_train , y_train)

In [None]:
# Feature importances of the fitted tree, one row per training column.
importance_table = pd.DataFrame({'Imp': model.feature_importances_}, index=X_train.columns)
print('Feature importances: \n', importance_table)

* So, cement, age and water are the significant attributes.
* Here, ash, coarseagg, fineagg, superplastic and slag are the less significant variables. They have less impact on the strength column, as we also saw in the pairplot.

In [None]:
# Evaluate the tree on both splits. The value stored as "accuracy" is the R^2 from r2_score.
y_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
acc_DT = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ('Performance on training data using DT:', train_score),
    ('Performance on testing data using DT:', test_score),
    ('Accuracy DT: ', acc_DT),
    ('MSE: ', test_mse),
):
    print(label, value)

* The model is overfitting: it scores about 99% on the training data, but the score drops on the test data.

In [None]:
# Start the model-comparison table with the decision tree's test score.
results = pd.DataFrame({'Method': ['Decision Tree'], 'accuracy': [acc_DT]}, index=['1'])[['Method', 'accuracy']]
results

In [None]:
# 10-fold cross-validation of `model` on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(model,X, y, cv=kfold)
# Average the signed scores: a regressor's default score (R^2) is negative for
# a bad fold, and abs() would have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated decision-tree score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Decision Tree k fold'], 'accuracy': [accuracy]}, index=['2'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### Iteration 2

### Drop the least significant variable

In [None]:
df_z.info()

In [None]:
#Create a copy of the dataset
df2=df_z.copy()

In [None]:
# Predictors: everything except the target and the three dropped attributes.
columns_to_drop = ['strength', 'ash', 'coarseagg', 'fineagg']
X = df2.drop(columns=columns_to_drop)
y = df2['strength']
# 70/30 split, same seed as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Unpruned regression tree on the reduced feature set; seeded so the fit is
# reproducible across notebook runs.
dt_model = DecisionTreeRegressor(random_state=1)
dt_model.fit(X_train , y_train)

In [None]:
# Feature importances of the reduced-feature tree, one row per training column.
importance_table = pd.DataFrame({'Imp': dt_model.feature_importances_}, index=X_train.columns)
print('Feature importances: \n', importance_table)

In [None]:
# Evaluate the reduced-feature tree; acc_DT is overwritten with its test R^2.
y_pred = dt_model.predict(X_test)
train_score = dt_model.score(X_train, y_train)
test_score = dt_model.score(X_test, y_test)
acc_DT = r2_score(y_test, y_pred)
print('Performance on training data using DT:', train_score)
print('Performance on testing data using DT:', test_score)
print('Accuracy DT: ', acc_DT)

In [None]:
# Append the reduced-feature decision-tree score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Decision Tree2'], 'accuracy': [acc_DT]}, index=['3'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

* The accuracy on the testing dataset has not improved; it is still an overfit model.

## calculating value of GINI IMPURITY 

In [None]:
def gini_impurity(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the values in ``y``.

    ``p_k`` is the share of each distinct value among all elements of ``y``.

    Parameters
    ----------
    y : array-like
        Labels/classes. Any shape is accepted; the elements are counted over
        the flattened input (this is what ``np.unique`` does).

    Returns
    -------
    float
        0 when all elements are identical, approaching 1 as the values become
        more evenly spread over many distinct values.
    """
    _, counts = np.unique(y, return_counts=True)
    # Normalise by the number of counted elements. Using y.shape[0] was only
    # correct for 1-D input: for a 2-D array/DataFrame the counts cover every
    # cell, so the "proportions" summed to more than 1 and the result could
    # even become negative.
    proportions = counts / counts.sum()
    return 1 - np.sum(proportions ** 2)


In [None]:
# NOTE(review): this passes the feature matrix X_train rather than a vector of
# class labels. gini_impurity counts distinct values, so on continuous, scaled
# features the number is presumably not a meaningful node impurity -- TODO confirm intent.
gini_impurity(X_train)

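* A small Gini impurity means the values are almost all alike (a nearly pure, well-ordered set); a value close to 1 means the data is highly mixed (unordered).
  A high value here would therefore point to poorly ordered data as the problem with this model.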

### K fold cross validation

## Pruning of Decision Tree

In [None]:
# Back to all eight predictors (first eight columns); the ninth column is the target.
X, y = df_z.iloc[:, :8], df_z.iloc[:, 8]
# 70/30 split, same seed as before.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Regularised regression tree: depth capped at 4 and at least 5 samples per leaf.
reg_dt_model = DecisionTreeRegressor(random_state=1, max_depth=4, min_samples_leaf=5)
reg_dt_model.fit(X_train, y_train)

In [None]:
# Feature importances of the pruned tree, one row per training column.
importance_table = pd.DataFrame({"Imp": reg_dt_model.feature_importances_}, index=X_train.columns)
print(importance_table)

* Here, we can see that ash, coarseagg and fineagg are the least significant variables.

### K fold cross validation

In [None]:
# 10-fold cross-validation of the pruned tree on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(reg_dt_model,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated pruned-tree score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Pruned Decision Tree k fold'], 'accuracy': [accuracy]}, index=['5'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### Iteration2

In [None]:
#Create a copy of the dataset
df3=df_z.copy()

In [None]:
# Predictors: everything except the target and the three dropped attributes.
columns_to_drop = ['strength', 'ash', 'coarseagg', 'fineagg']
X = df3.drop(columns=columns_to_drop)
y = df3['strength']
# 70/30 split, same seed as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Regularised regression tree on the reduced feature set (depth <= 4, >= 5 samples per leaf).
reg_dt_model = DecisionTreeRegressor(random_state=1, max_depth=4, min_samples_leaf=5)
reg_dt_model.fit(X_train, y_train)

In [None]:
# Evaluate the pruned tree on both splits; acc_RDT holds its test R^2.
y_pred = reg_dt_model.predict(X_test)
train_score = reg_dt_model.score(X_train, y_train)
test_score = reg_dt_model.score(X_test, y_test)
acc_RDT = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ('Performance on training data using DT:', train_score),
    ('Performance on testing data using DT:', test_score),
    ('Accuracy DT: ', acc_RDT),
    ('MSE: ', test_mse),
):
    print(label, value)

In [None]:
# Append the reduced-feature pruned-tree score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Pruned Decision Tree2'], 'accuracy': [acc_RDT]}, index=['6'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

## Random Forest Regressor

In [None]:
# Random forest with default hyper-parameters. Seeded because the forest is
# built from random bootstrap samples, so unseeded runs give different scores.
model=RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on both splits; acc_RFR holds its test R^2.
y_pred = model.predict(X_test)
# performance on train data
print('Performance on training data using RFR:',model.score(X_train,y_train))
# performance on test data
print('Performance on testing data using RFR:',model.score(X_test,y_test))
#Evaluate the model using R^2 on the test predictions
acc_RFR=metrics.r2_score(y_test, y_pred)
# The label used to read "Accuracy DT" (copied from the decision-tree cell).
print('Accuracy RFR: ',acc_RFR)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))

* This model is also overfit.

In [None]:
# Append the random-forest test score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Random Forest Regressor'], 'accuracy': [acc_RFR]}, index=['7'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### K fold cross validation

In [None]:
# 10-fold cross-validation of `model` (the random forest at this point) on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(model,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated random-forest score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Random Forest Regressor k fold'], 'accuracy': [accuracy]}, index=['8'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

## Gradient Boosting Regressor

In [None]:
# Gradient boosting with default hyper-parameters; seeded so repeated notebook
# runs give the same fitted model and scores.
model=GradientBoostingRegressor(random_state=1)
model.fit(X_train, y_train)

In [None]:
# Evaluate gradient boosting on both splits; acc_GBR holds its test R^2.
y_pred = model.predict(X_test)
# performance on train data
print('Performance on training data using GBR:',model.score(X_train,y_train))
# performance on test data
print('Performance on testing data using GBR:',model.score(X_test,y_test))
#Evaluate the model using R^2 on the test predictions
acc_GBR=metrics.r2_score(y_test, y_pred)
# The label used to read "Accuracy DT" (copied from the decision-tree cell).
print('Accuracy GBR: ',acc_GBR)
print('MSE: ',metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Append the gradient-boosting test score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Gradient Boost Regressor'], 'accuracy': [acc_GBR]}, index=['9'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### K fold cross validation

In [None]:
# 10-fold cross-validation of `model` (gradient boosting at this point) on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(model,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated gradient-boosting score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Gradient Boost Regressor k fold'], 'accuracy': [accuracy]}, index=['10'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

## KNN Regressor

In [None]:
# Test-set error of a KNN regressor for k = 1..29, used to choose k.
error=[]
for i in range(1,30):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Mean absolute error. The previous `pred_i != y_test` is a classification
    # error rate: continuous predictions almost never equal the target exactly,
    # so it was close to 1 for every k and could not rank the candidates.
    error.append(np.mean(np.abs(pred_i - np.asarray(y_test))))

In [None]:
# Plot the error collected for each candidate k (1..29) against k.
k_values = range(1, 30)
plt.figure(figsize=(12, 6))
plt.plot(
    k_values,
    error,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean error')

In [None]:
# KNN regressor with the chosen neighbourhood size (k = 3).
chosen_k = 3
model = KNeighborsRegressor(n_neighbors=chosen_k)
model.fit(X_train, y_train)

In [None]:
# Evaluate the KNN regressor on both splits; acc_K holds its test R^2.
y_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
acc_K = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ('Performance on training data using KNNR:', train_score),
    ('Performance on testing data using KNNR:', test_score),
    ('Accuracy KNNR: ', acc_K),
    ('MSE: ', test_mse),
):
    print(label, value)

In [None]:
# Append the KNN test score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['KNN Regressor'], 'accuracy': [acc_K]}, index=['15'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### K fold cross validation

In [None]:
# 10-fold cross-validation of `model` (the KNN regressor at this point) on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(model,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated KNN score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['KNN Regressor k fold'], 'accuracy': [accuracy]}, index=['16'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

## Support Vector Regressor

In [None]:
# Support vector regressor with a linear kernel, fitted on the training split.
model = SVR(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Evaluate the SVR on both splits; acc_S holds its test R^2.
y_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
acc_S = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ('Performance on training data using SVR:', train_score),
    ('Performance on testing data using SVR:', test_score),
    ('Accuracy SVR: ', acc_S),
    ('MSE: ', test_mse),
):
    print(label, value)

In [None]:
# Append the SVR test score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Support Vector Regressor'], 'accuracy': [acc_S]}, index=['17'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### K fold cross validation

In [None]:
# 10-fold cross-validation of `model` (the SVR at this point) on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(model,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated SVR score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['SVR k fold'], 'accuracy': [accuracy]}, index=['18'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

## Ensemeble KNN Regressor, SVR, LR

In [None]:
#Multiple model Ensemble
# Base estimators for the voting ensemble. SVR is already imported in the
# notebook's import cell, so no extra `from sklearn import svm` is needed here.
LR=LinearRegression()
KN=KNeighborsRegressor(n_neighbors=3)
SVM=SVR(kernel='linear')

In [None]:
# Voting ensemble over the linear regression, KNN and SVR base estimators.
ensemble_members = [('LR', LR), ('KN', KN), ('SVM', SVM)]
evc = VotingRegressor(estimators=ensemble_members)
evc.fit(X_train, y_train)

In [None]:
# Evaluate the voting ensemble on both splits; acc_E holds its test R^2.
y_pred = evc.predict(X_test)
train_score = evc.score(X_train, y_train)
test_score = evc.score(X_test, y_test)
acc_E = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ('Performance on training data using ensemble:', train_score),
    ('Performance on testing data using ensemble:', test_score),
    ('Accuracy ensemble: ', acc_E),
    ('MSE: ', test_mse),
):
    print(label, value)

In [None]:
# Append the ensemble test score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Ensemble'], 'accuracy': [acc_E]}, index=['19'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

### K fold cross validation

In [None]:
# 10-fold cross-validation of the voting ensemble on the current X / y.
num_folds = 10
seed = 77
# Shuffle with the (previously unused) seed so the folds are reproducible and
# do not depend on the row order of the data.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results1 = cross_val_score(evc,X, y, cv=kfold)
# Average the signed scores: R^2 is negative for a bad fold, and abs() would
# have counted such a fold as a good one.
accuracy=np.mean(results1)
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())

In [None]:
# Append the cross-validated ensemble score to the comparison table.
tempResultsDf = pd.DataFrame({'Method': ['Ensemble k fold'], 'accuracy': [accuracy]}, index=['20'])
results = pd.concat([results, tempResultsDf])[['Method', 'accuracy']]
results

* After applying all the models, we can see that Random Forest Regressor, Random Forest Regressor k fold, Gradient Boost Regressor and Gradient Boost Regressor k fold give better results than the other models.
* Since the dataset contains several Gaussians, we could apply k-means clustering first, then fit the models and compare the accuracy.

## Bootstrap Sampling

In [None]:
concrete_XY = X.join(y)

### Using Gradient Boosting Regressor

In [None]:
# Bootstrap evaluation of gradient boosting: fit on a resample drawn with
# replacement, score on the rows that are not part of that resample.
values = concrete_XY.values
all_rows = [tuple(row) for row in values.tolist()]   # hashable copy of each row, built once
# Number of bootstrap samples to create
n_iterations = 1000
# size of a bootstrap sample (same size as the dataset)
n_size = len(values)

# run bootstrap
# List holding the score of each bootstrap iteration. NOTE: the name `stats`
# is kept because the confidence-interval cell reads it; it rebinds the
# `scipy.stats` module name imported at the top of the notebook.
stats = list()
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size)  # Sampling with replacement
    # Rows whose values do not occur anywhere in the resample. A set lookup
    # replaces the former list-in-list scan, which rebuilt train.tolist() for
    # every row and was quadratic in the number of rows per iteration.
    train_rows = set(map(tuple, train.tolist()))
    held_out = np.array([row not in train_rows for row in all_rows], dtype=bool)
    test = values[held_out]

    # fit against independent variables (all but last column) and the target (last column)
    gbmTree = GradientBoostingRegressor(n_estimators=50)
    gbmTree.fit(train[:,:-1], train[:,-1])

    # score on the held-out rows; a local name is used so the train/test
    # split's y_test is no longer overwritten by this loop
    held_out_target = test[:,-1]
    score = gbmTree.score(test[:, :-1] , held_out_target)

    stats.append(score)

# Keep a separate copy: the random-forest bootstrap cell reuses `stats`.
gbr_scores = list(stats)

### Using Random Forest Regressor

In [None]:
# Bootstrap evaluation of the random forest: fit on a resample drawn with
# replacement, score on the rows that are not part of that resample.
values = concrete_XY.values
all_rows = [tuple(row) for row in values.tolist()]   # hashable copy of each row, built once
# Number of bootstrap samples to create
n_iterations = 1000
# size of a bootstrap sample (same size as the dataset)
n_size = len(values)

# run bootstrap
# List holding the score of each bootstrap iteration. NOTE: the name `stats`
# is kept because the confidence-interval cell reads it; it rebinds the
# `scipy.stats` module name imported at the top of the notebook.
stats = list()
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size)  # Sampling with replacement
    # Rows whose values do not occur anywhere in the resample. A set lookup
    # replaces the former list-in-list scan, which rebuilt train.tolist() for
    # every row and was quadratic in the number of rows per iteration.
    train_rows = set(map(tuple, train.tolist()))
    held_out = np.array([row not in train_rows for row in all_rows], dtype=bool)
    test = values[held_out]

    # fit against independent variables (all but last column) and the target (last column)
    rfTree = RandomForestRegressor(n_estimators=100)
    rfTree.fit(train[:,:-1], train[:,-1])

    # score on the held-out rows; a local name is used so the train/test
    # split's y_test is no longer overwritten by this loop
    held_out_target = test[:,-1]
    score = rfTree.score(test[:, :-1] , held_out_target)

    stats.append(score)

# Named copy of the random-forest scores, alongside the generic `stats` list.
rf_scores = list(stats)

# 5. Model performance range at 95% confidence level 

In [None]:
# Distribution and 95% percentile interval of the bootstrap scores in `stats`
# (whichever bootstrap cell ran last filled that list).

# plot scores -- uses the notebook-wide `plt` instead of a second, in-cell pyplot import
plt.hist(stats)
plt.xlabel('Bootstrap score')
plt.ylabel('Number of bootstrap samples')
plt.title('Distribution of bootstrap scores')
plt.show()
# confidence intervals
alpha = 0.95                             # for 95% confidence
p = ((1.0-alpha)/2.0) * 100              # lower percentile: 2.5% of the scores lie in each tail
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100      # upper percentile: 97.5
upper = min(1.0, np.percentile(stats, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

The bootstrap random forest regression model's performance lies between 84.6% and 90.5% at the 95% confidence level, which is better than the other regression algorithms tried above.