In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Abalone.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
df2 = df.copy()

##### 1. Convert the continuous output value from continuous to binary (0,1) and build an SVC

In [4]:
#make the target varibales to be binary case
mask = df['Rings'] <= 11
df['Rings'] = np.where(mask, 0, 1)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0


In [5]:
#encode categorical data Sex
from category_encoders import OneHotEncoder
ohe = OneHotEncoder()
X_encoded = ohe.fit_transform(df.iloc[:,0:7])
X_encoded.head()

Unnamed: 0,Sex_1,Sex_2,Sex_3,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight
0,1,0,0,0.455,0.365,0.095,0.514,0.2245,0.101
1,1,0,0,0.35,0.265,0.09,0.2255,0.0995,0.0485
2,0,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415
3,1,0,0,0.44,0.365,0.125,0.516,0.2155,0.114
4,0,0,1,0.33,0.255,0.08,0.205,0.0895,0.0395


In [6]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(df['Rings'])))

Original dataset shape Counter({0: 3217, 1: 960})


As stated in the problem, other continuous values have been scaled, and ready to be used in SVM. However, it's an unbalanced data set, we need to use SMOTE in trainning data.  

In [7]:
from sklearn.model_selection import train_test_split
y = df['Rings']
X_train, X_test, y_train,y_test = train_test_split(X_encoded, y)

In [8]:
from imblearn.over_sampling import SMOTE

print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_sample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_train)))

Original dataset shape Counter({0: 3217, 1: 960})
Resampled dataset shape Counter({0: 2404, 1: 2404})


##### 2. Using your best guess for hyperparameters and kernel, what is the percentage of correctly classified results?

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
C= 0.5
gamma = 0.5
svc = SVC(C=C, gamma = gamma).fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
svc_sco = accuracy_score(y_test, y_pred_svc) *100
print('Accuracy percentage : %.2f'%svc_sco)

Accuracy percentage : 69.00


My guess for C is 0.5 and gamma is 0.5, using this parameters, there are around 68% correctly classified labels.

##### 3. Test different kernels and hyperparameters or consider using sklearn.model_selection.SearchGridCV. Which kernel performed best with what settings?

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[0.5,1,2], 'gamma':[0.5,1,2]}
grid_cv = GridSearchCV(SVC(), parameters).fit(X_train, y_train)
best_model = grid_cv.best_params_
print('The best model is:', best_model)


The best model is: {'C': 2, 'gamma': 1, 'kernel': 'rbf'}


##### 4.Show recall, precision and f-measure for the best model

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score
svc_best = SVC(C=2, gamma=2, kernel='rbf').fit(X_train, y_train)
y_pred_best = svc_best.predict(X_test)
precision = precision_score(y_test, y_pred_best)
recall = recall_score(y_test, y_pred_best)
f = f1_score(y_test, y_pred_best)
print('Best score :%.2f' %grid_cv.best_score_)
print('Precision score: %.2f' %precision)
print('Recall score :%.2f'%recall)
print('F1 score :%.2f'%f)

Best score :0.79
Precision score: 0.50
Recall score :0.78
F1 score :0.61


Using the best parameters from GridSearchCV, we see that the score has improved. However, the precision is still very low, suggesting that the model is more likley to classify young abalone to old abalone.

##### 5.Using the original data, with rings as a continuous variable, create an SVR model

In [12]:
#the original data set
df2.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [13]:
#encode the categorical variable data
X_encoded = ohe.fit_transform(df2.iloc[:,0:7])
X_encoded.head()

Unnamed: 0,Sex_1,Sex_2,Sex_3,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight
0,1,0,0,0.455,0.365,0.095,0.514,0.2245,0.101
1,1,0,0,0.35,0.265,0.09,0.2255,0.0995,0.0485
2,0,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415
3,1,0,0,0.44,0.365,0.125,0.516,0.2155,0.114
4,0,0,1,0.33,0.255,0.08,0.205,0.0895,0.0395


In [14]:
# define the target variable
y = df2['Rings']

In [15]:
from sklearn.svm import SVR
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y)

parameters = {'kernel':['linear'], 'C':[0.5,1,2], 'gamma':[0.5,1,2]}
svr = GridSearchCV(SVR(), parameters).fit(X_train, y_train)
print('Best model:', svr.best_params_)

Best model: {'C': 2, 'gamma': 0.5, 'kernel': 'linear'}


The best SVR model from GridSearchCV is C=2, gamma=2, kernel='poly', we used this model as the best model for following analysis.

##### 6. Report on the predicted variance and the mean squared error

In [16]:
from sklearn.metrics import mean_squared_error, explained_variance_score
best_svr = SVR(C=2, gamma=0.5, kernel='linear').fit(X_train, y_train)
y_pred_svr = best_svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred_svr)
print('Mean squared error :%.2f' %mse)

Mean squared error :5.08


In [17]:
#this only works for linear regression probelm 
def get_prediction_interval(prediction, y_test, test_predictions, pi=.95):
    '''
    Get a prediction interval for a linear regression.
    
    INPUTS: 
        - Single prediction, 
        - y_test
        - All test set predictions,
        - Prediction interval threshold (default = .95) 
    OUTPUT: 
        - Prediction interval for single prediction
    '''
    from scipy import stats
    #get standard deviation of y_test
    sum_errs = np.sum((y_test - test_predictions)**2)
    stdev = np.sqrt(1 / (len(y_test) - 10) * sum_errs)
    #get interval from standard deviation
    one_minus_pi = 1 - pi
    ppf_lookup = 1 - (one_minus_pi / 2)
    z_score = stats.norm.ppf(ppf_lookup)
    interval = z_score * stdev
    #generate prediction interval lower and upper bound
    lower, upper = prediction - interval, prediction + interval
    return ['%.2f'%lower,  '%.2f' %upper, '%.2f'%prediction]

In [18]:
#get prediction interval for the first predicted value
interv= get_prediction_interval(y_pred_svr[0], y_test, y_pred_svr)
print('The prediction interval for', interv[1], 'is from ', interv[0], 'to',interv[2])

The prediction interval for 18.86 is from  9.98 to 14.42


A prediction is a single outcome value given some input variables.

A point prediction is an estimate and contains some uncertainty. The uncertainty comes from the errors in the model itself and noise in the input data. The model is an approximation of the relationship between the input variables and the output variables.

Prediction interval is a quantification of the unertainty on a prediction. It provides a probabilistic upper and lower bounds on the estimate of an outcome variable. (A confidence interval quantifies the uncertainty on an estimated population variable, such as the mean or standard deviation. Whereas a prediction interval quantifies the uncertainty on a single observation estimated from the population.)

Here, I used the first predicted value as an example to calculate the predecition interval for it.

Credit: https://machinelearningmastery.com/prediction-intervals-for-machine-learning/

https://towardsdatascience.com/prediction-intervals-in-linear-regression-2ea14d419981