In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read the CSV file and check the first five observations

In [3]:
# Load the coal dataset and preview its first five rows
df = pd.read_csv('coalv2.csv')
df.head(n=5)

Unnamed: 0,moistr,volmat,fixedc,ash,hydrgn,carbon,nitrgn,oxygen,sulfur,gcv
0,6.98,33.02,57.42,2.58,5.14,71.8,1.13,18.91,0.44,12312
1,11.15,26.39,58.32,4.14,5.05,66.71,1.4,22.1,0.6,11290
2,2.38,29.88,54.64,13.1,4.66,69.69,1.25,6.07,5.23,12674
3,3.38,28.61,40.39,27.62,4.3,55.02,1.15,6.78,5.13,10130
4,10.67,27.92,54.5,6.91,5.43,70.17,1.54,14.8,1.15,12559


In [4]:
# Keep only the first 5000 rows, then summarise the columns and their dtypes
df = df.head(5000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   moistr  5000 non-null   float64
 1   volmat  5000 non-null   float64
 2   fixedc  5000 non-null   float64
 3   ash     5000 non-null   float64
 4   hydrgn  5000 non-null   float64
 5   carbon  5000 non-null   float64
 6   nitrgn  5000 non-null   float64
 7   oxygen  5000 non-null   float64
 8   sulfur  5000 non-null   float64
 9   gcv     5000 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 390.8 KB


In [5]:
# Split the frame by position: every column except the last is a feature,
# the last column is the label
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [6]:
# Shapes of the feature matrix and the label vector, in that order
[part.shape for part in (X, y)]

[(5000, 9), (5000,)]

___
###  Standardization:
$$x^{'} = \frac{x - \bar{x}}{\sigma}$$

where

- $\bar{x}$ = mean of a given feature
- $\sigma$ = standard deviation of a given feature
___


In [7]:
# Standardization
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from X, then rescale X
# with them so it can be fed to the model
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# The fitted scaler can be reused to transform unseen data before predicting:
#X_test_scaled = scaler.transform(X_test)

In [8]:
# Helper that formats a duration in seconds as hours, minutes and seconds.
# It is used to report how long the models take to run.
def convert(seconds):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds (expected to be non-negative). Fractional
        seconds are truncated by the ``%d`` formatting.

    Returns
    -------
    str
        The duration as ``"h:mm:ss"``. Hours are not wrapped at 24, so a
        25-hour run is reported as ``"25:00:00"`` rather than ``"1:00:00"``.
    """
    # Peel off the seconds first, then split the remaining minutes into hours.
    minutes, seconds = divmod(seconds, 60)
    hour, minutes = divmod(minutes, 60)

    return "%d:%02d:%02d" % (hour, minutes, seconds)

### GridSearchCV (Hyperparameter Tuning)
- Parameters selected for tuning:
    1. C
    2. gamma
    3. kernel

In [9]:
# Load the SVR model from svm, GridSearchCV from model_selection and the
# regression metrics
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import time

# Time the search with a monotonic clock: time.time() follows the system clock,
# which can be adjusted while a multi-minute fit is running and would then
# distort (or even negate) the measured duration.
start = time.perf_counter()
# enter your code below this line

# Exhaustive search over 2 gamma settings x 5 C values x 4 kernels (40
# combinations) with 10-fold cross-validation. Both negated MSE and R^2 are
# recorded for every combination; the combination with the best mean R^2 is
# refit and exposed through best_params_ / best_score_.
# NOTE(review): X_scaled was standardised on the full dataset before the CV
# split, so every validation fold contributed to the scaler's statistics.
# Wrapping StandardScaler and SVR in a Pipeline would avoid that, but it would
# also rename the grid keys (e.g. 'svr__C') and the param_* result columns.
regr = GridSearchCV(SVR(), {
    'gamma' : ['scale', 'auto'],
    'C': [1, 5, 10, 15, 20,],
    'kernel': ['rbf','linear', 'poly', 'sigmoid']
}, cv=10, return_train_score=False, scoring=['neg_mean_squared_error', 'r2'], refit='r2')

# Fit the model
regr.fit(X_scaled, y)

# enter your code above this line
end = time.perf_counter()

# Raw per-combination timings and scores (a dict of arrays)
regr.cv_results_


{'mean_fit_time': array([2.38140121, 1.31229677, 1.42389886, 2.22017622, 1.93991015,
        1.35789051, 1.50719531, 2.56750009, 2.28009381, 1.88149581,
        1.61139445, 1.80939319, 1.91489542, 1.4422487 , 1.3774992 ,
        1.82059553, 1.73469615, 1.95099816, 1.81010044, 1.73428886,
        1.69460292, 1.38539882, 1.36490214, 1.72069921, 1.75099485,
        1.47170005, 1.32979732, 1.88270297, 1.99004741, 1.52799075,
        1.58379755, 2.21429944, 1.99699974, 1.4903018 , 1.51719797,
        1.82259197, 1.74360218, 1.38389816, 1.38699362, 1.72349541]),
 'std_fit_time': array([0.55643637, 0.18012119, 0.11222761, 0.46353101, 0.08668069,
        0.03583796, 0.10850823, 0.48870759, 0.39304528, 0.43703211,
        0.33059442, 0.18098082, 0.19305965, 0.11867576, 0.10009252,
        0.15370575, 0.06584304, 0.34035487, 0.24483629, 0.11481844,
        0.06569023, 0.11232514, 0.17017522, 0.05919281, 0.08238938,
        0.175685  , 0.05223731, 0.18342695, 0.26590363, 0.10477711,
        0.187

In [1]:
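# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())
# On recent scikit-learn versions SCORERS is no longer available; use
# sklearn.metrics.get_scorer_names() to list the valid scoring strings.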

### Total Run Time:

In [10]:
# Report the wall time of the grid search as h:m:s
elapsed = convert(end - start)
print('run_time:', elapsed, 'h:m:s')

run_time: 0:12:49 h:m:s


In [11]:
# Turn the raw cv_results_ dictionary into a readable table, then transpose it
# so that every parameter combination becomes a column.
# NOTE: this rebinds `df`, which until now held the coal dataset.
df = pd.DataFrame(regr.cv_results_)
print(df.shape)
df2 = df.T
df2

(40, 34)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
mean_fit_time,2.381401,1.312297,1.423899,2.220176,1.93991,1.357891,1.507195,2.5675,2.280094,1.881496,...,1.583798,2.214299,1.997,1.490302,1.517198,1.822592,1.743602,1.383898,1.386994,1.723495
std_fit_time,0.556436,0.180121,0.112228,0.463531,0.086681,0.035838,0.108508,0.488708,0.393045,0.437032,...,0.187374,0.108879,0.269216,0.06988,0.103801,0.123568,0.053637,0.03241,0.113284,0.080991
mean_score_time,0.449026,0.092503,0.1027,0.183196,0.4104,0.103507,0.115504,0.1985,0.459104,0.141904,...,0.1213,0.172201,0.381699,0.096297,0.125102,0.142108,0.3497,0.0891,0.102803,0.143103
std_score_time,0.078339,0.010009,0.008338,0.039312,0.056192,0.013076,0.010132,0.044986,0.099175,0.049175,...,0.024871,0.023987,0.045309,0.013132,0.021635,0.007193,0.015639,0.00563,0.00556,0.005703
param_C,1,1,1,1,1,1,1,1,5,5,...,15,15,20,20,20,20,20,20,20,20
param_gamma,scale,scale,scale,scale,auto,auto,auto,auto,scale,scale,...,auto,auto,scale,scale,scale,scale,auto,auto,auto,auto
param_kernel,rbf,linear,poly,sigmoid,rbf,linear,poly,sigmoid,rbf,linear,...,poly,sigmoid,rbf,linear,poly,sigmoid,rbf,linear,poly,sigmoid
params,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}","{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}","{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}","{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}","{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}","{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}","{'C': 1, 'gamma': 'auto', 'kernel': 'poly'}","{'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}","{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}","{'C': 5, 'gamma': 'scale', 'kernel': 'linear'}",...,"{'C': 15, 'gamma': 'auto', 'kernel': 'poly'}","{'C': 15, 'gamma': 'auto', 'kernel': 'sigmoid'}","{'C': 20, 'gamma': 'scale', 'kernel': 'rbf'}","{'C': 20, 'gamma': 'scale', 'kernel': 'linear'}","{'C': 20, 'gamma': 'scale', 'kernel': 'poly'}","{'C': 20, 'gamma': 'scale', 'kernel': 'sigmoid'}","{'C': 20, 'gamma': 'auto', 'kernel': 'rbf'}","{'C': 20, 'gamma': 'auto', 'kernel': 'linear'}","{'C': 20, 'gamma': 'auto', 'kernel': 'poly'}","{'C': 20, 'gamma': 'auto', 'kernel': 'sigmoid'}"
split0_test_neg_mean_squared_error,-2491157.452097,-25734.13889,-1545017.32421,-1213281.308506,-2495396.981304,-25734.13889,-1523246.662029,-1196579.591023,-1058306.437406,-14178.615942,...,-675589.004694,-1510642.799611,-388173.518184,-13756.341915,-671507.370099,-2172102.912639,-394627.977825,-13756.341915,-672647.665241,-2220030.060676
split1_test_neg_mean_squared_error,-1752506.763586,-28126.398502,-1503634.865313,-872820.861289,-1758824.679906,-28126.398502,-1458482.13823,-842900.801752,-466631.461117,-11485.196613,...,-716595.722921,-537706.520725,-145006.275984,-9607.697111,-714154.064524,-767269.349545,-149341.668857,-9607.697111,-711526.606994,-849757.798475


In [12]:
# Collect every parameter combination together with its mean cross-validated
# scores, save that table to disk and print it
param_columns = ['param_C', 'param_gamma', 'param_kernel']
score_columns = ['mean_test_r2', 'mean_test_neg_mean_squared_error']
result = df[param_columns + score_columns]
result.to_csv('performance_result_for_svr_gridsearchCV.csv')
print(result)



   param_C param_gamma param_kernel  mean_test_r2  \
0        1       scale          rbf     -0.030251   
1        1       scale       linear      0.986478   
2        1       scale         poly      0.339700   
3        1       scale      sigmoid      0.488359   
4        1        auto          rbf     -0.025830   
5        1        auto       linear      0.986478   
6        1        auto         poly      0.336841   
7        1        auto      sigmoid      0.482479   
8        5       scale          rbf      0.599963   
9        5       scale       linear      0.992567   
10       5       scale         poly      0.421124   
11       5       scale      sigmoid      0.848384   
12       5        auto          rbf      0.611574   
13       5        auto       linear      0.992567   
14       5        auto         poly      0.412273   
15       5        auto      sigmoid      0.861315   
16      10       scale          rbf      0.780019   
17      10       scale       linear      0.992

In [13]:
# Show the best-performing parameter combination, followed by the score it
# achieved, one per line
print(regr.best_params_, regr.best_score_, sep='\n')

{'C': 15, 'gamma': 'scale', 'kernel': 'linear'}
0.992793249592254


### Prediction

### Saving trained model to a file and reading the same file using pickle module

In [11]:
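# import pickle

# with open('trained_svm_model', 'wb') as f:
#    pickle.dump(regr.best_estimator_, f)

# Only unpickle files you created yourself: pickle.load can run arbitrary code.
# The model was trained on standardized features, so predict on X_scaled.
#with open('trained_svm_model', 'rb') as f:
#    mp = pickle.load(f)
#    mp.predict(X_scaled)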

### Another approach for saving and reading the trained model using joblib module
- It is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators

In [12]:
#from sklearn.externals import joblib  # removed from recent scikit-learn; use the standalone joblib package below
#from joblib import dump, load
#dump(regr.best_estimator_, 'trained_svm_model.joblib')

#clf = load('trained_svm_model.joblib')