### Analytics Vidhya: Practice Problem (Approach)

In [3]:
import os
import re
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv('../flask_api/models/data/heat/training.csv')

In [5]:
data.head()

Unnamed: 0,Temperature,Value,Quality
0,-1.0,4.220657,0
1,-1.0,9.330304,0
2,-1.0,10.069158,1
3,-1.0,9.5798,0
4,-1.0,7.104988,0


In [6]:
print("Shape of the data is:{}".format(data.shape))

Shape of the data is:(633, 3)


In [7]:
print("List of columns is: {}".format(list(data.columns)))

List of columns is: ['Temperature', 'Value', 'Quality']


We'll check out the values (labels) for the columns having missing values:

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom Pre-Processing estimator for our use-case
    """

    def __init__(self):
        pass

    def transform(self, df):
        
        return df[['Temperature', 'Value']].as_matrix()

    def fit(self, df, y=None, **fit_params):
        """Fitting the Training dataset & calculating the required values from train
           e.g: We will need the mean of X_train['Loan_Amount_Term'] that will be used in
                transformation of X_test
        """
        
#         self.term_mean_ = df['Loan_Amount_Term'].mean()
#         self.amt_mean_ = df['LoanAmount'].mean()
        return self

To make sure that this works, let's do a test run for it:

In [9]:
preprocess = PreProcessing()

In [10]:
preprocess

PreProcessing()

In [11]:
preprocess.fit(data)

PreProcessing()

In [12]:
X_train_transformed = preprocess.transform(data)

In [13]:
X_train_transformed

array([[ -1.        ,   4.22065667],
       [ -1.        ,   9.330304  ],
       [ -1.        ,  10.069158  ],
       ..., 
       [ -8.        ,   0.04692   ],
       [ -8.        ,   0.02131   ],
       [ -9.        ,   0.09675   ]])

So our small experiment to write a custom `estimator` worked. This would be helpful further.

In [14]:
param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
             "randomforestclassifier__max_depth" : [None, 6, 8, 10],
             "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
             "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

pipe = make_pipeline(PreProcessing(),
                    SVC(gamma='auto'))

In [25]:
pipe

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [39]:
clf = pipe.fit(data, data.Quality.as_matrix())

In [42]:
clf.predict(data.tail(40))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [43]:
data.tail(40)

Unnamed: 0,Temperature,Value,Quality
593,5.0,3.389274,0
594,5.0,3.376544,0
595,5.0,3.446004,0
596,5.0,3.473496,0
597,5.0,3.475176,0
598,5.0,3.479912,0
599,5.0,3.425502,0
600,5.0,3.455494,0
601,5.0,3.40173,0
602,5.0,3.41547,0


In [27]:
rds = [
    {
        'id': 'a',
        'Temperature': -1,
        'Value': 2000000
    },
    {
        'id': 'b',
        'Temperature': -1,
        'Value': 7
    },
]

In [28]:
ppp = pd.DataFrame(data=rds)

In [38]:
clf = SVC(gamma='scale')
clf.fit(X=data[['Temperature', 'Value']].as_matrix(), y=data['Quality'].as_matrix())

TypeError: a float is required

In [37]:
clf.predict([[-1,20000]])

array([0])

In [29]:
clf.predict(ppp)

array([0, 0])

In [30]:
clf.predict(data.head())

array([0, 0, 0, 0, 0])

In [31]:
data.Quality.as_matrix()

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,