# Hyperparameter Optimization for Sklearn Algorithms using GridSearch 

The scripts below perform GridSearch on SVM and RandomForest algorithms respectively with a small set of the original dataset.  
The most optimal parameter set is given in the output. 

In [2]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing.preprocess import Preprocesser

DATASET = "data/sentences_all.csv"

class SVMGridSearch():
    def __init__(self, X, y):
        self.tfidf = TfidfVectorizer(analyzer=(lambda x: x),
                                     tokenizer=(lambda x: x))
        X = X[:200]
        y = y[:200]
        X = self.tfidf.fit_transform(list(X)).toarray()
        self.X = X
        self.y = [label[5] for label in y]

    def train(self):
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=0.3, shuffle=True, stratify=self.y, random_state=42)
            rus = RandomUnderSampler(random_state=42)
            X_train, y_train = rus.fit_resample(X_train, y_train)
            #Create a svm Classifier and hyper parameter tuning
            ml = svm.SVC()

            # defining parameter range
            param_grid = {'C': [1, 10, 100, 1000, 10000],
                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                        'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

            grid = GridSearchCV(ml, param_grid, refit=True, verbose=5, cv=None)

            # fitting the model for grid search
            grid_search = grid.fit(X_train, y_train)
            print(grid_search.best_params_)


# set preprocess parameters
preprocesser = Preprocesser(data_path=DATASET, remove_punct=True, lower_case=True,
                            remove_stop_words=False, stemming=False)

# get preprocessed data
X, y = preprocesser.get_data_features_labels()

model = SVMGridSearch(X, y)
model.train()


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.429 total time=   0.0s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.429 total time=   0.0s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.833 total time=   0.0s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END .......C=1, gamma=1, kernel=linear;, score=0.429 total time=   0.0s
[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.714 total time=   0.0s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.833 total time=   0.0s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.500 total time=   0.0s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.667 total time=   0.0s
[CV 1/5] END .........C=1, gamma=1, kernel=poly;, score=0.429 total time=   0.0s
[CV 2/5] END .........C=1, gamma=1, kernel=pol

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier as RandomForestClf
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocessing.preprocess import Preprocesser

DATASET = "data/sentences_all.csv"

class RandomForestGridSearch():
    def __init__(self, X, y):
        self.tfidf = TfidfVectorizer(analyzer=(lambda x: x),
                                     tokenizer=(lambda x: x))
        X = X[:200]
        y = y[:200]
        X = self.tfidf.fit_transform(list(X)).toarray()
        self.X = X
        self.y = [label[5] for label in y]

    def train(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.3, shuffle=True, stratify=self.y, random_state=42)
        rus = RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)
        ml = RandomForestClf()

        # defining parameter range
        param_grid = { 
                'n_estimators': [100, 200, 500],
                'max_features': ['auto', 'sqrt', 'log2'],
                'max_depth' : [None,4,6,8],
                'criterion' :['gini', 'entropy']
        }

        grid = GridSearchCV(ml, param_grid, refit=True, verbose=5, cv=None)

        # fitting the model for grid search
        grid_search = grid.fit(X_train, y_train)
        print(grid_search.best_params_)


# set preprocess parameters
preprocesser = Preprocesser(data_path=DATASET, remove_punct=True, lower_case=True,
                            remove_stop_words=False, stemming=False)

# get preprocessed data
X, y = preprocesser.get_data_features_labels()

model = RandomForestGridSearch(X, y)
model.train()


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100;, score=0.429 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100;, score=0.714 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100;, score=0.500 total time=   0.4s
[CV 4/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100;, score=0.500 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100;, score=0.667 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=200;, score=0.286 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=200;, score=0.714 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=None, max_features=auto, n_estimators=200;, score=0.667 total time=   0.3s
[CV 4/5] END criterion=gin