In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split  # hold-out split into train and test sets
from sklearn.model_selection import StratifiedKFold  # CV splitter handed to cross_val_score
from sklearn.model_selection import cross_val_score  # cross-validated scoring inside the tuning objectives
from sklearn.impute import SimpleImputer  # fills in missing (NaN) values

from sklearn.svm import SVC  # support vector classifier
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.ensemble import RandomForestClassifier  # random forest

from statistics import mean  # averages the per-fold CV scores
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval  # hyperparameter search: space definition, minimisation, result decoding

Data Reading and Preprocessing

In [None]:
# pd.read_csv() requires a path (or buffer); calling it with no argument raises TypeError.
DATA_PATH = 'data.csv'  # TODO: point this at the actual dataset file
df = pd.read_csv(DATA_PATH)
# TODO: one-hot encode categorical columns here -- the mean imputer below only handles numeric data.

# Rows with a missing label are dropped rather than imputed: averaging class labels
# would produce values that are not valid classes.
df = df.dropna(subset = ['target'])

# Split BEFORE imputing so the imputation statistics are learned from the training rows only
# (fitting the imputer on the full frame leaks test-set information into training).
# random_state is fixed so the split is reproducible, matching the seeded CV splitters used later.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    df.drop('target', axis = 1), df['target'], random_state = 0
)

imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')  # 'median' or 'most_frequent' are alternatives
# Fit on the training features, then apply the same fitted statistics to the test features.
# The results are wrapped back into DataFrames so column names and row indices are preserved.
x_train = pd.DataFrame(imp.fit_transform(x_train_raw), columns = x_train_raw.columns, index = x_train_raw.index)
x_test = pd.DataFrame(imp.transform(x_test_raw), columns = x_test_raw.columns, index = x_test_raw.index)

SVM 

In [None]:
# Search space for the SVC tuning run: every hyperparameter is a categorical
# choice over a fixed list of candidate values (gamma also allows 'scale').
space = {
    'C': hp.choice('C', [0.5, 1, 10, 100]),
    'gamma': hp.choice('gamma', [1, 0.1, 0.01, 0.001, 0.0001, 'scale']),
    'kernel': hp.choice('kernel', ['rbf', 'poly']),
}

# Seeded, shuffled 3-fold stratified splitter used to score each candidate.
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0)

def objective(params):
    """Objective minimised by hyperopt when tuning the SVC.

    Builds an SVC from ``params`` (passed through as keyword arguments), scores
    it by accuracy with ``cross_val_score`` on ``x_train`` / ``y_train`` using
    the notebook-level ``kfold`` splitter, and prints ``params`` for each trial.

    Returns a dict with:
      'loss'   -- the negated mean fold accuracy (so minimising it maximises accuracy),
      'params' -- the parameters that were evaluated,
      'status' -- STATUS_OK.
    """
    # n_jobs = -1 is forwarded to cross_val_score to parallelise the fold evaluation.
    fold_accuracies = cross_val_score(SVC(**params), x_train, y_train, cv = kfold, scoring = 'accuracy', n_jobs = -1)
    print(params)
    return {'loss': -mean(fold_accuracies), 'params': params, 'status': STATUS_OK}
    
# Run the TPE search over `space`, recording every evaluation in a Trials object.
num_trials = Trials()
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 30, trials = num_trials)

# Decode the search result once: space_eval maps `best` back onto concrete values from `space`.
best_svc_params = space_eval(space, best)

# Build the final model exactly the way the objective built its candidates (SVC(**params)),
# then fit on the training split and report accuracy on the held-out test split.
svc = SVC(**best_svc_params)
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

Logistic Regression

In [2]:
# Untuned logistic-regression baseline: fit on the training split, report test accuracy.
# The solver is a per-dataset choice -- change 'liblinear' if another solver suits the data better.
logistic_regression = LogisticRegression(max_iter = 500, solver = 'liblinear').fit(x_train, y_train)
logistic_regression.score(x_test, y_test)

NameError: name 'LogisticRegression' is not defined

Random Forest

In [None]:
# Search space for the random-forest tuning run; every hyperparameter is a
# categorical choice over a fixed set of candidate values.
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 150)),
    'max_depth': hp.choice('max_depth', [1, 5, 10, 20, 50, 75, 100, 150, 200]),
    'min_samples_split': hp.choice('min_samples_split', [2, 3, 4, 5, 10, 20]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    # 'auto' is intentionally not offered: for RandomForestClassifier it was an alias of
    # 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so sampling it makes
    # the trial fail on current versions while adding nothing on older ones.
    'max_features': hp.choice('max_features', ['sqrt', 'log2'])
}

# Seeded, shuffled 5-fold stratified splitter used to score each candidate.
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

def objective(params):
    """Objective minimised by hyperopt when tuning the random forest.

    Builds a RandomForestClassifier from ``params`` (passed through as keyword
    arguments) and scores it by accuracy with ``cross_val_score`` on
    ``x_train`` / ``y_train`` using the notebook-level ``kfold`` splitter.

    Returns a dict with:
      'loss'   -- the negated mean fold accuracy (so minimising it maximises accuracy),
      'params' -- the parameters that were evaluated,
      'status' -- STATUS_OK.
    """
    # n_jobs = -1 is forwarded to cross_val_score to parallelise the fold evaluation.
    fold_accuracies = cross_val_score(RandomForestClassifier(**params), x_train, y_train, cv = kfold, scoring = 'accuracy', n_jobs = -1)
    return {'loss': -mean(fold_accuracies), 'params': params, 'status': STATUS_OK}

# Run the TPE search over `space`, recording every evaluation in a Trials object.
num_trials = Trials()
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 300, trials = num_trials)

# Decode the search result once: space_eval maps `best` back onto concrete values from `space`.
best_rf_params = space_eval(space, best)

# Build the final estimator exactly the way the objective built its candidates
# (RandomForestClassifier(**params)), so the tuned model matches what was evaluated.
randomForest = RandomForestClassifier(**best_rf_params)