In [1]:
#consider the class Blood-Donor and Non-Blood-Donor to be predicted.

#(c) Optimize and tune the pipeline: use GridSearchCV to search:
#• the best feature scaling technique (try MinMaxScaler() and StandardScaler())
#• the best parameter(s) for feature selection technique (or try different techniques)
#• the best classifier parameter(s) (you may also try different classifiers).

#read the data
import pandas as pd
import numpy as np

#read the data
df = pd.read_csv('hcvdat0.csv')

#drop string data
df = df.drop(['Unnamed: 0'], axis=1)

#change categorical data to float data
#consider the class Blood-Donor and Non-Blood-Donor

df['Category'] = df['Category'].map({'0=Blood Donor': 1, '0s=suspect Blood Donor': 0, '1=Hepatitis': 0, '2=Fibrosis': 0,'3=Cirrhosis': 0})

#change sex data to float data
df['Sex']= df['Sex'].map({'m':0, 'f':1})

#fix missing data
#replace missing data with mean
df = df.fillna(df.mean())


#Setup a machine learning pipeline
#1. Feature Scaling
#2. Feature Selection
#3. Classification

#1. Feature Scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#2. Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#3. Classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#4. Pipeline
from sklearn.pipeline import Pipeline

#5. GridSearchCV
from sklearn.model_selection import GridSearchCV


#split the data
from sklearn.model_selection import train_test_split
X = df.drop(['Category'], axis=1)
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#parameters for GridSearchCV
parameters = [
    {
        'scaler': [MinMaxScaler(), StandardScaler()],
        #use f_classif scoring function	
        'feature_selection': [SelectKBest(f_classif)],
        'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'classifier': [LogisticRegression()],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    },
]

#pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(f_classif)),
    ('classifier', LogisticRegression())
])


#GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=5)
grid_search.fit(X_train, y_train)

#best parameters
print ("Best feature scaling technique: ", grid_search.best_estimator_.get_params()['scaler'])
print ("Best parameter(s) for feature selection technique: ", grid_search.best_estimator_.get_params()['feature_selection__k'])
print ("Best classifier parameter(s): ", grid_search.best_estimator_.get_params()['classifier__C'])



























Fitting 5 folds for each of 140 candidates, totalling 700 fits
Best feature scaling technique:  StandardScaler()
Best parameter(s) for feature selection technique:  1
Best classifier parameter(s):  100
