# In [1]:
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
from collections import namedtuple
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer

### Pipeline + repeated stratified k-fold cross-validation + voting classification

# In [2]:
# Presumably the hold-out ratio for a train/test split; nothing in the visible
# code reads it -- TODO confirm whether it is still needed.
TEST_SIZE = 0.05
# Record appended once per cross-validation fold: the fold's confusion matrix,
# its accuracy score, and the classifier that produced them.
Result = namedtuple(
    typename='Result',
    field_names=('confusion_matrix', 'accuracy', 'model'),
)


def predict(X, y, models, results=None, save_model=True,
            n_splits=5, n_repeats=5, random_state=None):
    """Evaluate a hard-voting ensemble with repeated stratified k-fold CV.

    For every fold a fresh copy of the preprocessing + voting pipeline is
    fitted on the training part and scored on the held-out part. One
    ``Result`` per fold is appended to ``results``.

    Args:
        X (pandas.DataFrame): Feature table. Numeric columns are KNN-imputed
            and standardised; all other columns are imputed with the most
            frequent value and one-hot encoded.
        y (pandas.Series): Class labels, positionally aligned with ``X``.
        models: A list of classifiers to be combined into a voting
            classifier. Each is named after its class, so two models of the
            same class produce duplicate estimator names.
        results (list, optional): Existing list to append to. A new list is
            created when ``None``.
        save_model (bool): When true, pickle ``results`` to ``results.pkl``
            in the current working directory after every fold.
        n_splits (int): Number of folds per repetition.
        n_repeats (int): Number of repetitions of the k-fold split.
        random_state (int, optional): Seed for the fold shuffling; ``None``
            keeps the splits non-deterministic.

    Returns:
        list: ``results`` with one ``Result(confusion_matrix, accuracy,
        model)`` appended per fold. ``confusion_matrix`` is row-normalised
        and its rows/columns follow the sorted unique labels of ``y``.
    """
    # Imported locally so the function does not depend on an extra
    # module-level import being present.
    from sklearn.base import clone

    # ``is None`` (not truthiness) so a caller-supplied empty list is filled
    # in place instead of being silently replaced.
    if results is None:
        results = []

    # Repeated stratified k-fold cross-validation (there is no inner loop,
    # so this is not nested CV).
    rskf = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    # Create a voter classifier object out of prebuilt ones
    vt_classifier = VotingClassifier(
        estimators=[(model.__class__.__name__, model) for model in models],
        voting='hard',
    )
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    # Two small pipelines, one for numeric and one for categorical features.
    numeric_pipeline = Pipeline(steps=[
        ('impute', KNNImputer()),
        ('scale', StandardScaler())
    ])

    # ``sparse`` was renamed to ``sparse_output`` in scikit-learn 1.2 and
    # removed in 1.4; try the new keyword first, fall back to the old one.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', encoder)
    ])

    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    # Unfitted template; it is cloned per fold so folds never share state.
    vt_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ('model', vt_classifier)
    ])

    # One fixed label order for all folds, so the confusion matrices are
    # directly comparable (and averageable) across folds.
    labels = np.unique(y)

    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        fold_pipeline = clone(vt_pipeline)
        fold_pipeline.fit(X_train, y_train)
        y_pred = fold_pipeline.predict(X_test)

        conf = confusion_matrix(y_test, y_pred, labels=labels, normalize='true')
        acc = accuracy_score(y_test, y_pred)
        # Store this fold's own fitted classifier rather than one shared
        # object that would be refit (and overwritten) on the next fold.
        results.append(Result(conf, acc, fold_pipeline.named_steps['model']))

        # Rewritten after every fold, so the file always holds the folds
        # completed so far.
        if save_model:
            with open('results.pkl', 'wb') as f:
                pickle.dump(results, f)

    return results


# Fraction of rows kept from each target class.
sampling_fraction = 0.1
classifiers = [SVC(), DecisionTreeClassifier(), SGDClassifier()]
# Raw string: the backslashes are literal path separators, not escape
# sequences (the non-raw form triggers invalid-escape warnings on newer
# Python versions). The resulting value is unchanged.
data_path = r"D:\BitBucket/5280\Project\Data\Engineered"
df = pd.read_parquet(Path(data_path) / 'combined_df_no_hw.parquet')
# Stratified sampling: draw the same fraction from every target class.
# GroupBy.sample keeps the 'target' column in the result, which the lines
# below rely on.
df = df.groupby('target').sample(frac=sampling_fraction)
y = df['target']
X = df.drop(columns=['target'])
# predict(X, y, classifiers)