In [None]:
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import preprocessing
from scipy import stats
import pandas as pd
from scipy.io.arff import loadarff 
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
import time
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from numpy import mean
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from random import randint
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, Ridge, Lasso, ElasticNet, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def get_all_data(path, features_encoder, size_training, scaler): 
    """Load a CSV dataset and return its prepared features and encoded labels.

    The label has to be the last column in the file; every other column is
    treated as a feature.

    Parameters
    ----------
    path : str
        Location of the CSV file (handed to ``pd.read_csv``).
    features_encoder : OneHotEncoder, OrdinalEncoder, TargetEncoder or None
        Encoder applied to the non-numeric feature columns. With ``None`` the
        non-numeric columns are dropped. One-hot and ordinal encoders are
        fitted on all rows; the target encoder is fitted on the first
        ``size_training`` rows only and then applied to the remaining rows.
    size_training : int
        Number of leading rows that form the training part.
    scaler : bool
        When ``True`` the features are min-max scaled; the scaler is fitted on
        the first ``size_training`` rows and applied to the rest.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a DataFrame and
        ``labels`` is the output of ``LabelEncoder.fit_transform``.

    Raises
    ------
    TypeError
        If ``features_encoder`` is not ``None`` and not one of the supported
        encoder types while categorical columns are present.
    """
    dataframe = pd.read_csv(path)
    # The label has to be the last column in the file!
    labels = preprocessing.LabelEncoder().fit_transform(dataframe.iloc[:, -1])
    features = dataframe.iloc[:, :-1]
    print(features.shape)

    numeric_features = features._get_numeric_data()
    if features_encoder is None:
        features = numeric_features
    else:
        numeric_names = set(numeric_features.columns)
        # Keep the file's column order so the resulting feature layout is
        # deterministic (a set difference has no stable order).
        categorical_cols = [col for col in features.columns if col not in numeric_names]
        print('categorical features', categorical_cols)

        if not categorical_cols:
            # Nothing to encode; the encoders reject an empty column selection.
            features = numeric_features
        else:
            categorical = features[categorical_cols]
            if isinstance(features_encoder, OneHotEncoder):
                feature_arr = features_encoder.fit_transform(categorical)
                # The encoder returns a sparse matrix unless configured otherwise.
                if hasattr(feature_arr, 'toarray'):
                    feature_arr = feature_arr.toarray()
                # get_feature_names was removed from newer scikit-learn releases.
                if hasattr(features_encoder, 'get_feature_names_out'):
                    feature_labels = features_encoder.get_feature_names_out(categorical_cols)
                else:
                    feature_labels = features_encoder.get_feature_names(categorical_cols)
                encoded_features = pd.DataFrame(feature_arr, columns=feature_labels, index=features.index)
            elif isinstance(features_encoder, OrdinalEncoder):
                feature_arr = features_encoder.fit_transform(categorical)
                encoded_features = pd.DataFrame(feature_arr, columns=categorical_cols, index=features.index)
            elif isinstance(features_encoder, TargetEncoder):
                # Fit on the training rows only so the test labels do not leak
                # into the encoding.
                training_encoded = pd.DataFrame(
                    features_encoder.fit_transform(categorical.iloc[:size_training], labels[:size_training]),
                    columns=categorical_cols,
                )
                testing_encoded = pd.DataFrame(
                    features_encoder.transform(categorical.iloc[size_training:]),
                    columns=categorical_cols,
                )
                # DataFrame.append no longer exists in pandas 2.x.
                encoded_features = pd.concat([training_encoded, testing_encoded])
            else:
                raise TypeError(
                    'unsupported features_encoder: ' + features_encoder.__class__.__name__
                )

            features = numeric_features.join(encoded_features)

    if scaler is True:
        min_max = MinMaxScaler()
        train_part = features.iloc[:size_training]
        test_part = features.iloc[size_training:]
        # Fit on the training rows only; keep the original row index so the
        # result does not end up with duplicated index labels.
        scaled_parts = [
            pd.DataFrame(min_max.fit_transform(train_part), columns=features.columns, index=train_part.index)
        ]
        if len(test_part) > 0:
            scaled_parts.append(
                pd.DataFrame(min_max.transform(test_part), columns=features.columns, index=test_part.index)
            )
        features = pd.concat(scaled_parts)

    return features,labels

In [None]:
def get_training_data(features, labels, size_training, scaler): 
    """Return the leading ``size_training`` rows of the features and labels.

    ``features`` is sliced positionally through ``.iloc`` and ``labels`` with
    a plain slice. ``scaler`` is accepted but not used by this function.

    Returns
    -------
    tuple
        ``(training_features, training_labels)``.
    """
    head = slice(None, size_training)
    return features.iloc[head], labels[head]

In [None]:
def learn_classifier(training_features, training_labels, classifier): 
    """Fit ``classifier`` on the given training data and hand it back.

    The classifier is fitted in place; the same object that was passed in is
    returned (not the return value of its ``fit`` method).
    """
    fitted = classifier
    fitted.fit(training_features, training_labels)
    return fitted

In [None]:
# #https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# def find_best_parameters_random_forest(training_features, training_labels):
    
#     # Number of trees in random forest
#     n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 100)]
#     # Maximum number of levels in tree
#     max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#     # Method of selecting samples for training each tree
#     bootstrap = [True, False]
#     # Create the random grid
#     random_grid = {'n_estimators': n_estimators,
#                    'max_depth': max_depth,
#                    'bootstrap': bootstrap}
#     #print(random_grid)
#     rf = RandomForestClassifier()
#     rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
#     # Fit the random search model
#     rf_random.fit(training_features, training_labels)
#     print('best parameters', rf_random.best_params_)
#     return rf_random.best_estimator_
    

In [None]:
def compute_score_training_set(model, training_features, training_labels, length_test_set, type_splitting):
    """Estimate the accuracy of ``model`` on the training data over 20 random splits.

    Each run holds out part of the training data, fits a copy of the model on
    the rest and records the accuracy on the held-out part.

    Parameters
    ----------
    model : estimator
        Estimator with ``fit`` and ``predict``. It is cloned before fitting,
        so the object passed in is left untouched (an already fitted
        classifier stays fitted on whatever it was trained on).
    training_features : pandas.DataFrame
        Training rows, sliced positionally.
    training_labels : array-like
        Labels aligned with ``training_features``.
    length_test_set : int
        Size of the held-out window for the sequential splitting method.
    type_splitting : int
        ``0`` - sequential splitting: a random contiguous window of
        ``length_test_set`` rows is held out and the rows before and after it
        are used for training.
        ``1`` - time-based splitting: a random cut point is drawn and the rows
        after it are held out, the rows before it are used for training.

    Returns
    -------
    list
        ``[mean_accuracy, std_accuracy]`` over the runs.

    Raises
    ------
    ValueError
        If ``type_splitting`` is neither 0 nor 1, or if the sequential window
        does not fit strictly inside the training data.
    """
    # Local import: clone is not part of the notebook's import cell.
    from sklearn.base import clone

    sample_count = len(training_features)
    number_runs = 20

    if type_splitting not in (0, 1):
        raise ValueError('type_splitting must be 0 (sequential) or 1 (time-based)')
    if type_splitting == 0 and not 0 < length_test_set < sample_count:
        raise ValueError('length_test_set must be positive and smaller than the training set')

    # 1-D label array: estimators expect shape (n_samples,), not a column frame.
    label_array = np.asarray(training_labels)
    # Work on an unfitted copy so the caller's estimator is not re-fitted on a
    # subset of the training data as a side effect.
    work_model = clone(model)

    accuracy_list = []
    for _ in range(number_runs):
        if type_splitting == 0:
            # Sequential splitting: the window start is bounded by the window
            # length so every held-out window has exactly length_test_set rows.
            start = randint(0, sample_count - length_test_set)
            stop = start + length_test_set
            x_test = training_features.iloc[start:stop]
            y_test = label_array[start:stop]
            x_train = pd.concat([training_features.iloc[:start], training_features.iloc[stop:]])
            y_train = np.concatenate([label_array[:start], label_array[stop:]])
        else:
            # Time-based splitting.
            # NOTE(review): the bounds subtract number_runs from the sample
            # count; kept unchanged - confirm whether plain 50%-90% of the
            # training data was intended.
            start_testing = randint(int((sample_count - number_runs) * 0.5), int((sample_count - number_runs) * 0.9))
            x_test = training_features.iloc[start_testing:]
            y_test = label_array[start_testing:]
            x_train = training_features.iloc[:start_testing]
            y_train = label_array[:start_testing]

        work_model.fit(x_train, y_train)
        predicted_labels = work_model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, predicted_labels))

    training_mean = np.mean(accuracy_list)
    training_std = np.std(accuracy_list)
    return [training_mean, training_std]

In [None]:
def check_batches(classifier, features, labels, scores_training_set, size_training, size_dataset, size_batch):
    """Score the classifier on consecutive batches and flag those below the training baseline.

    Rows ``size_training`` up to ``size_dataset`` are cut into batches of
    ``size_batch`` rows. A batch is flagged when its score is lower than
    ``training_mean - 1 * training_std``. The per-batch scores are also drawn
    as a bar chart (green = at or above the threshold, red = below).

    Parameters
    ----------
    classifier : estimator
        Fitted estimator exposing ``score(X, y)``.
    features, labels
        Full feature table and label array; both are sliced positionally.
    scores_training_set : sequence
        ``[training_mean, training_std]``.
    size_training : int
        Position of the first row after the training part.
    size_dataset : int
        Position one past the last row to evaluate.
    size_batch : int
        Number of rows per batch.

    Returns
    -------
    list
        1-based numbers of the flagged batches.
    """
    training_mean = scores_training_set[0]
    training_std = scores_training_set[1]
    threshold = training_mean - 1.00 * training_std
    print('training mean:', training_mean, ', training std:', training_std)

    detected_batches = []
    batch_scores = []
    batch_starts = range(size_training, size_dataset, size_batch)
    for batch_number, begin in enumerate(batch_starts, start=1):
        batch = features[begin:begin + size_batch]
        batch_labels = labels[begin:begin + size_batch]
        if len(batch_labels) == 0:
            # size_dataset reaches past the available rows; nothing left to score.
            break
        batch_score = classifier.score(batch, batch_labels)
        batch_scores.append(batch_score)
        if(threshold > batch_score):
            detected_batches.append(batch_number)
    print('detected batches:', detected_batches)

    # Draw graph with accuracies of each batch.
    batch_scores = np.array(batch_scores)
    fig, ax = plt.subplots()
    # One bar position per scored batch; deriving this from the sizes instead
    # breaks the plot when the evaluated range is not a whole number of batches.
    indices = np.arange(1, len(batch_scores) + 1)
    ax.axhline(y = training_mean, label='mean', color='red')
    ax.axhline(y = threshold, label='mean - 1 std', color='grey')
    ax.set_xlabel('Batch Number')
    ax.set_ylabel('Accuracy')
    ax.set_ylim(0.5, 1)
    colors = ['green' if score >= threshold else 'red' for score in batch_scores]
    if len(batch_scores) > 0:
        ax.bar(indices, batch_scores, color = colors)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()
    print()

    return detected_batches

In [None]:
def get_concept_drift(path, size_dataset, size_training, size_batch, features_encoder, scaler, type_splitting):
    """Run the batch-wise drift check for one dataset.

    Loads the data, computes the accuracy baseline (mean and std) on the
    training part, fits each model on the full training part and compares the
    accuracy on every following batch against that baseline.

    Parameters are forwarded to ``get_all_data``, ``get_training_data``,
    ``compute_score_training_set`` and ``check_batches``; ``size_batch`` is
    also used as the held-out window length for the baseline.
    """
    features, labels = get_all_data(path, features_encoder, size_training, scaler)
    training_features, training_labels = get_training_data(features, labels, size_training, scaler)
    # Tried, bad for 'spam': svm.LinearSVC(), LogisticRegression(), PassiveAggressiveClassifier(),
    # Perceptron(), Lasso(), ElasticNet(), SGDClassifier(), MultinomialNB(), svm.SVR(), BernoulliNB(alpha=.001)

    # Spam data set
    models = [RandomForestClassifier(n_estimators = 1000, max_depth = 40, bootstrap = False)]
    # Model for weather: [svm.SVC(random_state=42, C=2.09)]
    # Model for elect2 and airline: [KNeighborsClassifier(n_neighbors=100)]
    for model in models:
        print('model:', model.__class__.__name__)
        # Compute the baseline first: the scoring routine fits the estimator on
        # subsets of the training data, so the fit on the full training part
        # has to be the last one before the batches are evaluated.
        scores_training_set = compute_score_training_set(model, training_features, training_labels, size_batch, type_splitting)
        classifier = learn_classifier(training_features, training_labels, model)
        check_batches(classifier, features, labels, scores_training_set, size_training, size_dataset, size_batch)

In [None]:
def run_test(path, size_dataset, size_training, size_batch, features_encoder, scaler, type_splitting):
    """Print the configuration of one experiment, run it and report its duration.

    ``size_dataset`` is first reduced so that the rows after the training part
    form a whole number of batches of ``size_batch`` rows. The remaining
    arguments are passed on to ``get_concept_drift`` unchanged. The duration
    is reported in whole minutes.
    """
    started_at = time.time()

    # Adjust size of dataset so that all batches have equal size.
    whole_batches = int((size_dataset - size_training)/ size_batch)
    size_dataset = size_training + whole_batches * size_batch

    print('dataset:', path)
    print('size dataset: ', size_dataset, ', size training: ', size_training, ', size testing batch: ', size_batch, sep='')
    if features_encoder is not None:
        print('categorical features encoder', features_encoder.__class__.__name__)
    print('features scaled using MinMaxScaler' if scaler is True else 'features are not scaled')
    print('sequential splitting method' if type_splitting == 0 else 'time-based splitting method')
    print()

    get_concept_drift(path, size_dataset, size_training, size_batch, features_encoder, scaler, type_splitting)

    elapsed_minutes = int((time.time() - started_at) / 60)
    print("duration of test: ", elapsed_minutes, ' minutes', sep='')
    print()
    print()
    

In [None]:
# SPAM
# Model noted for this dataset:
# RandomForestClassifier(n_estimators = 1000, max_depth = 40, bootstrap = False)

# Batch sizes 100, 50 and 20; for each size first the sequential splitting
# method (last parameter = 0), then the time-based one (last parameter = 1).
for spam_batch_size in (100, 50, 20):
    for spam_splitting in (0, 1):
        run_test('real-world/spam_dataset.csv', 4405, 1468, spam_batch_size, None, False, spam_splitting)

In [None]:
# ELECT2
# Model noted for this dataset: KNeighborsClassifier(n_neighbors=100)
# NOTE(review): get_concept_drift builds its own model list - make sure it
# contains this model before running the cell.

# Batch size 365; first the sequential splitting method (0), then the
# time-based one (1).
for elect_splitting in (0, 1):
    run_test('real-world/electricity_dataset.csv', 45312, 15104, 365, None, True, elect_splitting)

In [None]:
# WEATHER
# Model noted for this dataset: SVC(random_state=random_state, C=2.09)
# NOTE(review): get_concept_drift builds its own model list - make sure it
# contains this model before running the cell.

# Batch sizes 365 and 30; for each size first the sequential splitting
# method (0), then the time-based one (1).
for weather_batch_size in (365, 30):
    for weather_splitting in (0, 1):
        run_test('real-world/weather_dataset.csv', 18159, 6053, weather_batch_size, None, True, weather_splitting)


In [None]:
# AIRLINE
# Model noted for this dataset: KNeighborsClassifier(n_neighbors=100)
# NOTE(review): get_concept_drift builds its own model list - make sure it
# contains this model before running the cell.

# Batch size 17000. One pair of runs per categorical encoder (one-hot,
# ordinal, target); within a pair first the sequential splitting method (0),
# then the time-based one (1). A fresh encoder instance is created per run.
for airline_encoder in (OneHotEncoder, OrdinalEncoder, TargetEncoder):
    for airline_splitting in (0, 1):
        run_test('real-world/airline_dataset.csv', 539383, 179794, 17000, airline_encoder(), True, airline_splitting)

