In [None]:
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline
import joblib

""" from imblearn.over_sampling import SMOTE,SVMSMOTE,BorderlineSMOTE,KMeansSMOTE
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.pipeline import Pipeline """

**Data type codes**

- INT32: 1
- INT64: 2
- FLOAT: 3
- DOUBLE: 4

**Compressor codes**

* GZIP: 1
* LZ4: 2
* SNAPPY: 3 (none)
* UNCOMPRESSED: 4 (none)

**Encoding codes**

+ RLE: 1
+ TS_2DIFF: 2
+ GORILLA: 3
+ PLAIN: 4
+ RAKE: 5
+ SPRINTZ: 6
+ RLBE: 7

In [None]:
# Earlier label set (compressor+encoding combinations), kept for reference:
#   'GZIP+GORILLA', 'GZIP+PLAIN', 'GZIP+RAKE', 'GZIP+RLBE', 'GZIP+RLE',
#   'GZIP+SPRINTZ', 'GZIP+TS_2DIFF', 'LZ4+TS_2DIFF', 'SNAPPY+SPRINTZ',
#   'SNAPPY+TS_2DIFF'

# Display names of the target classes; passed as `target_names` to the
# classification report and used as tick labels on the confusion-matrix plots.
# NOTE(review): scikit-learn orders classes by the sorted unique values of y,
# and this list is matched to them purely by position. Confirm that its order
# agrees with the `np.unique(y)` output printed when the data is loaded.
label = [
    'GORILLA',
    'RLBE',
    'RLE',
    'SPRINTZ',
    'TS_2DIFF',
    'RAKE',
    'PLAIN',
]

# CSV columns used as model features, in the order they are fed to the models.
vnames = [
    "DataType",
    "Mean",
    "Standard_variance",
    "Spread",
    "Delta_mean",
    "Delta_variance",
    "Delta_spread",
    "Repeat",
    "Increase",
]


In [None]:
# --- Training data -----------------------------------------------------------
# Read the training CSV and discard every row that has at least one missing value.
data = pd.read_csv("./data/train.csv").dropna(axis=0, how="any")
data.info()

# Both label columns are extracted; only the encoding is used as the target here.
compressor = data["Compressor"].to_numpy()
encoding = data["Encoding"].to_numpy()
y = encoding
print(np.unique(y))  # the distinct target values present in the training set

X = data[vnames].to_numpy()
# The whole file is used for training (no hold-out split is taken from it).
X_train = X
y_train = y

# --- Evaluation data ---------------------------------------------------------
# The synthetic CSV goes through the same missing-value filter and serves as
# the test set for every model below.
testdata = pd.read_csv("./data/train_synthetic.csv").dropna(axis=0, how="any")
encoding_t = testdata["Encoding"].to_numpy()
y_test = encoding_t
X_test = testdata[vnames].to_numpy()


In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          save = "result.eps"):
    """Draw a confusion matrix as an annotated heat map, save it and show it.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are drawn on the "True label" axis and columns
        on the "Predicted label" axis.
    target_names : sequence of str or None
        Tick labels for both axes. Ticks are left at their defaults when None.
    title : str
        Figure title.
    cmap : matplotlib colormap or None
        Colour map for the cells; 'Blues' is used when None.
    normalize : bool
        If True, every row is divided by its total so each cell shows the
        fraction of that true class. Rows without any sample stay at zero
        instead of turning into NaN.
    save : str or None
        Path the figure is written to in EPS format. Nothing is written when
        the value is None or empty.
    """
    cm = np.asarray(cm)

    if normalize:
        # Normalise before drawing so the cell colours, the printed numbers and
        # the text-colour threshold are all based on the same values.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(5, 4))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()

    if save:
        plt.savefig(save, format='eps', dpi=40, bbox_inches='tight')
    plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

def print_metrices(pred, true, target_names=None):
    """Print the confusion matrix and the per-class classification report.

    The report already contains the weighted-average precision, recall and F1
    rows, so no separate weighted scores are printed.

    Parameters
    ----------
    pred : array-like
        Predicted class labels.
    true : array-like
        Ground-truth class labels.
    target_names : sequence of str or None
        Display names for the classes, matched by position to the sorted class
        values. Falls back to the notebook-level `label` list when None.
    """
    if target_names is None:
        target_names = label
    print(confusion_matrix(true, pred))
    print(classification_report(true, pred, target_names=target_names, digits=4))

In [None]:
# Logistic-regression baseline, tuned with a 3-fold grid search on weighted F1.
pipeline = Pipeline([
    ('lr', LogisticRegression(class_weight='balanced', random_state=42))
])

# The default lbfgs solver does not support the elastic-net penalty, so those
# candidates would fail to fit and be scored NaN. They are searched in their
# own sub-grid, which switches to the saga solver and supplies the l1_ratio
# that elastic-net requires.
# NOTE(review): l1_ratio=0.5 is an arbitrary even L1/L2 mix -- widen the list
# if the elastic-net candidates turn out to be competitive; saga may also need
# a larger max_iter to converge on unscaled features.
param_dist = [
    {'lr__penalty': ['l2'],
     'lr__C': np.logspace(-3, 3, 7)},
    {'lr__penalty': ['elasticnet'],
     'lr__solver': ['saga'],
     'lr__l1_ratio': [0.5],
     'lr__C': np.logspace(-3, 3, 7)},
]
grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=3, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best LogisticRegression model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# The figure goes to the default "result.eps", which the SVC and MLP cells also write.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of LogisticRegression on test data')


In [None]:
# Support-vector classifier, tuned over C with a 3-fold grid search on weighted F1.
pipeline = Pipeline([
    ('svc', SVC())
])

param_dist = {'svc__C': np.logspace(-3, 3, 7)}
grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=3, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best SVC model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# The figure goes to the default "result.eps", which the LogisticRegression
# and MLP cells also write.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of SVC on test data')


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Decision tree, tuned over split criterion and depth with a 5-fold grid
# search on weighted F1. The tree is seeded so re-running the cell reproduces
# the same model.
pipeline = Pipeline([
    ('dt', DecisionTreeClassifier(random_state=42))
])

param_dist = {'dt__criterion': ['gini', 'entropy'],
              'dt__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, None]}
grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=5, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best DecisionTree model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# NOTE(review): the output path below is absolute and machine-specific.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of Decision Tree on test data',
                      save="/home/srt_2022/client-py/image/dt.eps")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyper-parameters. The empty grid makes GridSearchCV
# evaluate this single configuration with 2-fold cross-validation and then
# refit it on the full training set. The forest is seeded so the saved model
# is reproducible.
pipeline = Pipeline([
    ('rf', RandomForestClassifier(n_estimators=300, criterion="entropy",
                                  random_state=42))
])

# Grid tried earlier: {'rf__n_estimators': range(250, 350, 10)}
param_dist = {}
grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=2, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best RandomForest model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# NOTE(review): the output path below is absolute and machine-specific.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of Random Forest on synthetic data',
                      save="/home/srt_2022/client-py/image/rf.eps")
# Persists the whole fitted GridSearchCV object, not only the best estimator.
joblib.dump(grid, 'rf.model')

In [None]:
# Gradient boosting with fixed hyper-parameters. The empty grid makes
# GridSearchCV evaluate this single configuration with 2-fold cross-validation
# and then refit it on the full training set. The model is seeded so the saved
# artefact is reproducible.
pipeline = Pipeline([
    ('gbc', GradientBoostingClassifier(n_estimators=100, max_depth=7,
                                       random_state=42))
])

# Grid tried earlier:
#   {'gbc__n_estimators': range(10, 100, 10),
#    'gbc__max_depth': [2, 3, 4, 5, 6, 7, 8, None]}
param_dist = {}

grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=2, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best GradientBoosting model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# NOTE(review): the output path below is absolute and machine-specific.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of Gradient Boosting on test data',
                      save="/home/srt_2022/client-py/image/gdbt.eps")
# Persists the whole fitted GridSearchCV object, not only the best estimator.
joblib.dump(grid, 'gdbt.model')


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron (one hidden layer of 80 logistic units), tuned over
# the L2 penalty `alpha` with a 3-fold grid search on weighted F1. The network
# is seeded so weight initialisation is reproducible.
pipeline = Pipeline([
    ('mlp', MLPClassifier(hidden_layer_sizes=80, activation='logistic',
                          alpha=0.1, random_state=42))
])

# Grid tried earlier:
#   {'mlp__hidden_layer_sizes': range(10, 100, 5),
#    'mlp__activation': ['identity', 'logistic', 'tanh', 'relu']}
param_dist = {'mlp__alpha': np.logspace(-4, 4, 9)}

grid = GridSearchCV(pipeline, param_dist, verbose=2,
                    refit=True, cv=3, n_jobs=-1, scoring="f1_weighted")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
# grid.score applies the search's own scorer, i.e. weighted F1, not accuracy.
print('The weighted F1 of the best MLP model on the test set is',
      grid.score(X_test, y_test))

pred = grid.predict(X_test)
print_metrices(pred, y_test)
# The figure goes to the default "result.eps", which the LogisticRegression
# and SVC cells also write.
plot_confusion_matrix(confusion_matrix(y_test, pred), target_names=label, normalize=False,
                      title='Confusion matrix of MLP on test data')
