Feature scaling
===============
https://en.wikipedia.org/wiki/Feature_scaling

http://sebastianraschka.com/Articles/2014_about_feature_scaling.html

http://stats.stackexchange.com/questions/41704/how-and-why-do-normalization-and-feature-scaling-work

In [None]:
import csv
import numpy as np
import scipy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs
# download_plotlyjs('https://cdn.plot.ly/plotly-latest.min.js')
# Initialise plotly's offline notebook mode before any py.iplot call below.
py.init_notebook_mode()

# Marker styling shared by the scatter traces built in this notebook.
defaultScatterMarker = {
    'size': 10,
    'colorscale': 'Viridis',
    'opacity': 0.5,
}

# np.set_printoptions(precision=7, suppress=True, threshold=np.nan)
# Render floats inside printed numpy arrays in fixed-point form, 10 decimals.
np.set_printoptions(formatter={'float': "{0:0.10f}".format})

In [None]:
def _pcaProject(X, y, n_components, labels=None):
    """Fit a PCA on ``X`` and return the data both scatter plots need.

    Prints the explained-variance ratio of the fitted components and the shape
    of the projected data.

    Parameters
    ----------
    X : samples to project; passed unchanged to ``PCA.fit`` / ``PCA.transform``.
    y : iterable of class labels, one per sample of ``X``.
    n_components : number of principal components to keep.
    labels : ordered list of all class labels; a sample's colour code is the
        position of its label in this list. Defaults to the notebook-level
        ``families`` list.

    Returns
    -------
    (components, colorIndex) : the projection transposed so that
        ``components[i]`` holds the i-th principal component of every sample,
        and the list of integer colour codes (one per entry of ``y``).

    Raises
    ------
    ValueError : if a label in ``y`` does not occur in ``labels``.
    """
    if labels is None:
        # Fall back to the module-level list so existing two-argument calls
        # keep working.
        labels = families

    pca = PCA(n_components=n_components)
    pca.fit(X)
    print(pca.explained_variance_ratio_)

    # Transpose so each principal component can be handed to plotly as one axis.
    components = pca.transform(X).T
    colorIndex = [labels.index(label) for label in y]
    print(components.shape)
    return components, colorIndex


def plotScatter(X, y, labels=None):
    """Show a 2-D scatter plot of ``X`` projected on its first two principal components.

    Points are coloured by class and carry their label from ``y`` as hover text.

    Parameters
    ----------
    X : samples to project and plot.
    y : iterable of class labels, one per sample of ``X``.
    labels : optional ordered list of all class labels used to derive the
        colour codes; defaults to the notebook-level ``families`` list.
    """
    components, colorIndex = _pcaProject(X, y, n_components=2, labels=labels)

    trace1 = go.Scatter(x=components[0], y=components[1], #z=components[2],
        text=y, mode='markers', marker={**defaultScatterMarker, 'color': colorIndex}
    )

    layout = go.Layout(
#         margin=dict(l=0, r=0, b=0, t=0),
#         width=780,
#         height=600,
    )

    fig = go.Figure(data=[trace1], layout=layout)
    py.iplot(fig)


def plotScatter3d(X, y, labels=None):
    """Show a 3-D scatter plot of ``X`` projected on its first three principal components.

    Points are coloured by class and carry their label from ``y`` as hover text.

    Parameters
    ----------
    X : samples to project and plot.
    y : iterable of class labels, one per sample of ``X``.
    labels : optional ordered list of all class labels used to derive the
        colour codes; defaults to the notebook-level ``families`` list.
    """
    components, colorIndex = _pcaProject(X, y, n_components=3, labels=labels)

    trace1 = go.Scatter3d(x=components[0], y=components[1], z=components[2],
        text=y, mode='markers', marker={**defaultScatterMarker, 'color': colorIndex}
    )

    # Zero margins so the 3-D scene uses the whole output area.
    layout = go.Layout(
         margin=dict(l=0, r=0, b=0, t=0),
    )

    fig = go.Figure(data=[trace1], layout=layout)
    py.iplot(fig)

In [None]:
def evaluateKnn(X_train, y_train, X_test, y_test, k=1):
    """Fit a k-nearest-neighbours classifier and report its test accuracy.

    Parameters
    ----------
    X_train, y_train : training samples and their labels.
    X_test, y_test : held-out samples and their true labels.
    k : number of neighbours handed to ``KNeighborsClassifier`` (default 1).

    Returns
    -------
    The percentage (0-100) of test samples whose predicted label equals the
    true label. The same value is printed followed by ``'% success'``.
    """
    knn = KNeighborsClassifier(k)
    knn.fit(X_train, y_train)

    predicted = knn.predict(X_test)
    # Mean of the element-wise match mask is the hit rate; computed once and
    # vectorised instead of summing the array twice with the builtin sum().
    accuracy = np.mean(predicted == np.asarray(y_test)) * 100
    print(accuracy, '% success')
    return accuracy

In [None]:
import glob, os

# One CSV per family. Sorting makes the file order (and with it the label
# order, colour codes and row order of the concatenated frame) deterministic;
# glob alone returns files in an OS-dependent order.
files = sorted(glob.glob("data/sample_families/*.csv"))

# The family label is the file name including its ".csv" extension.
# basename replaces the former hard-coded slice offset, so the label stays
# correct if the directory is renamed and with either path separator.
families = [os.path.basename(path) for path in files]
# files.remove('data/sample_families\\cleanpup_plus_prev.csv')
print(files)

In [None]:
# One DataFrame per family file, in the same order as `files` / `families`.
data = [pd.read_csv(path, sep=';') for path in files]
familiesCounts = [frame.shape[0] for frame in data]

# ignore_index: every CSV restarts its row labels at 0, so a plain concat
# would leave duplicate labels in the combined frame.
df = pd.concat(data, ignore_index=True)
# Drop the last column -- presumably an empty column produced by a trailing
# ';' in the CSV rows; TODO confirm against the raw files.
df = df.drop(df.columns[-1], axis=1)

#print(df.columns)
pd.set_option('display.max_columns', None)
print(df.shape)

# One label per row: each family name repeated once per row of its file.
targets = np.repeat(families, familiesCounts)
print(targets.shape)

# Kept as the last expression so the notebook actually renders the summary
# (mid-cell it was evaluated and silently discarded).
df.describe()

In [None]:
# Histogram of the label array: shows how many samples each family has.
familyHistogram = go.Histogram(x=targets)
data = [familyHistogram]
py.iplot(data)

In [None]:
# Columns kept for the experiments; every other column of df is discarded.
numericHeaders = [
    'subsys', 'sects', 'res_cnt', 'imp_cnt', 'file_size',
    'insns', 'eff_insns', 'code_rank',
    'mem_w', 'w_size', 'mem_r', 'r_size',
    'exc_cnt', 'api_cnt', 'alc_rat',
    'rat_ari', 'rat_stack', 'rat_comp', 'rat_codefl', 'rat_assign',
    'rat_str', 'rat_oth', 'rat_fpu',
    'rat_reg8', 'rat_reg16', 'rat_reg32',
    'rat_mem8', 'rat_mem16', 'rat_mem32',
    'rat_imm8', 'rat_imm16', 'rat_imm32',
    'cnt_qtst', 'cnt_qmod', 'cnt_dreg',
]

# Number of selected feature columns.
D = len(numericHeaders)
df = df[numericHeaders]
print(df.shape)

In [None]:
# Summary statistics of df after it was restricted to the numericHeaders columns.
df.describe()

Missing values
==============

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# 70 % of the rows go to training; stratify keeps the family proportions equal
# in both parts. A fixed random_state makes the split -- and every score
# computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df, targets, train_size=0.7, stratify=targets, random_state=0)

Filling missing values
=====

In [None]:
# Missing entries are replaced by the per-column maximum of the training split;
# the same training-derived values are used for the test split.
fillValues = X_train.max()
X_train_filled = X_train.fillna(fillValues)
X_test_filled = X_test.fillna(fillValues)

Original dataset
================

In [None]:
# Baseline: 1-NN accuracy and PCA scatter on the unscaled (only filled) data.
evaluateKnn(X_train=X_train_filled, y_train=y_train,
            X_test=X_test_filled, y_test=y_test, k=1)
plotScatter(X=X_train_filled, y=y_train)

Standardization
=========

In [None]:
# Column statistics are taken from the training split only and reused for the
# test split, so no information from the test data enters the scaling.
trainMean = X_train_filled.mean()
# A constant column has std 0 and would turn into NaN/inf after the division;
# dividing by 1 instead leaves such a column at 0 after centring.
trainStd = X_train_filled.std().replace(0, 1)

X_train_znorm = (X_train_filled - trainMean) / trainStd
X_test_znorm = (X_test_filled - trainMean) / trainStd

# Last expression of the cell so the summary is actually displayed.
X_train_znorm.describe()

In [None]:
# kNN accuracy (default k) and PCA scatter on the standardised data.
evaluateKnn(X_train=X_train_znorm, y_train=y_train,
            X_test=X_test_znorm, y_test=y_test)
plotScatter(X=X_train_znorm, y=y_train)

Rescaling
=========

In [None]:
# Column minimum and range come from the training split only and are reused
# for the test split (test values may therefore fall outside [0, 1]).
trainMin = X_train_filled.min()
# A constant column has range 0 and would divide by zero; dividing by 1
# instead maps such a column to 0.
trainRange = (X_train_filled.max() - trainMin).replace(0, 1)

X_train_minmax = (X_train_filled - trainMin) / trainRange
X_test_minmax = (X_test_filled - trainMin) / trainRange

# Last expression of the cell so the summary is actually displayed.
X_train_minmax.describe()

In [None]:
# kNN accuracy (default k) and PCA scatter on the min-max rescaled data.
evaluateKnn(X_train=X_train_minmax, y_train=y_train,
            X_test=X_test_minmax, y_test=y_test)
plotScatter(X=X_train_minmax, y=y_train)

Scaling to unit length: L1 norm
=====

In [None]:
# Each sample (row) is divided by the sum of the absolute values of its own
# features, so this scaling needs no statistics from the training split.
# A row whose features are all 0 has norm 0; dividing by 1 keeps it at 0
# instead of producing NaN.
trainL1 = X_train_filled.abs().sum(axis=1).replace(0, 1)
X_train_l1norm = X_train_filled.div(trainL1, axis=0)

testL1 = X_test_filled.abs().sum(axis=1).replace(0, 1)
X_test_l1norm = X_test_filled.div(testL1, axis=0)

# Last expression of the cell so the summary is actually displayed.
X_train_l1norm.describe()

In [None]:
# kNN accuracy (default k) and PCA scatter on the L1-normalised samples.
evaluateKnn(X_train=X_train_l1norm, y_train=y_train,
            X_test=X_test_l1norm, y_test=y_test)
plotScatter(X=X_train_l1norm, y=y_train)

Scaling to unit length: L2 norm
=====

In [None]:
from sklearn import preprocessing
# Each sample (row) is divided by its own Euclidean length.
# NOTE(review): this starts from the standardised frames (X_*_znorm), whereas
# the L1 cell starts from the unscaled X_*_filled frames -- confirm that the
# difference is intended.
# A row of all zeros has length 0; dividing by 1 keeps it at 0 instead of
# producing NaN.
trainL2 = np.sqrt(np.square(X_train_znorm).sum(axis=1)).replace(0, 1)
X_train_l2norm = X_train_znorm.div(trainL2, axis=0)

testL2 = np.sqrt(np.square(X_test_znorm).sum(axis=1)).replace(0, 1)
X_test_l2norm = X_test_znorm.div(testL2, axis=0)

# Last expression of the cell so the summary is actually displayed.
X_train_l2norm.describe()

In [None]:
# kNN accuracy (default k) and PCA scatter on the L2-normalised samples.
evaluateKnn(X_train=X_train_l2norm, y_train=y_train,
            X_test=X_test_l2norm, y_test=y_test)
plotScatter(X=X_train_l2norm, y=y_train)

In [None]:
# results = []
# for k in range(1,50):
#     results.append( evaluateKnn(X_train_l2norm, y_train, X_test_l2norm, y_test, k=k) )

In [None]:
# Hard-coded kNN accuracies in percent, one entry per k.
# Presumably the saved output of the commented-out sweep over k = 1..49 on the
# L2-normalised data, kept so the sweep need not be re-run -- TODO confirm.
results = [99.597837676062923, 99.405173865060519, 99.40143282018667, 99.277978339350184, 99.22560371111652, 99.171358560445938, 99.13394811170761, 99.055386169357106, 98.96186004751128, 98.926320121209855, 98.881427582723859, 98.860851835917771, 98.817829819868692, 98.7186921307121, 98.709339518527528, 98.670058547352284, 98.593367127438697, 98.52976936458353, 98.51667570752511, 98.462430556854528, 98.417538018368532, 98.400703316436278, 98.395091749125527, 98.36329286769795, 98.236097341987616, 98.232356297113782, 98.155664877200195, 98.168758534258615, 98.168758534258615, 98.155664877200195, 98.138830175267955, 98.11638390602495, 98.086455547034291, 98.084585024597374, 98.069620845102037, 98.058397710480534, 98.058397710480534, 98.047174575859046, 97.983576813003864, 97.957389498887039, 97.934943229644048, 97.903144348216458, 97.875086511662715, 97.86573389947813, 97.85264024241971, 97.830193973176705, 97.794654046875294, 97.747890985952381, 97.645012251921955]

In [None]:
# Accuracy as a function of k. The x values are derived from the number of
# stored results (entry i belongs to k = i + 1) so the axis cannot drift out
# of sync with the list.
kValues = list(range(1, len(results) + 1))

knnTrace = go.Scatter(
    x = kValues,
    y = results,
    mode = 'lines+markers',
    name = 'kNN classifier'
)

# Axis titles so the figure can be read on its own.
knnLayout = go.Layout(
    xaxis=dict(title='k (number of neighbours)'),
    yaxis=dict(title='accuracy [%]'),
)

py.iplot(go.Figure(data=[knnTrace], layout=knnLayout))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class weights have to exist before the classifier is constructed; they were
# previously only defined in a later cell, so a fresh top-to-bottom run failed
# with a NameError here.
# NOTE(review): each family is weighted by its sample count, which gives the
# largest families the largest weight. If the goal is to compensate for class
# imbalance, class_weight='balanced' (inverse frequency) is the usual choice --
# confirm the intent.
weights = dict(zip(families, familiesCounts))

# Only the non-default settings are spelled out. The former explicit
# max_features='auto' has been removed from current scikit-learn; the
# classifier's default is the equivalent square-root rule.
# random_state is fixed so the reported score is reproducible.
rf = RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=0,
                            class_weight=weights)
rf.fit(X_train_filled, y_train)
print(rf.score(X_test_filled, y_test))

In [None]:
print(familiesCounts)
print(y_test)

# Map every family label to the number of rows read from its file, echoing
# each pair while the mapping is built.
weights = {}
for family, count in zip(families, familiesCounts):
    print(family, count)
    weights.update({family: count})
print(weights)

In [None]:
# Confusion matrix of the random forest on the held-out split
# (first argument: true labels, second: predictions).
rfPredicted = rf.predict(X_test_filled)
print(confusion_matrix(y_test, rfPredicted))