In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from matplotlib import pyplot


def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows containing NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose remaining values must all be convertible to float64.

    Returns
    -------
    pd.DataFrame
        New frame holding only the fully finite rows (the original index
        labels are kept). ``df`` itself is left unmodified.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-mutating dropna: the caller's frame must not shrink as a side effect.
    without_nan = df.dropna()
    # `axis` is passed by keyword; pandas 2.x rejects the positional `.any(1)`.
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_infinite].astype(np.float64)


def rank_feature_importance(m, type="Logistic Regression"):
  """Print and bar-plot the per-feature importance scores of a fitted model.

  When ``type == "Random Forest"`` the scores are read from
  ``m.feature_importances_``; for every other value they are the first row
  of ``m.coef_``.
  """
  importance = m.feature_importances_ if type == "Random Forest" else m.coef_[0]

  # One line per feature: its positional index followed by its score.
  for position, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, score))

  # Bar chart with the feature index on the x axis.
  pyplot.bar(list(range(len(importance))), importance)
  pyplot.show()

In [2]:
# Load the CSV and discard the 'src' and 'dst' columns before previewing it.
df = pd.read_csv('dataset_sdn.csv').drop(columns=['src', 'dst'])
df.head()

FileNotFoundError: ignored

In [None]:
# Summary statistics of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# One-hot encode 'Protocol': the original column is swapped for one indicator
# column per distinct value.
one_hot = pd.get_dummies(df['Protocol'])
df = df.drop(columns='Protocol').join(one_hot)


In [None]:
# Keep only fully finite rows (no NaN, no +/-inf) and cast every column to float64.
assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
df.dropna(inplace=True)
# `axis` must be a keyword argument: pandas 2.x rejects the positional `.any(1)`.
indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)

df = df[indices_to_keep].astype(np.float64)

In [None]:
# Separate the feature columns from the 'label' target column.
x = df.drop('label',axis = 1)
y = df.label
# Fixed seed: the split, and every metric computed from it, is identical on
# each Restart & Run All instead of changing from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

Normalize the data: rescale every feature to the range [-1, 1].

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then map both splits into
# [-1, 1] with that same fitted scaler.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(x_train)
x_train, x_test = scaling.transform(x_train), scaling.transform(x_test)

In [None]:
# Fit logistic regression on the scaled training split (iteration cap raised
# to 10000). `fit` returns the estimator itself, which is then shown as the
# cell output.
logistic_regression = LogisticRegression(max_iter=10000).fit(x_train, y_train)
logistic_regression

In [None]:
# Print and plot the logistic-regression coefficients (default `type`, so the
# helper reads the first row of `coef_`).
rank_feature_importance(logistic_regression)

In [None]:
# Accuracy of logistic regression on the held-out split, reported as a percentage.
y_pred = logistic_regression.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
print("Logistic Regression Accuracy:", accuracy_percentage)


In [None]:
from sklearn import svm

# Linear-kernel support vector classifier fitted on the scaled training split,
# with verbose output enabled. `fit` returns the estimator itself, which is
# then shown as the cell output.
clf = svm.SVC(kernel='linear', verbose=3).fit(x_train, y_train)
clf

In [None]:
# First row of the fitted linear SVM's coefficient matrix.
print(clf.coef_[0])

In [None]:
# Print and plot the SVM coefficients (default `type`, so the helper reads the
# first row of `coef_`).
rank_feature_importance(clf)

In [None]:
# Accuracy of the SVM on the held-out split, reported as a percentage.
y_pred = clf.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
print("SVM Accuracy:", accuracy_percentage)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, its predictions and its importances are reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the scaled training split, then predict the held-out split.
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Importances are read from `clf`, the forest fitted above. `model` belongs to
# the ExtraTrees cell further down and does not exist yet on a fresh kernel.
feat_importances = pd.Series(clf.feature_importances_, index=x.columns)
feat_importances.plot(kind='barh')

In [None]:
# Sanity check: the random forest (`clf`) should expose exactly one importance
# per feature column of `x`.
print(clf.feature_importances_, len(clf.feature_importances_))
print(x.columns, len(x.columns))

In [None]:
# Print and plot the forest's importances ("Random Forest" makes the helper
# read `feature_importances_` instead of `coef_`).
rank_feature_importance(clf, "Random Forest")

In [None]:
# Accuracy for whatever `y_pred` currently holds, reported as a percentage
# under the random-forest label.
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
print("Random Forest Accuracy:", accuracy_percentage)

In [None]:
# Model Precision: of the samples predicted positive, what fraction are actually positive?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: of the samples that are actually positive, what fraction were predicted positive?
print("Recall:",metrics.recall_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed so the fitted trees, and the importances and accuracy derived from
# them, are reproducible across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(x_train, y_train)
print(model.feature_importances_)  # built-in feature_importances_ attribute of tree-based classifiers
# Horizontal bar chart of the importances, labelled with the feature column names.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.plot(kind='barh')


In [None]:
# Sanity check: one importance per feature column. Compare against `x.columns`
# (the features the model was trained on); `df.columns` still contains 'label'
# and would therefore be one entry longer.
print(model.feature_importances_, len(model.feature_importances_))
print(x.columns, len(x.columns))

In [None]:
# Accuracy of the ExtraTrees classifier on the held-out split, reported as a percentage.
y_pred = model.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
print("ExtraTrees Classifier Accuracy :", accuracy_percentage)
