In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind,chi2_contingency,levene,MonteCarloMethod,PermutationMethod,shapiro,f_oneway,kruskal,pearsonr,spearmanr

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
sns.set_style('darkgrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)  # Ensures that the display width isn't restricted
pd.set_option('display.max_rows', None)

In [5]:
df = pd.read_csv('Network_anomaly_data.csv')
df['is_attack'] = df['attack'].apply(lambda x: 'normal' if x=='normal' else 'attack')
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,rootshell,suattempted,numroot,numfilecreations,numshells,numaccessfiles,numoutboundcmds,ishostlogin,isguestlogin,count,srvcount,serrorrate,srvserrorrate,rerrorrate,srvrerrorrate,samesrvrate,diffsrvrate,srvdiffhostrate,dsthostcount,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,attack,lastflag,is_attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,attack
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal


In [6]:
# land, wrongfragment,urgent, numfailedlogins,loggedin,rootshell,suattempted,numshells,ishostlogin,isguestlogin
df['land'] = df['land'].astype('category')
df['wrongfragment'] = df['wrongfragment'].astype('category')
df['urgent'] = df['urgent'].astype('category')
df['numfailedlogins'] = df['numfailedlogins'].astype('category')
df['loggedin'] = df['loggedin'].astype('category')
df['rootshell'] = df['rootshell'].astype('category')
df['suattempted'] = df['suattempted'].astype('category')
df['numshells'] = df['numshells'].astype('category')
df['ishostlogin'] = df['ishostlogin'].astype('category')
df['isguestlogin'] = df['isguestlogin'].astype('category')

In [7]:
def permutation_test_independence_cat(obs_table, num_permutations=10000):
    # observed stat
    observed_stat, _, _, _ = chi2_contingency(obs_table)
    num_rows, num_cols = obs_table.shape

    # permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffling
        permuted_table = np.copy(obs_table)
        for i in range(num_cols):
            np.random.shuffle(permuted_table[:, i])

        # stat on permutated table
        try:
            permuted_stat, _, _, _ = chi2_contingency(permuted_table)
            permuted_stats.append(permuted_stat)
        except ValueError:
            continue

    # mean stat
    permuted_stats = np.array(permuted_stats)
    p_value = np.mean(permuted_stats >= observed_stat)
    avg_stat = np.mean(permuted_stats)

    return p_value,avg_stat


In [15]:
def epsilon_squared_func(t,N,k):
  epsilon_squared = (t)**2 / ((t**2) +(N - k))
  return epsilon_squared

In [9]:
def check_homogenitity(df,cat_var,numeric_var):
  groups = [df[df[cat_var] == group][numeric_var] for group in df[cat_var].unique()]
  stat, p_value = levene(*groups)
  return p_value

In [10]:
def check_normality_no_sample(df,cat_var,numeric_var):
  groups = df[cat_var].unique()
  normality_results = dict()

  for group in groups:
    data = df[df[cat_var] == group][numeric_var]
    if len(data) >= 5000:
      data = data.sample(200)
    else:
      data = data.sample(200,replace=True)
    stats,p_value = shapiro(data)
    normality_results[group] = p_value
  return normality_results

In [25]:
def cramers_v(chi2, n, contingency_table):
    phi2 = chi2 / n
    r, k = contingency_table.shape
    phi2corr = max(0, phi2 - (k-1)*(r-1) / (n-1))
    k_corr = min(k-1, r-1)
    return np.sqrt(phi2corr / k_corr)

In [26]:
alpha = 0.05
tst_cols = df.drop(['attack','is_attack','numoutboundcmds'],axis=1).columns
binary_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','stat','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    grp1 = df[df['is_attack'] == 'normal'][col]
    grp2 = df[df['is_attack'] != 'normal'][col]

    levene_stat, levene_p_value = levene(grp1, grp2)
    test_type = None
    equal_variance = True
    if levene_p_value > 0.05:
      test_type = 'Regular'
    else:
      test_type = 'Welch'
      equal_variance = False

    t_stat, pvalue = ttest_ind(grp1, grp2,equal_var=equal_variance)
    epsilon = epsilon_squared_func(t_stat,df.shape[0],df['attack'].nunique())
    if pvalue <= 0.05:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':1,'stat':t_stat,'effect':epsilon}
    else:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':0,'stat':t_stat,'effect':epsilon}

  else:
    ctbl = pd.crosstab(df['is_attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
    else:
      p_value,avg_stat = permutation_test_independence_cat(ctbl)
      crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}

In [29]:
binary_test.head()

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,stat,effect
0,is_attack,duration,numeric,3.02785e-67,Welch,1.171722e-60,1,-16.445123,0.002143
1,is_attack,protocoltype,categorical,,Chi_Sqr,0.0,1,10029.248628,0.282132
2,is_attack,service,categorical,,Chi_Sqr_Permutation,0.1403,0,88459.792179,0.837654
3,is_attack,flag,categorical,,Chi_Sqr_Permutation,0.0381,1,52505.167816,0.645537
4,is_attack,srcbytes,numeric,0.03527225,Welch,0.04980998,1,-1.961633,3.1e-05


In [30]:
binary_test[binary_test['status'] == 1]['var2'].values

array(['duration', 'protocoltype', 'flag', 'srcbytes', 'land', 'hot',
       'loggedin', 'numcompromised', 'rootshell', 'numroot',
       'numfilecreations', 'numaccessfiles', 'isguestlogin', 'count',
       'serrorrate', 'srvserrorrate', 'rerrorrate', 'srvrerrorrate',
       'samesrvrate', 'diffsrvrate', 'srvdiffhostrate', 'dsthostcount',
       'dsthostsrvcount', 'dsthostsamesrvrate', 'dsthostdiffsrvrate',
       'dsthostsamesrcportrate', 'dsthostsrvdiffhostrate',
       'dsthostserrorrate', 'dsthostsrvserrorrate', 'dsthostrerrorrate',
       'dsthostsrvrerrorrate', 'lastflag'], dtype=object)

In [33]:
binary_test[binary_test['status'] == 1].sort_values('effect',ascending=False).head()

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,stat,effect
11,is_attack,loggedin,categorical,,Chi_Sqr,0.0,1,60002.601329,0.690149
3,is_attack,flag,categorical,,Chi_Sqr_Permutation,0.0381,1,52505.167816,0.645537
31,is_attack,dsthostsrvcount,numeric,0.0,Welch,0.0,1,384.440155,0.539899
27,is_attack,samesrvrate,numeric,0.0,Welch,0.0,1,383.923128,0.53923
32,is_attack,dsthostsamesrvrate,numeric,3.854622e-34,Welch,0.0,1,342.053264,0.481582


In [31]:
binary_test[binary_test['status'] == 0]['var2'].values

array(['service', 'dstbytes', 'wrongfragment', 'urgent',
       'numfailedlogins', 'suattempted', 'numshells', 'ishostlogin',
       'srvcount'], dtype=object)

In [34]:
def check_normality_no_sample(df,cat_var,numeric_var):
  groups = df[cat_var].unique()
  normality_results = dict()

  for group in groups:
    data = df[df[cat_var] == group][numeric_var]
    if len(data) >= 5000:
      data = data.sample(200)
    else:
      data = data.sample(200,replace=True)
    stats,p_value = shapiro(data)
    normality_results[group] = p_value
  return normality_results

In [35]:
alpha = 0.05

multi_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','stat','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    rdict = check_normality_no_sample(df,'attack',col)
    normality_exists=True
    equal_vars = True
    for value in rdict.values():
      if value < 0.05:
        normality_exists = False
        break
    levene_pval = check_homogenitity(df,'attack',col)
    if levene_pval < 0.05:
      equal_vars = False

    if normality_exists and equal_vars:
      stat, p_value = f_oneway(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'stat':stat,'effect':effect}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'stat':stat,'effect':effect}

    else:
      groups = [group[col].values for _, group in df.groupby('attack')]
      #stat, p_value = kruskal(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      stat, p_value = kruskal(*groups)
      epsilon = epsilon_squared_func(stat,df.shape[0],df['attack'].nunique())
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'stat':stat,'effect':epsilon}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'stat':stat,'effect':epsilon}
  else:
    ctbl = pd.crosstab(df['attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
    else:
      p_value,avg_stat = permutation_test_independence_cat(ctbl)
      crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}


In [38]:
multi_test[multi_test['status'] == 1]['var2'].values

array(['duration', 'srcbytes', 'dstbytes', 'hot', 'numcompromised',
       'numroot', 'numfilecreations', 'numaccessfiles', 'count',
       'srvcount', 'serrorrate', 'srvserrorrate', 'rerrorrate',
       'srvrerrorrate', 'samesrvrate', 'diffsrvrate', 'srvdiffhostrate',
       'dsthostcount', 'dsthostsrvcount', 'dsthostsamesrvrate',
       'dsthostdiffsrvrate', 'dsthostsamesrcportrate',
       'dsthostsrvdiffhostrate', 'dsthostserrorrate',
       'dsthostsrvserrorrate', 'dsthostrerrorrate',
       'dsthostsrvrerrorrate', 'lastflag'], dtype=object)

In [39]:
multi_test[multi_test['status'] == 0]['var2'].values

array(['protocoltype', 'service', 'flag', 'land', 'wrongfragment',
       'urgent', 'numfailedlogins', 'loggedin', 'rootshell',
       'suattempted', 'numshells', 'ishostlogin', 'isguestlogin'],
      dtype=object)

In [41]:
cat_cols = df.select_dtypes(include=['object','category']).columns

In [42]:
categorical_df = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
visited = set()
for col1 in cat_cols:
  for col2 in cat_cols:
    if (col1 != col2) and (col2 not in visited):
      ctbl = pd.crosstab(df[col1],df[col2])
      l5 = np.any(ctbl<5)
      if not l5:
        chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
        crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
      else:
        p_value,avg_stat = permutation_test_independence_cat(ctbl)
        #stat, p_value = permutation_test(df, col1, col2)
        crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}
  visited.add(col1)

In [43]:
categorical_df['result'] = None
for i in range(len(categorical_df)):
  cv = categorical_df.loc[i,'effect']
  if cv < 0.1:
    categorical_df.loc[i,'result'] = 'Weak'
  elif cv >= 0.1 and cv < 0.3:
    categorical_df.loc[i,'result'] = 'Moderate'
  elif cv >= 0.3 and cv < 0.5:
    categorical_df.loc[i,'result'] = 'Strong'
  else:
    categorical_df.loc[i,'result'] = 'Very Strong'

In [51]:
c1 = categorical_df[categorical_df['result'] == 'Very Strong'].sort_values('effect',ascending=False)
c1[np.logical_and(~c1.var1.isin(['attack','is_attack']),~c1.var2.isin(['attack','is_attack']))].shape

(36, 9)

In [52]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

pearsondf = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      pearson_corr, pearson_p = pearsonr(df[col1].dropna(), df[col2].dropna())

      if pearson_p <= 0.05:
        status1 = 1

      pearsondf.loc[len(pearsondf)] = {'var1':col1,'var2':col2,
                            'var2type':df[col2].dtype,'test_type':'pearson',
                           'pvalue':pearson_p,
                           'status':status1,'stat':pearson_corr,'effect':pearson_corr}
  visited.add(col1)

pearsondf['result'] = None
for i in range(len(pearsondf)):
  scorr = pearsondf.loc[i,'stat']

  if scorr >= 0.7:
    pearsondf.loc[i,'result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    pearsondf.loc[i,'result'] = 'Moderate'
  else:
    pearsondf.loc[i,'result'] = 'Weak'

In [53]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

spearmandf = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      spearman_corr, spearman_p = spearmanr(df[col1].dropna(), df[col2].dropna())

      if spearman_p <= 0.05:
        status1 = 1

      spearmandf.loc[len(spearmandf)] = {'var1':col1,'var2':col2,
                            'var2type':df[col2].dtype,'test_type':'spearman',
                           'pvalue':spearman_p,
                           'status':status1,'stat':spearman_corr,'effect':spearman_corr}
  visited.add(col1)

spearmandf['result'] = None
for i in range(len(spearmandf)):
  scorr = spearmandf.loc[i,'stat']

  if scorr >= 0.7:
    spearmandf.loc[i,'result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    spearmandf.loc[i,'result'] = 'Moderate'
  else:
    spearmandf.loc[i,'result'] = 'Weak'

In [61]:
spearmandf[np.logical_and(spearmandf['status'] == 1,spearmandf['result'] == 'Strong')].sort_values('effect',ascending=False).head()

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
225,serrorrate,srvserrorrate,float64,spearman,0.0,1,0.973119,0.973119,Strong
258,rerrorrate,srvrerrorrate,float64,spearman,0.0,1,0.965777,0.965777,Strong
254,srvserrorrate,dsthostsrvserrorrate,float64,spearman,0.0,1,0.942332,0.942332,Strong
237,serrorrate,dsthostserrorrate,float64,spearman,0.0,1,0.935943,0.935943,Strong
238,serrorrate,dsthostsrvserrorrate,float64,spearman,0.0,1,0.921663,0.921663,Strong


In [58]:
pearsondf[np.logical_and(pearsondf['status'] == 1,pearsondf['result'] == 'Strong')]

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
102,numcompromised,numroot,int64,pearson,0.0,1,0.998833,0.998833,Strong
225,serrorrate,srvserrorrate,float64,pearson,0.0,1,0.993289,0.993289,Strong
237,serrorrate,dsthostserrorrate,float64,pearson,0.0,1,0.979373,0.979373,Strong
238,serrorrate,dsthostsrvserrorrate,float64,pearson,0.0,1,0.981139,0.981139,Strong
253,srvserrorrate,dsthostserrorrate,float64,pearson,0.0,1,0.977596,0.977596,Strong
254,srvserrorrate,dsthostsrvserrorrate,float64,pearson,0.0,1,0.986252,0.986252,Strong
258,rerrorrate,srvrerrorrate,float64,pearson,0.0,1,0.989008,0.989008,Strong
270,rerrorrate,dsthostrerrorrate,float64,pearson,0.0,1,0.926749,0.926749,Strong
271,rerrorrate,dsthostsrvrerrorrate,float64,pearson,0.0,1,0.964449,0.964449,Strong
284,srvrerrorrate,dsthostrerrorrate,float64,pearson,0.0,1,0.917822,0.917822,Strong
