In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind,chi2_contingency,levene,MonteCarloMethod,PermutationMethod,shapiro,f_oneway,kruskal,pearsonr,spearmanr

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
sns.set_style('darkgrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)  # Ensures that the display width isn't restricted
pd.set_option('display.max_rows', None)

In [4]:
df = pd.read_csv('Network_anomaly_data.csv')
df['is_attack'] = df['attack'].apply(lambda x: 'normal' if x=='normal' else 'attack')
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,rootshell,suattempted,numroot,numfilecreations,numshells,numaccessfiles,numoutboundcmds,ishostlogin,isguestlogin,count,srvcount,serrorrate,srvserrorrate,rerrorrate,srvrerrorrate,samesrvrate,diffsrvrate,srvdiffhostrate,dsthostcount,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,attack,lastflag,is_attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,attack
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal


In [5]:
# land, wrongfragment,urgent, numfailedlogins,loggedin,rootshell,suattempted,numshells,ishostlogin,isguestlogin
df['land'] = df['land'].astype('category')
df['wrongfragment'] = df['wrongfragment'].astype('category')
df['urgent'] = df['urgent'].astype('category')
df['numfailedlogins'] = df['numfailedlogins'].astype('category')
df['loggedin'] = df['loggedin'].astype('category')
df['rootshell'] = df['rootshell'].astype('category')
df['suattempted'] = df['suattempted'].astype('category')
df['numshells'] = df['numshells'].astype('category')
df['ishostlogin'] = df['ishostlogin'].astype('category')
df['isguestlogin'] = df['isguestlogin'].astype('category')

In [6]:
def permutation_test_independence_cat(obs_table, num_permutations=10000):
    """
    Perform a permutation test of independence on a contingency table.

    Parameters:
    - obs_table: 2D numpy array (contingency table).
    - num_permutations: Number of permutations to perform.

    Returns:
    - p_value: The p-value of the test.
    """
    # Compute the observed chi-squared statistic
    observed_stat, _, _, _ = chi2_contingency(obs_table)

    # Get the shape of the contingency table
    num_rows, num_cols = obs_table.shape

    # Perform permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffle the rows of the contingency table
        permuted_table = np.copy(obs_table)
        for i in range(num_cols):
            np.random.shuffle(permuted_table[:, i])  # Shuffle each column independently

        # Compute the chi-squared statistic for the permuted table
        try:
            permuted_stat, _, _, _ = chi2_contingency(permuted_table)
            permuted_stats.append(permuted_stat)
        except ValueError:
            # If Chi-squared cannot be computed (e.g., due to zero counts), skip this permutation
            continue

    # Convert the list of permuted statistics to a numpy array
    permuted_stats = np.array(permuted_stats)

    # Compute the p-value: proportion of permuted statistics >= observed statistic
    p_value = np.mean(permuted_stats >= observed_stat)
    avg_stat = np.mean(permuted_stats)

    return p_value,avg_stat


In [7]:
def permutation_test_independence2(obs_table, num_permutations=10000):
    """
    Perform a permutation test of independence on a contingency table.

    Parameters:
    - obs_table: 2D numpy array (contingency table).
    - num_permutations: Number of permutations to perform.

    Returns:
    - p_value: The p-value of the test.
    """
    # Compute the observed chi-squared statistic
    observed_stat, _, _, _ = chi2_contingency(obs_table)

    # Get the shape of the contingency table
    num_rows, num_cols = obs_table.shape

    # Perform permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffle the rows of the contingency table
        permuted_table = np.copy(obs_table)
        for i in range(num_cols):
            np.random.shuffle(permuted_table[:, i])  # Shuffle each column independently

        # Compute the chi-squared statistic for the permuted table
        try:
            permuted_stat, _, _, _ = chi2_contingency(permuted_table)
            permuted_stats.append(permuted_stat)
        except ValueError:
            # If Chi-squared cannot be computed (e.g., due to zero counts), skip this permutation
            continue

    # Convert the list of permuted statistics to a numpy array
    permuted_stats = np.array(permuted_stats)

    # Compute the p-value: proportion of permuted statistics >= observed statistic
    p_value = np.mean(permuted_stats >= observed_stat)

    return p_value


In [8]:
def epsilon_squared_func(H_statistic,N,k):
  epsilon_squared = (H_statistic - k + 1) / (N - k)
  return epsilon_squared

In [9]:
def check_normality_no_sample(df,cat_var,numeric_var):
  groups = df[cat_var].unique()
  normality_results = dict()

  for group in groups:
    data = df[df[cat_var] == group][numeric_var]
    if len(data) >= 5000:
      data = data.sample(200)
    else:
      data = data.sample(200,replace=True)
    stats,p_value = shapiro(data)
    normality_results[group] = p_value
  return normality_results

In [10]:
def check_homogenitity(df,cat_var,numeric_var):
  groups = [df[df[cat_var] == group][numeric_var] for group in df[cat_var].unique()]
  stat, p_value = levene(*groups)
  return p_value

In [11]:
def cramers_v(chi2, n, contingency_table):
    phi2 = chi2 / n
    r, k = contingency_table.shape
    phi2corr = max(0, phi2 - (k-1)*(r-1) / (n-1))
    k_corr = min(k-1, r-1)
    return np.sqrt(phi2corr / k_corr)

In [12]:
alpha = 0.05
tst_cols = df.drop(['attack','is_attack','numoutboundcmds'],axis=1).columns
binary_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','stat','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    grp1 = df[df['is_attack'] == 'normal'][col]
    grp2 = df[df['is_attack'] != 'normal'][col]

    levene_stat, levene_p_value = levene(grp1, grp2)
    test_type = None
    equal_variance = True
    if levene_p_value > 0.05:
      test_type = 'Regular'
    else:
      test_type = 'Welch'
      equal_variance = False

    t_stat, pvalue = ttest_ind(grp1, grp2,equal_var=equal_variance)
    epsilon = epsilon_squared_func(t_stat,df.shape[0],df['attack'].nunique())
    if pvalue <= 0.05:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':1,'stat':t_stat,'effect':epsilon}
    else:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':0,'stat':t_stat,'effect':epsilon}

  else:
    ctbl = pd.crosstab(df['is_attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
    else:
      p_value,avg_stat = permutation_test_independence_cat(ctbl)
      crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}

In [13]:
alpha = 0.05

multi_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','stat','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    rdict = check_normality_no_sample(df,'attack',col)
    normality_exists=True
    equal_vars = True
    for value in rdict.values():
      if value < 0.05:
        normality_exists = False
        break
    levene_pval = check_homogenitity(df,'attack',col)
    if levene_pval < 0.05:
      equal_vars = False

    if normality_exists and equal_vars:
      stat, p_value = f_oneway(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'stat':stat,'effect':effect}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'stat':stat,'effect':effect}

    else:
      groups = [group[col].values for _, group in df.groupby('attack')]
      #stat, p_value = kruskal(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      stat, p_value = kruskal(*groups)
      epsilon = epsilon_squared_func(stat,df.shape[0],df['attack'].nunique())
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'stat':stat,'effect':epsilon}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'stat':stat,'effect':epsilon}
  else:
    ctbl = pd.crosstab(df['attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
    else:
      p_value,avg_stat = permutation_test_independence_cat(ctbl)
      crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
      if p_value <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}


In [14]:
cat_cols = df.select_dtypes(include=['object','category']).columns
cat_cols = cat_cols[0:-2]

multi_category = pd.DataFrame(columns=['var1','var2','var2type','normality','homogenity','test_type','pvalue','status','stat','effect'])
chk_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include = ['int64','float64']).columns

for cat_var in cat_cols:
  for col in chk_cols:
    if col != cat_var:
      col_type = df[col].dtype
      if col_type != 'object':
        rdict = check_normality_no_sample(df,cat_var,col)
        normality_exists=True
        equal_vars = True
        for value in rdict.values():
          if value < 0.05:
            normality_exists = False
            break
        levene_pval = check_homogenitity(df,cat_var,col)
        if levene_pval < 0.05:
          equal_vars = False

        if normality_exists and equal_vars:
              stat, p_value = f_oneway(*(df[df[cat_var] == category][col] for category in df[cat_var].unique()))
              epsilon = epsilon_squared_func(stat,df.shape[0],df['attack'].nunique())
              if p_value < 0.05:
                status,effect = 1,1
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':1,'stat':stat,'effect':epsilon}
              else:
                status,effect = 0,0
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':0,'stat':stat,'effect':epsilon}
        else:
              stat, p_value = kruskal(*(df[df[cat_var] == category][col] for category in df[cat_var].unique()))
              epsilon = epsilon_squared_func(stat,df.shape[0],df['attack'].nunique())
              if p_value < 0.05:
                status,effect = 1,1
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':1,'stat':stat,'effect':epsilon}
              else:
                status,effect = 0,0
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':0,'stat':stat,'effect':epsilon}
      else:
        ctbl = pd.crosstab(df[cat_var],df[col])
        l5 = np.any(ctbl<5)
        if not l5:
          chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
          crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
          if p_value <= 0.05:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':stat,'effect':crammers_v}
          else:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':stat,'effect':crammers_v}
        else:
          #p_value = permutation_test_independence2(ctbl)
          p_value,avg_stat = permutation_test_independence_cat(ctbl)
          crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
          if p_value <= 0.05:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
          else:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}


In [15]:
cat_cols = df.select_dtypes(include=['object','category']).columns

In [16]:
categorical_df = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
visited = set()
for col1 in cat_cols:
  for col2 in cat_cols:
    if (col1 != col2) and (col2 not in visited):
      ctbl = pd.crosstab(df[col1],df[col2])
      l5 = np.any(ctbl<5)
      if not l5:
        chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
        crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr','pvalue':p_value,'status':1,'stat':chi_stat,'effect':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr','pvalue':p_value,'status':0,'stat':chi_stat,'effect':crammers_v}
      else:
        p_value,avg_stat = permutation_test_independence_cat(ctbl)
        #stat, p_value = permutation_test(df, col1, col2)
        crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'stat':avg_stat,'effect':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'var2type':'categorical','test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'stat':avg_stat,'effect':crammers_v}
  visited.add(col1)

In [17]:
categorical_df['result'] = None
for i in range(len(categorical_df)):
  cv = categorical_df.loc[i,'effect']
  if cv < 0.1:
    categorical_df.loc[i,'result'] = 'Weak'
  elif cv >= 0.1 and cv < 0.3:
    categorical_df.loc[i,'result'] = 'Moderate'
  elif cv >= 0.3 and cv < 0.5:
    categorical_df.loc[i,'result'] = 'Strong'
  else:
    categorical_df.loc[i,'result'] = 'Very Strong'

In [18]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

pearsondf = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      pearson_corr, pearson_p = pearsonr(df[col1].dropna(), df[col2].dropna())

      if pearson_p <= 0.05:
        status1 = 1

      pearsondf.loc[len(pearsondf)] = {'var1':col1,'var2':col2,
                            'var2type':df[col2].dtype,'test_type':'pearson',
                           'pvalue':pearson_p,
                           'status':status1,'stat':pearson_corr,'effect':pearson_corr}
  visited.add(col1)

pearsondf['result'] = None
for i in range(len(pearsondf)):
  scorr = pearsondf.loc[i,'stat']

  if scorr >= 0.7:
    pearsondf.loc[i,'result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    pearsondf.loc[i,'result'] = 'Moderate'
  else:
    pearsondf.loc[i,'result'] = 'Weak'

In [19]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

spearmandf = pd.DataFrame(columns=['var1','var2','var2type','test_type','pvalue','status','stat','effect'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      spearman_corr, spearman_p = spearmanr(df[col1].dropna(), df[col2].dropna())

      if spearman_p <= 0.05:
        status1 = 1

      spearmandf.loc[len(spearmandf)] = {'var1':col1,'var2':col2,
                            'var2type':df[col2].dtype,'test_type':'spearman',
                           'pvalue':spearman_p,
                           'status':status1,'stat':spearman_corr,'effect':spearman_corr}
  visited.add(col1)

spearmandf['result'] = None
for i in range(len(spearmandf)):
  scorr = spearmandf.loc[i,'stat']

  if scorr >= 0.7:
    spearmandf.loc[i,'result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    spearmandf.loc[i,'result'] = 'Moderate'
  else:
    spearmandf.loc[i,'result'] = 'Weak'

In [20]:
spearmandf.head()

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
0,duration,srcbytes,int64,spearman,0.0,1,0.226289,0.226289,Weak
1,duration,dstbytes,int64,spearman,0.0,1,0.148983,0.148983,Weak
2,duration,hot,int64,spearman,0.0,1,0.229319,0.229319,Weak
3,duration,numcompromised,int64,spearman,8.213932e-180,1,0.08043,0.08043,Weak
4,duration,numroot,int64,spearman,2.257554e-50,1,0.042033,0.042033,Weak


In [21]:
binary_test.head()

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,stat,effect
0,is_attack,duration,numeric,3.02785e-67,Welch,1.171722e-60,1,-16.445123,-0.000305
1,is_attack,protocoltype,categorical,,Chi_Sqr,0.0,1,10029.248628,0.282132
2,is_attack,service,categorical,,Chi_Sqr_Permutation,0.1351,0,88362.295378,0.837192
3,is_attack,flag,categorical,,Chi_Sqr_Permutation,0.0398,1,52168.761539,0.643465
4,is_attack,srcbytes,numeric,0.03527225,Welch,0.04980998,1,-1.961633,-0.00019


In [22]:
multi_test.head()

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,stat,effect
0,attack,duration,int64,,Kruskal,0.0,1,14501.59,0.114963
1,attack,protocoltype,categorical,,Chi_Sqr_Permutation,0.991515,0,210283.6,0.913489
2,attack,service,categorical,,Chi_Sqr_Permutation,1.0,0,1718950.0,0.787208
3,attack,flag,categorical,,Chi_Sqr_Permutation,1.0,0,614839.4,0.698497
4,attack,srcbytes,int64,,Kruskal,0.0,1,93779.29,0.744401


In [23]:
multi_category.head()

Unnamed: 0,var1,var2,var2type,normality,homogenity,test_type,pvalue,status,stat,effect
0,protocoltype,duration,int64,False,False,Kruskal,2.1682429999999997e-296,1,1361.582541,0.010636
1,protocoltype,srcbytes,int64,False,True,Kruskal,0.0,1,1499.640942,0.011732
2,protocoltype,dstbytes,int64,False,True,Kruskal,0.0,1,6698.263234,0.053007
3,protocoltype,hot,int64,False,False,Kruskal,4.57683e-135,1,618.655962,0.004737
4,protocoltype,numcompromised,int64,False,True,Kruskal,1.072655e-64,1,294.590618,0.002164


In [24]:
categorical_df.head()

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
0,protocoltype,service,categorical,Chi_Sqr_Permutation,0.8958,0,223434.05333,0.941427,Very Strong
1,protocoltype,flag,categorical,Chi_Sqr_Permutation,1.0,0,100788.435389,0.632424,Very Strong
2,protocoltype,land,categorical,Chi_Sqr_Permutation,1.0,0,181.57665,0.037756,Weak
3,protocoltype,wrongfragment,categorical,Chi_Sqr_Permutation,0.4421,0,7474.417047,0.172194,Moderate
4,protocoltype,urgent,categorical,Chi_Sqr_Permutation,1.0,0,65.219433,0.015331,Weak


In [34]:
binary_test[binary_test['status'] == 1]['var2'].values

array(['duration', 'protocoltype', 'flag', 'srcbytes', 'land', 'hot',
       'loggedin', 'numcompromised', 'rootshell', 'numroot',
       'numfilecreations', 'numaccessfiles', 'isguestlogin', 'count',
       'serrorrate', 'srvserrorrate', 'rerrorrate', 'srvrerrorrate',
       'samesrvrate', 'diffsrvrate', 'srvdiffhostrate', 'dsthostcount',
       'dsthostsrvcount', 'dsthostsamesrvrate', 'dsthostdiffsrvrate',
       'dsthostsamesrcportrate', 'dsthostsrvdiffhostrate',
       'dsthostserrorrate', 'dsthostsrvserrorrate', 'dsthostrerrorrate',
       'dsthostsrvrerrorrate', 'lastflag'], dtype=object)

In [35]:
binary_test[binary_test['status'] == 0]['var2'].values

array(['service', 'dstbytes', 'wrongfragment', 'urgent',
       'numfailedlogins', 'suattempted', 'numshells', 'ishostlogin',
       'srvcount'], dtype=object)

In [36]:
binary_test[binary_test['status'] == 1].sort_values('effect',ascending=False).head()

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,stat,effect
11,is_attack,loggedin,categorical,,Chi_Sqr,0.0,1,60002.601329,0.690149
3,is_attack,flag,categorical,,Chi_Sqr_Permutation,0.0398,1,52168.761539,0.643465
1,is_attack,protocoltype,categorical,,Chi_Sqr,0.0,1,10029.248628,0.282132
20,is_attack,isguestlogin,categorical,,Chi_Sqr,5.360231e-44,1,193.542192,0.039095
13,is_attack,rootshell,categorical,,Chi_Sqr,1.058543e-12,1,50.732461,0.019869


In [25]:
b1 = binary_test.drop('levene_pval',axis=1)
m1 = multi_test.drop('levene_pval',axis=1)
m2 = multi_category.drop(['normality','homogenity'],axis=1)

In [26]:
for i in range(len(b1)):
  ttype = b1.loc[i,'test_type']
  cv = b1.loc[i,'effect']

  if ttype == 'Chi_Sqr_Permutation' or ttype == 'Chi_Sqr':
    if cv < 0.1:
      b1.loc[i,'result'] = 'Weak'
    elif cv >= 0.1 and cv < 0.3:
      b1.loc[i,'result'] = 'Moderate'
    elif cv >= 0.3 and cv < 0.5:
      b1.loc[i,'result'] = 'Strong'
    else:
      b1.loc[i,'result'] = 'Very Strong'
  else:
    if cv < 0.01:
      b1.loc[i,'result'] = 'Small'
    elif cv >= 0.01 and cv < 0.14:
      b1.loc[i,'result'] = 'Medium'
    else:
      b1.loc[i,'result'] = 'Large'

In [27]:
for i in range(len(m1)):
  ttype = m1.loc[i,'test_type']
  cv = m1.loc[i,'effect']

  if ttype == 'Chi_Sqr_Permutation' or ttype == 'Chi_Sqr':
    if cv < 0.1:
      m1.loc[i,'result'] = 'Weak'
    elif cv >= 0.1 and cv < 0.3:
      m1.loc[i,'result'] = 'Moderate'
    elif cv >= 0.3 and cv < 0.5:
      m1.loc[i,'result'] = 'Strong'
    else:
      m1.loc[i,'result'] = 'Very Strong'
  else:
    if cv < 0.01:
      m1.loc[i,'result'] = 'Small'
    elif cv >= 0.01 and cv < 0.14:
      m1.loc[i,'result'] = 'Medium'
    else:
      m1.loc[i,'result'] = 'Large'

In [28]:
for i in range(len(m2)):
  ttype = m2.loc[i,'test_type']
  cv = m2.loc[i,'effect']

  if ttype == 'Chi_Sqr_Permutation' or ttype == 'Chi_Sqr':
    if cv < 0.1:
      m2.loc[i,'result'] = 'Weak'
    elif cv >= 0.1 and cv < 0.3:
      m2.loc[i,'result'] = 'Moderate'
    elif cv >= 0.3 and cv < 0.5:
      m2.loc[i,'result'] = 'Strong'
    else:
      m2.loc[i,'result'] = 'Very Strong'
  else:
    if cv < 0.01:
      m2.loc[i,'result'] = 'Small'
    elif cv >= 0.01 and cv < 0.14:
      m2.loc[i,'result'] = 'Medium'
    else:
      m2.loc[i,'result'] = 'Large'

In [29]:
final = pd.concat([m1,m2,categorical_df,pearsondf,spearmandf],axis=0)
final.reset_index(inplace=True)
final.drop('index',axis=1,inplace=True)
final.head()

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
0,attack,duration,int64,Kruskal,0.0,1,14501.59,0.114963,Medium
1,attack,protocoltype,categorical,Chi_Sqr_Permutation,0.991515,0,210283.6,0.913489,Very Strong
2,attack,service,categorical,Chi_Sqr_Permutation,1.0,0,1718950.0,0.787208,Very Strong
3,attack,flag,categorical,Chi_Sqr_Permutation,1.0,0,614839.4,0.698497,Very Strong
4,attack,srcbytes,int64,Kruskal,0.0,1,93779.29,0.744401,Large


In [30]:
final.sort_values(['var1','test_type'],ascending=True,inplace=True)

In [31]:
final.head()

Unnamed: 0,var1,var2,var2type,test_type,pvalue,status,stat,effect,result
1,attack,protocoltype,categorical,Chi_Sqr_Permutation,0.991515,0,210283.6,0.913489,Very Strong
2,attack,service,categorical,Chi_Sqr_Permutation,1.0,0,1718950.0,0.787208,Very Strong
3,attack,flag,categorical,Chi_Sqr_Permutation,1.0,0,614839.4,0.698497,Very Strong
6,attack,land,categorical,Chi_Sqr_Permutation,0.662831,0,81474.34,0.804106,Very Strong
7,attack,wrongfragment,categorical,Chi_Sqr_Permutation,0.315874,0,189143.4,0.866346,Very Strong


In [32]:
final.to_csv('NAD_hyp_results_no_duplicates.csv')

In [33]:
#b1.to_csv('NAD_attack_binary.csv')