In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind,chi2_contingency,levene,MonteCarloMethod,PermutationMethod,shapiro,f_oneway,kruskal,pearsonr,spearmanr

In [2]:
sns.set_style('darkgrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)  # Ensures that the display width isn't restricted
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('Network_anomaly_data.csv')
df['is_attack'] = df['attack'].apply(lambda x: 'normal' if x=='normal' else 'attack')
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,rootshell,suattempted,numroot,numfilecreations,numshells,numaccessfiles,numoutboundcmds,ishostlogin,isguestlogin,count,srvcount,serrorrate,srvserrorrate,rerrorrate,srvrerrorrate,samesrvrate,diffsrvrate,srvdiffhostrate,dsthostcount,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,attack,lastflag,is_attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,attack
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal


In [4]:
# land, wrongfragment,urgent, numfailedlogins,loggedin,rootshell,suattempted,numshells,ishostlogin,isguestlogin
df['land'] = df['land'].astype('category')
df['wrongfragment'] = df['wrongfragment'].astype('category')
df['urgent'] = df['urgent'].astype('category')
df['numfailedlogins'] = df['numfailedlogins'].astype('category')
df['loggedin'] = df['loggedin'].astype('category')
df['rootshell'] = df['rootshell'].astype('category')
df['suattempted'] = df['suattempted'].astype('category')
df['numshells'] = df['numshells'].astype('category')
df['ishostlogin'] = df['ishostlogin'].astype('category')
df['isguestlogin'] = df['isguestlogin'].astype('category')

In [5]:
tst_cols = df.drop(['attack','is_attack','numoutboundcmds'],axis=1).columns
tst_cols

Index(['duration', 'protocoltype', 'service', 'flag', 'srcbytes', 'dstbytes', 'land',
       'wrongfragment', 'urgent', 'hot', 'numfailedlogins', 'loggedin', 'numcompromised',
       'rootshell', 'suattempted', 'numroot', 'numfilecreations', 'numshells', 'numaccessfiles',
       'ishostlogin', 'isguestlogin', 'count', 'srvcount', 'serrorrate', 'srvserrorrate',
       'rerrorrate', 'srvrerrorrate', 'samesrvrate', 'diffsrvrate', 'srvdiffhostrate',
       'dsthostcount', 'dsthostsrvcount', 'dsthostsamesrvrate', 'dsthostdiffsrvrate',
       'dsthostsamesrcportrate', 'dsthostsrvdiffhostrate', 'dsthostserrorrate',
       'dsthostsrvserrorrate', 'dsthostrerrorrate', 'dsthostsrvrerrorrate', 'lastflag'],
      dtype='object')

In [6]:
def permutation_test_independence(obs_table, num_permutations=10000):
    """
    Perform a permutation test of independence on a contingency table.

    Parameters:
    - obs_table: 2D numpy array (contingency table).
    - num_permutations: Number of permutations to perform.

    Returns:
    - p_value: The p-value of the test.
    """
    # Compute the observed test statistic (e.g., Chi-squared statistic)
    observed_stat = chi2_contingency(obs_table)[0]

    # Perform permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffle rows (or columns) to simulate the null hypothesis of independence
        permuted_table = np.random.permutation(obs_table)
        permuted_stat = chi2_contingency(permuted_table)[0]
        permuted_stats.append(permuted_stat)

    # Compute the p-value as the proportion of permuted statistics greater than or equal to the observed statistic
    permuted_stats = np.array(permuted_stats)
    p_value = np.mean(permuted_stats >= observed_stat)

    return p_value

In [7]:
def permutation_test_independence2(obs_table, num_permutations=10000):
    """
    Perform a permutation test of independence on a contingency table.

    Parameters:
    - obs_table: 2D numpy array (contingency table).
    - num_permutations: Number of permutations to perform.

    Returns:
    - p_value: The p-value of the test.
    """
    # Compute the observed chi-squared statistic
    observed_stat, _, _, _ = chi2_contingency(obs_table)

    # Get the shape of the contingency table
    num_rows, num_cols = obs_table.shape

    # Perform permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffle the rows of the contingency table
        permuted_table = np.copy(obs_table)
        for i in range(num_cols):
            np.random.shuffle(permuted_table[:, i])  # Shuffle each column independently

        # Compute the chi-squared statistic for the permuted table
        try:
            permuted_stat, _, _, _ = chi2_contingency(permuted_table)
            permuted_stats.append(permuted_stat)
        except ValueError:
            # If Chi-squared cannot be computed (e.g., due to zero counts), skip this permutation
            continue

    # Convert the list of permuted statistics to a numpy array
    permuted_stats = np.array(permuted_stats)

    # Compute the p-value: proportion of permuted statistics >= observed statistic
    p_value = np.mean(permuted_stats >= observed_stat)

    return p_value


In [8]:
alpha = 0.05

binary_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    grp1 = df[df['is_attack'] == 'normal'][col]
    grp2 = df[df['is_attack'] != 'normal'][col]

    levene_stat, levene_p_value = levene(grp1, grp2)
    test_type = None
    equal_variance = True
    if levene_p_value > 0.05:
      test_type = 'Regular'
    else:
      test_type = 'Welch'
      equal_variance = False

    t_stat, pvalue = ttest_ind(grp1, grp2,equal_var=equal_variance)
    if pvalue <= 0.05:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':1,'effect':1}
    else:
      binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'numeric','levene_pval':levene_p_value,'test_type':test_type,'pvalue':pvalue,'status':0,'effect':0}

  else:
    ctbl = pd.crosstab(df['is_attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'effect':1}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'effect':0}
    else:
      p_value = permutation_test_independence2(ctbl)
      if p_value <= 0.05:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'effect':1}
      else:
        binary_test.loc[len(binary_test)] = {'var1':'is_attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'effect':0}

In [9]:
binary_test['pvalue'] = binary_test['pvalue'].apply(lambda x: f"{x:.10f}")
binary_test['levene_pval'] = binary_test['levene_pval'].apply(lambda x: f"{x:.10f}")

In [10]:
binary_test['test_type'].value_counts()

Unnamed: 0_level_0,count
test_type,Unnamed: 1_level_1
Welch,27
Chi_Sqr_Permutation,8
Chi_Sqr,5
Regular,1


In [11]:
binary_test['var2type'].value_counts()

Unnamed: 0_level_0,count
var2type,Unnamed: 1_level_1
numeric,28
categorical,13


In [12]:
binary_test['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,27
0,14


In [13]:
binary_test[binary_test['var2type'] == 'categorical']

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,effect
1,is_attack,protocoltype,categorical,,Chi_Sqr,0.0,0,1
2,is_attack,service,categorical,,Chi_Sqr_Permutation,0.1374,0,0
3,is_attack,flag,categorical,,Chi_Sqr_Permutation,0.0376,1,1
6,is_attack,land,categorical,,Chi_Sqr,0.0186882052,0,1
7,is_attack,wrongfragment,categorical,,Chi_Sqr_Permutation,0.2483,0,0
8,is_attack,urgent,categorical,,Chi_Sqr_Permutation,1.0,0,0
10,is_attack,numfailedlogins,categorical,,Chi_Sqr_Permutation,0.9119,0,0
11,is_attack,loggedin,categorical,,Chi_Sqr,0.0,0,1
13,is_attack,rootshell,categorical,,Chi_Sqr,0.0,0,1
14,is_attack,suattempted,categorical,,Chi_Sqr_Permutation,1.0,0,0


In [14]:
binary_test[binary_test['test_type'] == 'Regular']

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,effect
5,is_attack,dstbytes,numeric,0.1423293162,Regular,0.1439015781,0,0


In [15]:
binary_test[binary_test['status'] == 1].shape

(27, 8)

In [16]:
binary_test['pvalue'].unique()

array(['0.0000000000', '0.1374000000', '0.0376000000', '0.0498099770',
       '0.1439015781', '0.0186882052', '0.2483000000', '1.0000000000',
       '0.0000026338', '0.9119000000', '0.0001047923', '0.0000132306',
       '0.7570000000', '0.7891722826'], dtype=object)

In [17]:
binary_test.loc[21:,:].var2.values

array(['count', 'srvcount', 'serrorrate', 'srvserrorrate', 'rerrorrate',
       'srvrerrorrate', 'samesrvrate', 'diffsrvrate', 'srvdiffhostrate',
       'dsthostcount', 'dsthostsrvcount', 'dsthostsamesrvrate',
       'dsthostdiffsrvrate', 'dsthostsamesrcportrate',
       'dsthostsrvdiffhostrate', 'dsthostserrorrate',
       'dsthostsrvserrorrate', 'dsthostrerrorrate',
       'dsthostsrvrerrorrate', 'lastflag'], dtype=object)

In [18]:
def check_normality(df,cat_var,numeric_var):
  groups = df[cat_var].unique()
  normality_results = dict()

  for group in groups:
    data = df[df[cat_var] == group][numeric_var]
    if len(data) >= 5000:
      data = data.sample(5000)
    else:
      #print('with replacement')
      data = data.sample(5000,replace=True)
    stats,p_value = shapiro(data)
    normality_results[group] = p_value
  return normality_results

In [19]:
def check_normality_no_sample(df,cat_var,numeric_var):
  groups = df[cat_var].unique()
  normality_results = dict()

  for group in groups:
    data = df[df[cat_var] == group][numeric_var]
    if len(data) >= 5000:
      data = data.sample(200)
    else:
      data = data.sample(200,replace=True)
    stats,p_value = shapiro(data)
    normality_results[group] = p_value
  return normality_results

In [20]:
def check_homogenitity(df,cat_var,numeric_var):
  groups = [df[df[cat_var] == group][numeric_var] for group in df[cat_var].unique()]
  stat, p_value = levene(*groups)
  return p_value

In [21]:
alpha = 0.05

multi_test = pd.DataFrame(columns=['var1','var2','var2type','levene_pval','test_type','pvalue','status','effect'])

for col in tst_cols:
  col_type = df[col].dtype
  if col_type == 'int64' or col_type == 'float64':
    rdict = check_normality_no_sample(df,'attack',col)
    normality_exists=True
    equal_vars = True
    for value in rdict.values():
      if value < 0.05:
        normality_exists = False
        break
    levene_pval = check_homogenitity(df,'attack',col)
    if levene_pval < 0.05:
      equal_vars = False

    if normality_exists and equal_vars:
      stat, p_value = f_oneway(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'effect':effect}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'effect':effect}

    else:
      groups = [group[col].values for _, group in df.groupby('attack')]
      #stat, p_value = kruskal(*(df[df['attack'] == category][col] for category in df['attack'].unique()))
      stat, p_value = kruskal(*groups)
      print(stat)
      if p_value < 0.05:
        status,effect = 1,1
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'effect':effect}
      else:
        status,effect = 0,0
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'effect':effect}
  else:
    ctbl = pd.crosstab(df['attack'],df[col])
    l5 = np.any(ctbl<5)
    if not l5:
      chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
      if pvalue <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'effect':0}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'effect':1}
    else:
      p_value = permutation_test_independence2(ctbl)
      if pvalue <= 0.05:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'effect':0}
      else:
        multi_test.loc[len(multi_test)] = {'var1':'attack','var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'effect':1}


  res = hypotest_fun_out(*samples, **kwds)


14501.58833252334


  res = hypotest_fun_out(*samples, **kwds)


93779.2935865008


  res = hypotest_fun_out(*samples, **kwds)


78362.59978320696


  res = hypotest_fun_out(*samples, **kwds)


53593.30188105316


  res = hypotest_fun_out(*samples, **kwds)


80515.82931298067


  res = hypotest_fun_out(*samples, **kwds)


1670.6156685996118


  res = hypotest_fun_out(*samples, **kwds)


6636.533330101895


  res = hypotest_fun_out(*samples, **kwds)


2287.442557621949


  res = hypotest_fun_out(*samples, **kwds)


76048.43396581677


  res = hypotest_fun_out(*samples, **kwds)


19351.1984382552


  res = hypotest_fun_out(*samples, **kwds)


87451.95166227038


  res = hypotest_fun_out(*samples, **kwds)


89128.59171289275


  res = hypotest_fun_out(*samples, **kwds)


31101.48232256029


  res = hypotest_fun_out(*samples, **kwds)


30139.259116727164


  res = hypotest_fun_out(*samples, **kwds)


104818.13064825076


  res = hypotest_fun_out(*samples, **kwds)


91000.42428191256


  res = hypotest_fun_out(*samples, **kwds)


29854.946658898734


  res = hypotest_fun_out(*samples, **kwds)


49837.13277827315
69979.47736441917


  res = hypotest_fun_out(*samples, **kwds)


77486.96664560083


  res = hypotest_fun_out(*samples, **kwds)


64000.231121809375


  res = hypotest_fun_out(*samples, **kwds)


58569.14346350001


  res = hypotest_fun_out(*samples, **kwds)


45128.88888670074


  res = hypotest_fun_out(*samples, **kwds)


79912.63607554644


  res = hypotest_fun_out(*samples, **kwds)


83246.32879585706


  res = hypotest_fun_out(*samples, **kwds)


28862.880338502968


  res = hypotest_fun_out(*samples, **kwds)


28394.836673927275
47810.9409731384


In [22]:
multi_test['pvalue'].unique()

array([0.        , 0.98986058, 1.        , 0.99988575, 0.65628604,
       0.32286634, 0.8573    , 0.9139    , 0.92212952, 0.90175439,
       0.9462    , 0.8471223 , 0.9926    , 0.9684    ])

In [23]:
multi_test['test_type'].value_counts()

Unnamed: 0_level_0,count
test_type,Unnamed: 1_level_1
Kruskal,28
Chi_Sqr_Permutation,13


In [24]:
multi_test

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,effect
0,attack,duration,int64,,Kruskal,0.0,1,1
1,attack,protocoltype,categorical,,Chi_Sqr_Permutation,0.989861,0,0
2,attack,service,categorical,,Chi_Sqr_Permutation,1.0,0,0
3,attack,flag,categorical,,Chi_Sqr_Permutation,0.999886,0,0
4,attack,srcbytes,int64,,Kruskal,0.0,1,1
5,attack,dstbytes,int64,,Kruskal,0.0,1,1
6,attack,land,categorical,,Chi_Sqr_Permutation,0.656286,0,0
7,attack,wrongfragment,categorical,,Chi_Sqr_Permutation,0.322866,0,0
8,attack,urgent,categorical,,Chi_Sqr_Permutation,0.8573,0,0
9,attack,hot,int64,,Kruskal,0.0,1,1


In [25]:
cat_cols = df.select_dtypes(include=['object','category']).columns
cat_cols = cat_cols[0:-2]
cat_cols

Index(['protocoltype', 'service', 'flag', 'land', 'wrongfragment', 'urgent', 'numfailedlogins',
       'loggedin', 'rootshell', 'suattempted', 'numshells', 'ishostlogin', 'isguestlogin'],
      dtype='object')

In [26]:
multi_category = pd.DataFrame(columns=['var1','var2','var2type','normality','homogenity','test_type','pvalue','status','effect'])
chk_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include = ['int64','float64']).columns

for cat_var in cat_cols:
  for col in chk_cols:
    if col != cat_var:
      col_type = df[col].dtype
      if col_type != 'object':
        rdict = check_normality(df,cat_var,col)
        normality_exists=True
        equal_vars = True
        for value in rdict.values():
          if value < 0.05:
            normality_exists = False
            break
        levene_pval = check_homogenitity(df,cat_var,col)
        if levene_pval < 0.05:
          equal_vars = False

        if normality_exists and equal_vars:
              stat, p_value = f_oneway(*(df[df[cat_var] == category][col] for category in df[cat_var].unique()))
              if p_value < 0.05:
                status,effect = 1,1
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'effect':effect}
              else:
                status,effect = 0,0
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'ANOVA','pvalue':p_value,'status':status,'effect':effect}
        else:
              stat, p_value = kruskal(*(df[df[cat_var] == category][col] for category in df[cat_var].unique()))
              if p_value < 0.05:
                status,effect = 1,1
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'effect':effect}
              else:
                status,effect = 0,0
                multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':col_type,'normality':normality_exists,'homogenity':equal_vars,'test_type':'Kruskal','pvalue':p_value,'status':status,'effect':effect}
      else:
        ctbl = pd.crosstab(df[cat_var],df[col])
        l5 = np.any(ctbl<5)
        if not l5:
          chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
          if p_value <= 0.05:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'effect':0}
          else:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'effect':1}
        else:
          p_value = permutation_test_independence2(ctbl)
          if pvalue <= 0.05:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'effect':0}
          else:
            multi_category.loc[len(multi_category)] = {'var1':cat_var,'var2':col,'var2type':'categorical','levene_pval':np.nan,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'effect':1}


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hyp

In [27]:
multi_category.head()

Unnamed: 0,var1,var2,var2type,normality,homogenity,test_type,pvalue,status,effect
0,protocoltype,duration,int64,False,False,Kruskal,2.1682429999999997e-296,1,1
1,protocoltype,srcbytes,int64,False,True,Kruskal,0.0,1,1
2,protocoltype,dstbytes,int64,False,True,Kruskal,0.0,1,1
3,protocoltype,hot,int64,False,False,Kruskal,4.57683e-135,1,1
4,protocoltype,numcompromised,int64,False,True,Kruskal,1.072655e-64,1,1


In [28]:
multi_category['test_type'].value_counts()

Unnamed: 0_level_0,count
test_type,Unnamed: 1_level_1
Kruskal,364


In [47]:
multi_category['var2type'].unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [29]:
multi_category['pvalue'].nunique()

250

In [30]:
multi_test.shape

(41, 8)

In [31]:
multi_test

Unnamed: 0,var1,var2,var2type,levene_pval,test_type,pvalue,status,effect
0,attack,duration,int64,,Kruskal,0.0,1,1
1,attack,protocoltype,categorical,,Chi_Sqr_Permutation,0.989861,0,0
2,attack,service,categorical,,Chi_Sqr_Permutation,1.0,0,0
3,attack,flag,categorical,,Chi_Sqr_Permutation,0.999886,0,0
4,attack,srcbytes,int64,,Kruskal,0.0,1,1
5,attack,dstbytes,int64,,Kruskal,0.0,1,1
6,attack,land,categorical,,Chi_Sqr_Permutation,0.656286,0,0
7,attack,wrongfragment,categorical,,Chi_Sqr_Permutation,0.322866,0,0
8,attack,urgent,categorical,,Chi_Sqr_Permutation,0.8573,0,0
9,attack,hot,int64,,Kruskal,0.0,1,1


In [32]:
categorical_df = pd.DataFrame(columns=['var1','var2','test_type','pvalue','status','effect'])
visited = set()
for col1 in cat_cols:
  for col2 in cat_cols:
    if (col1 != col2) and (col2 not in visited):
      ctbl = pd.crosstab(df[col1],df[col2])
      l5 = np.any(ctbl<5)
      if not l5:
        chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr','pvalue':p_value,'status':0,'effect':0}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr','pvalue':p_value,'status':1,'effect':1}
      else:
        p_value = permutation_test_independence2(ctbl)
        #stat, p_value = permutation_test(df, col1, col2)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':0,'effect':0}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'status':1,'effect':1}
  visited.add(col1)

In [33]:
categorical_df.shape

(78, 6)

In [34]:
categorical_df['pvalue'].unique()

array([0.8961, 1.    , 0.4416, 0.8781, 0.9956, 0.9945, 0.9969, 0.9556,
       0.9826, 0.9733, 0.9953, 0.8693, 0.6531, 0.9016, 0.998 , 0.9998,
       0.9966, 0.8478, 0.9651, 0.9767, 0.9994, 0.9987, 0.9634, 0.8727,
       0.9977, 0.981 , 0.889 , 0.7852, 0.891 , 0.9742, 0.9996, 0.9145,
       0.9944, 0.9988, 0.9387, 0.4974, 0.1252, 0.2516, 0.494 , 0.    ,
       0.5011, 0.7479, 0.9364, 0.8349])

In [35]:
categorical_df.columns

Index(['var1', 'var2', 'test_type', 'pvalue', 'status', 'effect'], dtype='object')

In [36]:
df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns

Index(['duration', 'srcbytes', 'dstbytes', 'hot', 'numcompromised', 'numroot', 'numfilecreations',
       'numaccessfiles', 'count', 'srvcount', 'serrorrate', 'srvserrorrate', 'rerrorrate',
       'srvrerrorrate', 'samesrvrate', 'diffsrvrate', 'srvdiffhostrate', 'dsthostcount',
       'dsthostsrvcount', 'dsthostsamesrvrate', 'dsthostdiffsrvrate', 'dsthostsamesrcportrate',
       'dsthostsrvdiffhostrate', 'dsthostserrorrate', 'dsthostsrvserrorrate', 'dsthostrerrorrate',
       'dsthostsrvrerrorrate', 'lastflag'],
      dtype='object')

In [37]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

ndf = pd.DataFrame(columns=['var1','var2','pearson_corr','spearman_corr','pvalue_pearson','pvalue_spearman','status_pearson','status_spearman'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      pearson_corr, pearson_p = pearsonr(df[col1].dropna(), df[col2].dropna())
      spearman_corr, spearman_p = spearmanr(df[col1].dropna(), df[col2].dropna())

      if pearson_p <= 0.05:
        status1 = 1

      if spearman_p <= 0.05:
        status2 = 1

      ndf.loc[len(ndf)] = {'var1':col1,'var2':col2,
                           'pearson_corr':pearson_corr,'spearman_corr':spearman_corr,
                           'pvalue_pearson':pearson_p,'pvalue_spearman':spearman_p,
                           'status_pearson':status1,'status_spearman':status2}
  visited.add(col1)


In [38]:
ndf.shape

(378, 8)

In [39]:
ndf[np.logical_and(ndf['status_pearson'] == 1, ndf['status_spearman'] == 1)].shape

(294, 8)

In [40]:
ndf[np.logical_and(ndf['status_pearson'] == 0, ndf['status_spearman'] == 0)].shape

(9, 8)

In [41]:
ndf[np.logical_and(ndf['status_pearson'] == 0, ndf['status_spearman'] == 1)].shape

(72, 8)

In [42]:
ndf[np.logical_and(ndf['status_pearson'] == 1, ndf['status_spearman'] == 0)].shape

(3, 8)

In [43]:
significant_results = ndf[(ndf['pvalue_pearson'] < 0.05) | (ndf['pvalue_spearman'] < 0.05)].copy()

significant_results.reset_index(inplace=True)
significant_results['pearson_result'] = None
significant_results['spearman_result'] = None

for i in range(len(significant_results)):
  scorr = significant_results.loc[i,'spearman_corr']
  pcorr = significant_results.loc[i,'pearson_corr']

  if scorr >= 0.7:
    significant_results.loc[i,'spearman_result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    significant_results.loc[i,'spearman_result'] = 'Moderate'
  else:
    significant_results.loc[i,'spearman_result'] = 'Weak'


  if pcorr >= 0.7:
    significant_results.loc[i,'pearson_result'] = 'Strong'
  elif pcorr >= 0.4 and pcorr < 0.7:
    significant_results.loc[i,'pearson_result'] = 'Moderate'
  else:
    significant_results.loc[i,'pearson_result'] = 'Weak'

In [44]:
significant_results['pearson_result'].value_counts()

Unnamed: 0_level_0,count
pearson_result,Unnamed: 1_level_1
Weak,342
Strong,16
Moderate,11


In [45]:
significant_results['spearman_result'].value_counts()

Unnamed: 0_level_0,count
spearman_result,Unnamed: 1_level_1
Weak,318
Moderate,34
Strong,17


In [46]:
# import itertools
# categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# results_categorical_categorical = []

# # Categorical vs Categorical: Chi-Squared test
# for col1, col2 in itertools.combinations(categorical_cols, 2):
#     contingency_table = pd.crosstab(df[col1], df[col2])
#     if (contingency_table < 5).values.any():
#         # Permutation Chi-Square if counts < 5
#         observed_stat = chi2_contingency(contingency_table, correction=False)[0]
#         permutations = []
#         for _ in range(1000):
#             shuffled = df[col2].sample(frac=1).reset_index(drop=True)
#             permuted_table = pd.crosstab(df[col1], shuffled)
#             stat, _ = chi2_contingency(permuted_table, correction=False)[:2]
#             permutations.append(stat)
#         perm_p = np.mean([1 if stat >= observed_stat else 0 for stat in permutations])
#         chi_p = perm_p
#     else:
#         # Regular Chi-Square Test
#         chi_p = chi2_contingency(contingency_table, correction=False)[1]

#     results_categorical_categorical.append({
#         'Field1': col1,
#         'Field2': col2,
#         'Test': 'Categorical-Categorical',
#         'Chi_Square_p': chi_p,
#         'Important': chi_p < 0.05
#     })

# # Convert results to DataFrames
# df_categorical_categorical = pd.DataFrame(results_categorical_categorical)
# print(df_categorical_categorical)