In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind,chi2_contingency,levene,MonteCarloMethod,PermutationMethod,shapiro,f_oneway,kruskal,fisher_exact

In [2]:
# Notebook-wide presentation settings: seaborn grid theme plus unrestricted
# pandas display limits (None removes the cap on columns, width and rows).
sns.set_style('darkgrid')
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
  pd.set_option(display_option, None)

In [3]:
# Load the raw records and derive a two-level target: rows whose `attack`
# label is exactly 'normal' keep that label, every other label becomes 'attack'.
df = pd.read_csv('Network_anomaly_data.csv')
attack_label = df['attack']
df['is_attack'] = attack_label.where(attack_label == 'normal', 'attack')
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,rootshell,suattempted,numroot,numfilecreations,numshells,numaccessfiles,numoutboundcmds,ishostlogin,isguestlogin,count,srvcount,serrorrate,srvserrorrate,rerrorrate,srvrerrorrate,samesrvrate,diffsrvrate,srvdiffhostrate,dsthostcount,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,attack,lastflag,is_attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,attack
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal


In [4]:
# These columns are stored as numbers but are treated as categorical in this
# notebook, so they are cast to `category` and skipped by the numeric tests.
flag_like_cols = (
  'land', 'wrongfragment', 'urgent', 'numfailedlogins', 'loggedin',
  'rootshell', 'suattempted', 'numshells', 'ishostlogin', 'isguestlogin',
)
for flag_col in flag_like_cols:
  df[flag_col] = df[flag_col].astype('category')

In [5]:
# Candidate feature columns: everything except the two target columns and
# `numoutboundcmds`, which is excluded from testing.
excluded_cols = ['attack', 'is_attack', 'numoutboundcmds']
tst_cols = df.columns.drop(excluded_cols)
tst_cols

Index(['duration', 'protocoltype', 'service', 'flag', 'srcbytes', 'dstbytes', 'land',
       'wrongfragment', 'urgent', 'hot', 'numfailedlogins', 'loggedin', 'numcompromised',
       'rootshell', 'suattempted', 'numroot', 'numfilecreations', 'numshells', 'numaccessfiles',
       'ishostlogin', 'isguestlogin', 'count', 'srvcount', 'serrorrate', 'srvserrorrate',
       'rerrorrate', 'srvrerrorrate', 'samesrvrate', 'diffsrvrate', 'srvdiffhostrate',
       'dsthostcount', 'dsthostsrvcount', 'dsthostsamesrvrate', 'dsthostdiffsrvrate',
       'dsthostsamesrcportrate', 'dsthostsrvdiffhostrate', 'dsthostserrorrate',
       'dsthostsrvserrorrate', 'dsthostrerrorrate', 'dsthostsrvrerrorrate', 'lastflag'],
      dtype='object')

In [20]:
def check_normality_no_sample(df,cat_var,numeric_var,sample_size=200,random_state=0):
  """Shapiro-Wilk normality p-value of ``numeric_var`` within each level of ``cat_var``.

  Groups larger than ``sample_size`` are down-sampled WITHOUT replacement so the
  test is not overpowered by a very large n. Smaller groups are tested on their
  actual observations; they are never bootstrapped up to ``sample_size``,
  because duplicated values are not new evidence and distort the test.

  Parameters
  ----------
  df : pandas.DataFrame
      Data containing both columns.
  cat_var : str
      Name of the grouping column.
  numeric_var : str
      Name of the numeric column to test.
  sample_size : int, default 200
      Maximum number of observations passed to the test per group (must be >= 3).
  random_state : int or None, default 0
      Seed for the down-sampling so repeated runs give identical p-values.
      Pass ``None`` for a fresh random sample on every call.

  Returns
  -------
  dict
      Maps each non-null group label (in order of first appearance) to its
      p-value. Groups with fewer than 3 non-null observations map to NaN,
      since the test cannot be computed for them.

  Raises
  ------
  ValueError
      If ``sample_size`` is smaller than 3.
  """
  min_obs = 3  # Shapiro-Wilk needs at least three observations
  if sample_size < min_obs:
    raise ValueError(f"sample_size must be at least {min_obs}")

  labels = df[cat_var]
  normality_results = dict()

  # dropna(): a NaN label never equals itself, so it would yield an empty group.
  for group in labels.dropna().unique():
    data = df.loc[labels == group, numeric_var].dropna()
    if len(data) < min_obs:
      normality_results[group] = float('nan')
      continue
    if len(data) > sample_size:
      data = data.sample(sample_size, random_state=random_state)
    normality_results[group] = shapiro(data).pvalue
  return normality_results

In [21]:
def check_homogenitity(df,cat_var,numeric_var):
  """Return the p-value of Levene's equal-variance test across the levels of ``cat_var``.

  One sample of ``numeric_var`` is built per distinct value of ``cat_var``
  (in order of first appearance) and all samples are passed to ``levene``.
  """
  labels = df[cat_var]
  samples = []
  for level in labels.unique():
    samples.append(df[labels == level][numeric_var])
  return levene(*samples).pvalue

In [40]:
# Two-group comparison (normal vs. attack) for every numeric feature.
# Levene's test picks the t-test flavour: pooled-variance Student's t when the
# variances look equal, Welch's t otherwise.
alpha = 0.05  # single significance level for both Levene's test and the t-test

is_normal = df['is_attack'] == 'normal'
# 'number' selects every integer/float dtype (not only int64/float64) and
# leaves out the object and category columns.
numeric_test_cols = df[tst_cols].select_dtypes(include='number').columns

binary_records = []
for col in numeric_test_cols:
  grp1 = df.loc[is_normal, col]
  grp2 = df.loc[~is_normal, col]

  # Equal variances are assumed unless Levene's test rejects at `alpha`.
  levene_p_value = levene(grp1, grp2).pvalue
  equal_variance = bool(levene_p_value > alpha)

  pvalue = ttest_ind(grp1, grp2, equal_var=equal_variance).pvalue
  binary_records.append({
    'var1': 'is_attack',
    'var2': col,
    'var2type': 'numeric',
    'Variance': equal_variance,
    'pvalue_ttest': pvalue,
    'status': int(pvalue <= alpha),  # 1 = group means differ significantly
  })

# Build the frame once; `columns=` keeps the schema even if no column qualified.
binary_test = pd.DataFrame(
  binary_records,
  columns=['var1','var2','var2type','Variance','pvalue_ttest','status'],
)


In [41]:
# Multi-group comparison across every `attack` label for each numeric feature.
# Both one-way ANOVA and Kruskal-Wallis p-values are recorded, together with the
# assumption checks (per-group normality, homogeneity of variance) needed to
# decide which of the two to trust.
alpha = 0.05  # significance level for the assumption checks

# The grouping does not depend on the feature, so build it once.
by_attack = df.groupby('attack')
# 'number' selects every integer/float dtype (not only int64/float64) and
# leaves out the object and category columns.
numeric_test_cols = df[tst_cols].select_dtypes(include='number').columns

multi_records = []
for col in numeric_test_cols:
  # Normality holds only if no group rejects it; NaN p-values do not count as a rejection.
  shapiro_pvalues = check_normality_no_sample(df,'attack',col)
  normality_exists = not any(p < alpha for p in shapiro_pvalues.values())

  # Equal variances are assumed unless Levene's test rejects at `alpha`.
  levene_pval = check_homogenitity(df,'attack',col)
  equal_vars = not (levene_pval < alpha)

  groups = [values.values for _, values in by_attack[col]]
  p_value_1 = f_oneway(*groups).pvalue
  p_value_2 = kruskal(*groups).pvalue

  multi_records.append({
    'var1': 'attack',
    'var2': col,
    'var2type': df[col].dtype,
    'normality': normality_exists,
    'homogenity': equal_vars,
    'pvalue_anova': p_value_1,
    'pvalue_kruskal': p_value_2,
  })

# Build the frame once; `columns=` keeps the schema even if no column qualified.
multi_test = pd.DataFrame(
  multi_records,
  columns=['var1','var2','var2type','normality','homogenity','pvalue_anova','pvalue_kruskal'],
)


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hyp

In [51]:
# Strip the bookkeeping columns from both result tables, then line the
# two-group and multi-group results up per feature (`var2`).
b1 = binary_test.drop(columns=['var1','var2type','status'])
m1 = multi_test.drop(columns=['var1','var2type'])

op = b1.merge(m1, on='var2', how='inner')
# Tip: apply f"{x:.10f}" to the p-value columns for fixed-point display.
op

Unnamed: 0,var2,Variance,pvalue_ttest,normality,homogenity,pvalue_anova,pvalue_kruskal
0,duration,False,1.171722e-60,False,False,0.0,0.0
1,srcbytes,False,0.04980998,False,False,6.606143e-30,0.0
2,dstbytes,True,0.1439016,False,False,1.059029e-13,0.0
3,hot,False,2.633775e-06,False,False,0.0,0.0
4,numcompromised,False,0.0001047923,False,True,0.8360462,0.0
5,numroot,False,1.323059e-05,False,True,0.6587273,0.0
6,numfilecreations,False,8.191489e-16,False,False,1.028538e-73,0.0
7,numaccessfiles,False,3.7975279999999997e-44,False,False,2.9650960000000005e-157,0.0
8,count,False,0.0,False,False,0.0,0.0
9,srvcount,False,0.7891723,False,False,0.0,0.0


In [68]:
# Mean of each selected feature per attack label: one row per feature, one
# column per label, plus the spread (max - min) of those means.
# NOTE(review): `op.loc[9:]` starts at index label 9 of the merged table; the
# reason for skipping the earlier rows is not stated in the notebook - confirm.
selected_cols = list(op.loc[9:,'var2'].values)

# One grouped aggregation replaces growing a frame with pd.concat per column.
attack_means = df.groupby('attack')[selected_cols].mean().T

# max/min are both taken from the pure means table, so neither statistic can
# pick up the other derived column.
attack_df = attack_means.copy()
attack_df['max_value'] = attack_means.max(axis=1)
attack_df['min_value'] = attack_means.min(axis=1)
attack_df['diff'] = attack_df['max_value'] - attack_df['min_value']

In [70]:
# Average, over the selected features, of the max-minus-min spread of per-attack means.
attack_df.loc[:, 'diff'].mean()

47.695043558849164

In [100]:
# numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# # Exclude target variables
# numeric_cols = [col for col in numeric_cols if col not in ['attack', 'is_attack']]
# categorical_cols = [col for col in categorical_cols if col not in ['attack', 'is_attack']]

# results = []

# # Hypothesis Testing Loop
# for col in df.columns:
#     if col not in ['attack', 'is_attack','numoutboundcmds']:
#         if col in numeric_cols:
#             # Numeric fields: Perform ANOVA and Kruskal-Wallis
#             groups = [df.loc[df['attack'] == label, col].dropna() for label in df['attack'].unique()]
#             try:
#                 anova_p = f_oneway(*groups).pvalue
#             except:
#                 anova_p = np.nan  # If ANOVA fails

#             kruskal_p = kruskal(*groups).pvalue
#             results.append({'Field': col, 'Test': 'Numeric', 'ANOVA_p': anova_p, 'Kruskal_p': kruskal_p})

#         elif col in categorical_cols:
#             # Categorical fields: Perform Chi-Squared
#             contingency_table = pd.crosstab(df[col], df['attack'])
#             if (contingency_table < 5).values.any():
#                 # Permutation test if cell counts < 5
#                 observed_stat = chi2_contingency(contingency_table, correction=False)[0]
#                 permutations = []
#                 for _ in range(1000):  # Adjust sample size if needed
#                     shuffled = df['attack'].sample(frac=1).reset_index(drop=True)
#                     permuted_table = pd.crosstab(df[col], shuffled)
#                     stat, _ = chi2_contingency(permuted_table, correction=False)[:2]
#                     permutations.append(stat)
#                 perm_p = np.mean([1 if stat >= observed_stat else 0 for stat in permutations])
#                 chi_p = perm_p
#             else:
#                 # Regular Chi-Squared test
#                 chi_p = chi2_contingency(contingency_table, correction=False)[1]

#             results.append({'Field': col, 'Test': 'Categorical', 'Chi_Square_p': chi_p})

# # Convert results to DataFrame
# results_df = pd.DataFrame(results)

# # Highlight important results
# results_df['Important'] = (results_df[['ANOVA_p', 'Kruskal_p', 'Chi_Square_p']] < 0.05).any(axis=1)
# print(results_df)

In [84]:
# num_results = results_df[results_df['Test'] == 'Numeric']

# op1 = op[['var2','Variance','normality','homogenity','pvalue_anova','pvalue_kruskal']]
# op1.columns = ['Field','Variance','normality','homogenity','pvalue_anova','pvalue_kruskal']

# final = pd.merge(op1,num_results[['Field','ANOVA_p','Kruskal_p','Important']],on='Field',how='inner')
# final = pd.merge(final,binary_test[['var2','pvalue_ttest','status']],left_on='Field',right_on='var2',how='inner')
# final.drop('var2',axis=1,inplace=True)
# final

In [91]:
# final[final['pvalue_anova'] == final['ANOVA_p']].shape
# final[final['pvalue_kruskal'] == final['Kruskal_p']].shape

(28, 9)