In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind,chi2_contingency,levene,MonteCarloMethod,PermutationMethod,shapiro,f_oneway,kruskal,pearsonr,spearmanr

In [2]:
sns.set_style('darkgrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)  # Ensures that the display width isn't restricted
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('Network_anomaly_data.csv')
df['is_attack'] = df['attack'].apply(lambda x: 'normal' if x=='normal' else 'attack')
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,numfailedlogins,loggedin,numcompromised,rootshell,suattempted,numroot,numfilecreations,numshells,numaccessfiles,numoutboundcmds,ishostlogin,isguestlogin,count,srvcount,serrorrate,srvserrorrate,rerrorrate,srvrerrorrate,samesrvrate,diffsrvrate,srvdiffhostrate,dsthostcount,dsthostsrvcount,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,attack,lastflag,is_attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,attack
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal


In [4]:
# land, wrongfragment,urgent, numfailedlogins,loggedin,rootshell,suattempted,numshells,ishostlogin,isguestlogin
df['land'] = df['land'].astype('category')
df['wrongfragment'] = df['wrongfragment'].astype('category')
df['urgent'] = df['urgent'].astype('category')
df['numfailedlogins'] = df['numfailedlogins'].astype('category')
df['loggedin'] = df['loggedin'].astype('category')
df['rootshell'] = df['rootshell'].astype('category')
df['suattempted'] = df['suattempted'].astype('category')
df['numshells'] = df['numshells'].astype('category')
df['ishostlogin'] = df['ishostlogin'].astype('category')
df['isguestlogin'] = df['isguestlogin'].astype('category')

In [5]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

ndf = pd.DataFrame(columns=['var1','var2','pearson_corr','spearman_corr','pvalue_pearson','pvalue_spearman','status_pearson','status_spearman'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2) and (col2 not in visited):
      status1,status2 = 0,0
      pearson_corr, pearson_p = pearsonr(df[col1].dropna(), df[col2].dropna())
      spearman_corr, spearman_p = spearmanr(df[col1].dropna(), df[col2].dropna())

      if pearson_p <= 0.05:
        status1 = 1

      if spearman_p <= 0.05:
        status2 = 1

      ndf.loc[len(ndf)] = {'var1':col1,'var2':col2,
                           'pearson_corr':pearson_corr,'spearman_corr':spearman_corr,
                           'pvalue_pearson':pearson_p,'pvalue_spearman':spearman_p,
                           'status_pearson':status1,'status_spearman':status2}
  visited.add(col1)


In [6]:
significant_results = ndf[(ndf['pvalue_pearson'] < 0.05) | (ndf['pvalue_spearman'] < 0.05)].copy()

significant_results.reset_index(inplace=True)
significant_results['pearson_result'] = None
significant_results['spearman_result'] = None

for i in range(len(significant_results)):
  scorr = significant_results.loc[i,'spearman_corr']
  pcorr = significant_results.loc[i,'pearson_corr']

  if scorr >= 0.7:
    significant_results.loc[i,'spearman_result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    significant_results.loc[i,'spearman_result'] = 'Moderate'
  else:
    significant_results.loc[i,'spearman_result'] = 'Weak'


  if pcorr >= 0.7:
    significant_results.loc[i,'pearson_result'] = 'Strong'
  elif pcorr >= 0.4 and pcorr < 0.7:
    significant_results.loc[i,'pearson_result'] = 'Moderate'
  else:
    significant_results.loc[i,'pearson_result'] = 'Weak'

In [7]:
significant_results.shape

(369, 11)

In [8]:
significant_results['pearson_result'].value_counts()

Unnamed: 0_level_0,count
pearson_result,Unnamed: 1_level_1
Weak,342
Strong,16
Moderate,11


In [9]:
significant_results['spearman_result'].value_counts()

Unnamed: 0_level_0,count
spearman_result,Unnamed: 1_level_1
Weak,318
Moderate,34
Strong,17


In [10]:
pd.pivot_table(data=significant_results,index='pearson_result',columns='spearman_result',values='var1',aggfunc='count')

spearman_result,Moderate,Strong,Weak
pearson_result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Moderate,9.0,,2.0
Strong,1.0,14.0,1.0
Weak,24.0,3.0,315.0


In [11]:
cat_cols = df.select_dtypes(include=['object','category']).columns
cat_cols = cat_cols[0:-2]
cat_cols

Index(['protocoltype', 'service', 'flag', 'land', 'wrongfragment', 'urgent', 'numfailedlogins',
       'loggedin', 'rootshell', 'suattempted', 'numshells', 'ishostlogin', 'isguestlogin'],
      dtype='object')

In [12]:
def permutation_test_independence_cat(obs_table, num_permutations=10000):
    """
    Perform a permutation test of independence on a contingency table.

    Parameters:
    - obs_table: 2D numpy array (contingency table).
    - num_permutations: Number of permutations to perform.

    Returns:
    - p_value: The p-value of the test.
    """
    # Compute the observed chi-squared statistic
    observed_stat, _, _, _ = chi2_contingency(obs_table)

    # Get the shape of the contingency table
    num_rows, num_cols = obs_table.shape

    # Perform permutations
    permuted_stats = []
    for _ in range(num_permutations):
        # Shuffle the rows of the contingency table
        permuted_table = np.copy(obs_table)
        for i in range(num_cols):
            np.random.shuffle(permuted_table[:, i])  # Shuffle each column independently

        # Compute the chi-squared statistic for the permuted table
        try:
            permuted_stat, _, _, _ = chi2_contingency(permuted_table)
            permuted_stats.append(permuted_stat)
        except ValueError:
            # If Chi-squared cannot be computed (e.g., due to zero counts), skip this permutation
            continue

    # Convert the list of permuted statistics to a numpy array
    permuted_stats = np.array(permuted_stats)

    # Compute the p-value: proportion of permuted statistics >= observed statistic
    p_value = np.mean(permuted_stats >= observed_stat)
    avg_stat = np.mean(permuted_stats)

    return p_value,avg_stat


In [13]:
def cramers_v(chi2, n, contingency_table):
    phi2 = chi2 / n
    r, k = contingency_table.shape
    phi2corr = max(0, phi2 - (k-1)*(r-1) / (n-1))
    k_corr = min(k-1, r-1)
    return np.sqrt(phi2corr / k_corr)

In [14]:
categorical_df = pd.DataFrame(columns=['var1','var2','test_type','pvalue','stat','effect','crammers_v'])
visited = set()
for col1 in cat_cols:
  for col2 in cat_cols:
    if (col1 != col2) and (col2 not in visited):
      ctbl = pd.crosstab(df[col1],df[col2])
      l5 = np.any(ctbl<5)
      if not l5:
        chi_stat, p_value, dof, exp_freq = chi2_contingency(ctbl)
        crammers_v = cramers_v(chi_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr','pvalue':p_value,'stat':chi_stat,'effect':0,'crammers_v':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr','pvalue':p_value,'stat':chi_stat,'effect':1,'crammers_v':crammers_v}
      else:
        p_value,avg_stat = permutation_test_independence_cat(ctbl)
        #stat, p_value = permutation_test(df, col1, col2)
        crammers_v = cramers_v(avg_stat,df.shape[0],ctbl)
        if p_value <= 0.05:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'stat':avg_stat,'effect':0,'crammers_v':crammers_v}
        else:
          categorical_df.loc[len(categorical_df)] = {'var1':col1,'var2':col2,'test_type':'Chi_Sqr_Permutation','pvalue':p_value,'stat':avg_stat,'effect':1,'crammers_v':crammers_v}
  visited.add(col1)

In [15]:
categorical_df['result'] = None
for i in range(len(categorical_df)):
  cv = categorical_df.loc[i,'crammers_v']
  if cv < 0.1:
    categorical_df.loc[i,'result'] = 'Weak'
  elif cv >= 0.1 and cv < 0.3:
    categorical_df.loc[i,'result'] = 'Moderate'
  elif cv >= 0.3 and cv < 0.5:
    categorical_df.loc[i,'result'] = 'Strong'
  else:
    categorical_df.loc[i,'result'] = 'Very Strong'

In [16]:
categorical_df['result'].value_counts()

Unnamed: 0_level_0,count
result,Unnamed: 1_level_1
Very Strong,36
Weak,16
Strong,14
Moderate,12


In [66]:
categorical_df.groupby('var1').agg(total = ('var2','count'),effect_total=('effect','sum'))

Unnamed: 0_level_0,total,effect_total
var1,Unnamed: 1_level_1,Unnamed: 2_level_1
flag,10,10
ishostlogin,1,1
land,9,9
loggedin,5,4
numfailedlogins,6,6
numshells,2,2
protocoltype,12,12
rootshell,4,4
service,11,11
suattempted,3,3


In [80]:
num_cols = df.drop('numoutboundcmds',axis=1).select_dtypes(include=['int64','float64']).columns
visited = set()

ndf = pd.DataFrame(columns=['var1','var2','pearson_corr','spearman_corr','pvalue_pearson','pvalue_spearman','status_pearson','status_spearman'])
for col1 in num_cols:
  for col2 in num_cols:
    if (col1 != col2):
      status1,status2 = 0,0
      pearson_corr, pearson_p = pearsonr(df[col1].dropna(), df[col2].dropna())
      spearman_corr, spearman_p = spearmanr(df[col1].dropna(), df[col2].dropna())

      if pearson_p <= 0.05:
        status1 = 1

      if spearman_p <= 0.05:
        status2 = 1

      ndf.loc[len(ndf)] = {'var1':col1,'var2':col2,
                           'pearson_corr':pearson_corr,'spearman_corr':spearman_corr,
                           'pvalue_pearson':pearson_p,'pvalue_spearman':spearman_p,
                           'status_pearson':status1,'status_spearman':status2}
  #visited.add(col1)


In [81]:
significant_results = ndf[(ndf['pvalue_pearson'] < 0.05) | (ndf['pvalue_spearman'] < 0.05)].copy()

significant_results.reset_index(inplace=True)
significant_results['pearson_result'] = None
significant_results['spearman_result'] = None

for i in range(len(significant_results)):
  scorr = significant_results.loc[i,'spearman_corr']
  pcorr = significant_results.loc[i,'pearson_corr']

  if scorr >= 0.7:
    significant_results.loc[i,'spearman_result'] = 'Strong'
  elif scorr >= 0.4 and scorr < 0.7:
    significant_results.loc[i,'spearman_result'] = 'Moderate'
  else:
    significant_results.loc[i,'spearman_result'] = 'Weak'


  if pcorr >= 0.7:
    significant_results.loc[i,'pearson_result'] = 'Strong'
  elif pcorr >= 0.4 and pcorr < 0.7:
    significant_results.loc[i,'pearson_result'] = 'Moderate'
  else:
    significant_results.loc[i,'pearson_result'] = 'Weak'

In [82]:
v1count = significant_results.groupby('var1')['var2'].count().reset_index()
pearson_count = pd.pivot_table(data=significant_results,index='var1',columns='pearson_result',values='var2',aggfunc='count',fill_value=0).reset_index()
vimp = pd.merge(v1count,pearson_count,on='var1')
vimp.sort_values('Strong',ascending=False)

Unnamed: 0,var1,var2,Moderate,Strong,Weak
27,srvserrorrate,27,1,3,23
22,serrorrate,27,1,3,23
20,rerrorrate,26,0,3,23
12,dsthostsrvserrorrate,27,2,3,22
11,dsthostsrvrerrorrate,26,0,3,23
8,dsthostserrorrate,26,2,3,21
5,dsthostrerrorrate,26,1,3,22
26,srvrerrorrate,27,0,3,24
7,dsthostsamesrvrate,27,0,2,25
9,dsthostsrvcount,27,0,2,25


pearson_result,var1,Moderate,Strong,Weak,All
0,count,6.0,,13.0,19
1,diffsrvrate,1.0,,11.0,12
2,dstbytes,,,25.0,25
3,dsthostcount,2.0,,8.0,10
4,dsthostdiffsrvrate,1.0,,6.0,7
5,dsthostrerrorrate,,1.0,1.0,2
6,dsthostsamesrcportrate,1.0,,5.0,6
7,dsthostsamesrvrate,,,8.0,8
8,dsthostserrorrate,,1.0,3.0,4
9,dsthostsrvcount,,1.0,8.0,9


In [44]:
# df1 = df.copy()
# groups = [df1[df1['attack'] == cat]['duration'] for cat in df1['attack'].unique()]
# kruskal_stat, p_value = kruskal(*groups)
# print(f"Kruskal-Wallis Test Statistic: {kruskal_stat}, P-value: {p_value}")

Kruskal-Wallis Test Statistic: 14501.58833252334, P-value: 0.0


In [60]:
# from math import e
# from scipy.stats import mannwhitneyu

# group_labels = df1['attack'].unique()
# p_values = []
# pairs = []
# visited_pair = set()
# mpdf = pd.DataFrame(columns=['var1','var2','p_value','effect'])

# for i, group1 in enumerate(group_labels):
#     for j, group2 in enumerate(group_labels):
#         if i < j and group_labels[j] not in visited_pair:
#             effect = 0
#             stat, p = mannwhitneyu(
#                 df1[df1['attack'] == group1]['duration'],
#                 df1[df1['attack'] == group2]['duration'],
#                 alternative='two-sided'
#             )
#             if p <= 0.05:
#               effect = 1
#             mpdf.loc[len(mpdf)] = {'var1':group_labels[i],'var2':group_labels[j],'p_value':p,'effect':effect}
#     visited_pair.add(group_labels[i])
