In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os



from statsmodels.tsa.stattools import grangercausalitytests
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, max_lag=15):

    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=max_lag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(max_lag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

path = '/content/drive/MyDrive/Veritas'
variable1 = 'CPU_usage'
variable2 = 'Canonical_memory_usage'
entries = os.listdir('/content/drive/MyDrive/Veritas')

for i in entries:
    print("\nMachine id: ", i[:len(i) - 4])
    data1 = pd.read_csv(os.path.join(path, i))

    # Check column names and data structure
    print("Column Names:", data1.columns)
    print("First Few Rows:")
    print(data1.head())

    # Check if the expected columns are present
    if variable1 in data1.columns and variable2 in data1.columns:
        data = data1[[variable1, variable2]].diff().dropna()
        # plt.plot(data)
        cpdatau_transform = data.diff().dropna()
        result = grangers_causation_matrix(data, variables=[variable1, variable2])
        print(result)
    else:
        print(f"Expected columns not found in the DataFrame for machine {i[:len(i) - 4]}.")





Machine id:  1119618555
Column Names: Index(['start-timestamp', 'finish-timestamp', 'CPU_usage',
       'Canonical_memory_usage', 'ass_mem', 'Maximum_memory_usage'],
      dtype='object')
First Few Rows:
   start-timestamp  finish-timestamp  CPU_usage  Canonical_memory_usage  \
0                6                 9   0.254094                0.150381   
1                9                12   0.189707                0.141421   
2               12                15   0.451202                0.146736   
3               15                18   0.437253                0.152443   
4               18                21   0.451050                0.192786   

    ass_mem  Maximum_memory_usage  
0  0.277387              0.213997  
1  0.250407              0.180382  
2  0.257291              0.192214  
3  0.283122              0.192437  
4  0.304505              0.279361  




                          CPU_usage_x  Canonical_memory_usage_x
CPU_usage_y                       1.0                    0.3775
Canonical_memory_usage_y          0.0                    1.0000

Machine id:  1093855
Column Names: Index(['start-timestamp', 'finish-timestamp', 'CPU_usage',
       'Canonical_memory_usage', 'ass_mem', 'Maximum_memory_usage'],
      dtype='object')
First Few Rows:
   start-timestamp  finish-timestamp  CPU_usage  Canonical_memory_usage  \
0                6                 9   0.131503                0.167309   
1                9                12   0.192258                0.190085   
2               12                15   0.154146                0.187093   
3               15                18   0.320007                0.182086   
4               18                21   0.092898                0.168956   

    ass_mem  Maximum_memory_usage  
0  0.201223              0.170010  
1  0.225686              0.198332  
2  0.223561              0.191493  
3  0.215653



                          CPU_usage_x  Canonical_memory_usage_x
CPU_usage_y                    1.0000                    0.0003
Canonical_memory_usage_y       0.0003                    1.0000

Machine id:  1094470
Column Names: Index(['start-timestamp', 'finish-timestamp', 'CPU_usage',
       'Canonical_memory_usage', 'ass_mem', 'Maximum_memory_usage'],
      dtype='object')
First Few Rows:
   start-timestamp  finish-timestamp  CPU_usage  Canonical_memory_usage  \
0                6                 9   0.155916                0.332927   
1                9                12   0.145386                0.330748   
2               12                15   0.148028                0.332465   
3               15                18   0.181101                0.334025   
4               18                21   0.147902                0.330885   

    ass_mem  Maximum_memory_usage  
0  0.420719              0.336332  
1  0.419073              0.336049  
2  0.420769              0.337210  
3  0.423547



                          CPU_usage_x  Canonical_memory_usage_x
CPU_usage_y                       1.0                       0.0
Canonical_memory_usage_y          0.0                       1.0

Machine id:  1094095
Column Names: Index(['start-timestamp', 'finish-timestamp', 'CPU_usage',
       'Canonical_memory_usage', 'ass_mem', 'Maximum_memory_usage'],
      dtype='object')
First Few Rows:
   start-timestamp  finish-timestamp  CPU_usage  Canonical_memory_usage  \
0                6                 9   0.071227                0.424213   
1                9                12   0.065407                0.403833   
2               12                15   0.078301                0.401547   
3               15                18   0.098071                0.406499   
4               18                21   0.087041                0.401808   

    ass_mem  Maximum_memory_usage  
0  0.471992              0.425149  
1  0.462972              0.423514  
2  0.460251              0.402092  
3  0.466740



                          CPU_usage_x  Canonical_memory_usage_x
CPU_usage_y                       1.0                       0.0
Canonical_memory_usage_y          0.0                       1.0

Machine id:  data description
Column Names: Index(['this folder contatin the workload of 5 machines from google cluster data running the highest number of tasks.'], dtype='object')
First Few Rows:
Empty DataFrame
Columns: [this folder contatin the workload of 5 machines from google cluster data running the highest number of tasks.]
Index: []
Expected columns not found in the DataFrame for machine data description.

Machine id:  1093209
Column Names: Index(['start-timestamp', 'finish-timestamp', 'CPU_usage',
       'Canonical_memory_usage', 'ass_mem', 'Maximum_memory_usage'],
      dtype='object')
First Few Rows:
   start-timestamp  finish-timestamp  CPU_usage  Canonical_memory_usage  \
0                6                 9   0.196190                0.233074   
1                9                12 



                          CPU_usage_x  Canonical_memory_usage_x
CPU_usage_y                    1.0000                    0.0001
Canonical_memory_usage_y       0.0001                    1.0000
