# Importing libraries, loading data

In [None]:
# import os
import pandas as pd

The core tabular data structure in pandas is the `DataFrame`: a two-dimensional table with labelled rows and columns.

In [None]:
# Read the CSV file into a DataFrame
Cloud_data = pd.read_csv("vmCloud_data.csv")

In [None]:
Cloud_data.shape

In [None]:
list01 = [1, 7, 6, 4, 8, 24, -56, 78]

In [None]:
type(list01)

In [None]:
for x in list01:
    print (x*2)

In [None]:
# vector01 = ai + bj + ck
# 2 * vector01 = 2(ai + bj + ck)
# 2 * vector01 = 2ai + 2bj + 2ck

In [None]:
list_df = pd.DataFrame(list01)

In [None]:
type(list_df)

In [None]:
list_df * 2

In [None]:
Cloud_data["cpu_usage"]

In [None]:
Cloud_data["cpu_usage"] * 2

# Exploring the data

## Preliminary Analyses

In [None]:
# Cloud_data = Cloud_data[:100000]

In [None]:
Cloud_data.columns

In [None]:
Cloud_data.head()

In [None]:
Cloud_data.info()

In [None]:
Cloud_data.isnull().sum()

In [None]:
Cloud_data["timestamp"]

In [None]:
Cloud_data["vm_id"]

In [None]:
Cloud_data = Cloud_data.dropna()

In [None]:
Cloud_data.isnull().sum()

In [None]:
Cloud_data.shape

## Let's plot some preliminary graphs to see the data

In [None]:
import matplotlib.pyplot as plt

In [None]:
Cloud_data['network_traffic'][0:10]

In [None]:
# Quick look at the first 50 rows: three resource metrics drawn on one axis.
# NOTE(review): 'timestamp' is only converted with pd.to_datetime further down
# the notebook, so the x values here are whatever read_csv produced.
first_rows = Cloud_data.iloc[:50]
metric_labels = {
    'cpu_usage': 'CPU Usage',
    'memory_usage': 'Memory Usage',
    'power_consumption': 'Power Consumption',
}

plt.figure(figsize=(10, 6))
for metric, label in metric_labels.items():
    plt.plot(first_rows['timestamp'], first_rows[metric], label=label)
plt.xlabel('Timestamp')
plt.legend()
plt.show()

## Dealing with Timestamps

In [None]:
Cloud_data.info()

In [None]:
# Convert the 'timestamp' column to datetime and derive calendar features.
Cloud_data['timestamp'] = pd.to_datetime(Cloud_data['timestamp'])

timestamp_parts = Cloud_data['timestamp'].dt
Cloud_data['hour_of_day'] = timestamp_parts.hour
Cloud_data['day_of_week'] = timestamp_parts.weekday  # Monday=0, Sunday=6

# Last expression of the cell: display the converted column.
Cloud_data['timestamp']

In [None]:
Cloud_data.sort_values('timestamp', inplace=True)

In [None]:
Cloud_data

In [None]:
# Historical usage patterns: trailing 7-day averages of CPU and memory usage.
#
# An integer window (rolling(window=7)) averages the 7 most recent *rows*,
# regardless of how much time they span. The offset window '7D' keyed on the
# 'timestamp' column averages every row whose timestamp falls within the
# trailing seven days instead, which is what the column names describe.
#
# Requirements: 'timestamp' must be datetime64 and sorted ascending (both are
# done in the preceding cells); pandas raises a ValueError otherwise.
# min_periods=1 keeps the earliest rows from becoming NaN.
rolling_7d = Cloud_data.rolling('7D', on='timestamp', min_periods=1)

Cloud_data['cpu_usage_7d_avg'] = rolling_7d['cpu_usage'].mean()
Cloud_data['memory_usage_7d_avg'] = rolling_7d['memory_usage'].mean()

# Check
print(Cloud_data.head())

In [None]:
# Sanity-check the rolling-average columns created above.

# 1) Count missing values in each derived column.
for avg_column in ('cpu_usage_7d_avg', 'memory_usage_7d_avg'):
    missing = Cloud_data[avg_column].isna().sum()
    print(f"NaN values in '{avg_column}':", missing)

# 2) Eyeball both ends of the table.
print("\nFirst few rows for visual inspection:")
print(Cloud_data.head())
print("\nLast few rows for visual inspection:")
print(Cloud_data.tail())

# Time Series Analysis

## Trend Analysis

In [None]:
list01 = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]

In [None]:
list01[::2]

In [None]:
# CPU usage against its rolling average, thinned to every 5000th row.
cpu_sample = Cloud_data.iloc[::5000]

plt.figure(figsize=(10, 6))
plt.plot(cpu_sample['timestamp'], cpu_sample['cpu_usage'], label='CPU Usage')
plt.plot(
    cpu_sample['timestamp'],
    cpu_sample['cpu_usage_7d_avg'],
    label='7-Day Rolling Avg of CPU Usage',
    linestyle='--',
)
plt.xlabel('Timestamp')
plt.ylabel('CPU Usage')
plt.title('CPU Usage and 7-Day Rolling Average')
plt.legend()
plt.show()

In [None]:
# Memory usage against its rolling average, thinned to every 5000th row.
memory_sample = Cloud_data.iloc[::5000]

# (column, legend label, extra line styling) for each curve, in draw order.
memory_curves = [
    ('memory_usage', 'Memory Usage', {}),
    ('memory_usage_7d_avg', '7-Day Rolling Avg of Memory Usage', {'linestyle': '--'}),
]

plt.figure(figsize=(10, 6))
for curve_column, curve_label, curve_style in memory_curves:
    plt.plot(
        memory_sample['timestamp'],
        memory_sample[curve_column],
        label=curve_label,
        **curve_style,
    )
plt.xlabel('Timestamp')
plt.ylabel('Memory Usage')
plt.title('Memory Usage and 7-Day Rolling Average')
plt.legend()
plt.show()

## Seasonality Analysis

In [None]:
# Seasonality check: every 500th row out of the first 100000 rows,
# with the three resource metrics overlaid on one axis.
seasonal_sample = Cloud_data.iloc[:100000:500]
seasonal_x = seasonal_sample['timestamp']

plt.figure(figsize=(10, 6))
for metric, label in (
    ('cpu_usage', 'CPU Usage'),
    ('memory_usage', 'Memory Usage'),
    ('power_consumption', 'Power Consumption'),
):
    plt.plot(seasonal_x, seasonal_sample[metric], label=label)
plt.xlabel('Timestamp')
plt.legend()
plt.show()


Shall we try changing the sampling interval?

# Data Preprocessing and Cleaning

## Scaling the data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# --- Imputation -------------------------------------------------------------
# NOTE(review): rows containing NaN were already removed with dropna() earlier
# in the notebook, so both fills below are presumably no-ops -- confirm before
# relying on them.

# Numeric columns: replace gaps with the column mean.
numeric_means = Cloud_data.mean(numeric_only=True)
Cloud_data.fillna(numeric_means, inplace=True)

# Categorical columns: replace gaps with the most frequent value.
for column in ['task_type', 'task_priority', 'task_status']:
    most_frequent = Cloud_data[column].mode()[0]
    Cloud_data[column] = Cloud_data[column].fillna(most_frequent)

# --- Min-max scaling ----------------------------------------------------------
# NOTE(review): the scaler is fitted on the whole table, before the train/test
# split performed later in the notebook, so test rows influence the fitted
# min/max -- confirm this is acceptable.
columns_to_normalize = [
    'cpu_usage',
    'memory_usage',
    'network_traffic',
    'power_consumption',
    'num_executed_instructions',
    'execution_time',
    'energy_efficiency',
]

scaler = MinMaxScaler()
scaler.fit(Cloud_data[columns_to_normalize])
Cloud_data[columns_to_normalize] = scaler.transform(Cloud_data[columns_to_normalize])

# Check
print(Cloud_data.head())

In [None]:
# Same three metrics after scaling: every 500th row of the first 10000 rows.
scaled_sample = Cloud_data.iloc[:10000:500]
scaled_series = {
    'CPU Usage': scaled_sample['cpu_usage'],
    'Memory Usage': scaled_sample['memory_usage'],
    'Power Consumption': scaled_sample['power_consumption'],
}

plt.figure(figsize=(10, 6))
for series_label, series_values in scaled_series.items():
    plt.plot(scaled_sample['timestamp'], series_values, label=series_label)
plt.xlabel('Timestamp')
plt.legend()
plt.show()

In [None]:
# Post-scaling checks: dtypes, value range of each scaled column, sample rows.

# 1) Data types
print("\nData types of each column:")
print(Cloud_data.dtypes)

# 2) Range of every column that went through the scaler
normalized_columns = [
    'cpu_usage',
    'memory_usage',
    'network_traffic',
    'power_consumption',
    'num_executed_instructions',
    'execution_time',
    'energy_efficiency',
]
print("\nMin and Max values for normalized columns:")
for column in normalized_columns:
    values = Cloud_data[column]
    lowest, highest = values.min(), values.max()
    print(f"{column}: Min = {lowest}, Max = {highest}")

# 3) First few rows of the DataFrame
print("\nFirst few rows for visual inspection:")
print(Cloud_data.head())

## Dividing by task_type

In [None]:
Cloud_data["task_type"].value_counts()

In [None]:
chosen_task = 'io'

In [None]:
Cloud_data = Cloud_data[Cloud_data["task_type"]==chosen_task]

In [None]:
Cloud_data = Cloud_data.drop(["task_type"], axis=1)

In [None]:
Cloud_data.head()

In [None]:
Cloud_data.tail()

Time series analysis of this data lets us describe it in terms of four components:
1) Level: The base value for the series if it were a straight line.
2) Trend: The linear increasing or decreasing behavior of the series over time.
3) Seasonality: The repeating patterns or cycles of behavior over time.
4) Noise: The variability in the observations that cannot be explained by the model.

All time series have a level and noise; trend and seasonality are optional.

The main features of many time series are trends and seasonal variation. Another feature of most time series is that observations close together in time tend to be correlated.

In [None]:
def _plot_with_rolling_avg(column, display_name):
    """Plot every 5000th row of `column` together with its `<column>_7d_avg` column.

    `display_name` is used for the legend entry, the y-axis label and the title.
    """
    thinned = Cloud_data.iloc[::5000]
    plt.figure(figsize=(10, 6))
    plt.plot(thinned['timestamp'], thinned[column], label=display_name)
    plt.plot(
        thinned['timestamp'],
        thinned[f'{column}_7d_avg'],
        label=f'7-Day Rolling Avg of {display_name}',
        linestyle='--',
    )
    plt.xlabel('Timestamp')
    plt.ylabel(display_name)
    plt.title(f'{display_name} and 7-Day Rolling Average')
    plt.legend()
    plt.show()


# CPU usage and its rolling average for the selected task type
_plot_with_rolling_avg('cpu_usage', 'CPU Usage')

# Memory usage and its rolling average for the selected task type
_plot_with_rolling_avg('memory_usage', 'Memory Usage')

## Train-test splitting and encoding

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Encode categorical variables as integer codes.
#
# Each column gets its own LabelEncoder. Calling fit_transform again on one
# shared encoder replaces its classes_, so the first column's codes could no
# longer be mapped back to their labels with inverse_transform.
label_encoders = {}
for categorical_column in ('task_priority', 'task_status'):
    column_encoder = LabelEncoder()
    Cloud_data[f'{categorical_column}_encoded'] = column_encoder.fit_transform(
        Cloud_data[categorical_column]
    )
    label_encoders[categorical_column] = column_encoder

# Keep the name `label_encoder` bound to the 'task_status' encoder, which is
# what a single encoder fitted on both columns in turn would hold at this point.
label_encoder = label_encoders['task_status']

# Drop original categorical columns and 'timestamp'
Cloud_data_processed = Cloud_data.drop(['timestamp', 'task_priority', 'task_status'], axis=1)

In [None]:
# Targets: the three usage metrics the models will predict.
target_columns = ['cpu_usage', 'memory_usage', 'network_traffic']
y = Cloud_data_processed[target_columns]

# Features: every remaining column except the identifier 'vm_id'.
# NOTE(review): the rolling-average columns added earlier in the notebook are
# computed from cpu_usage / memory_usage and remain in X -- confirm that using
# them as predictors of those same targets is intended.
X = Cloud_data_processed.drop(columns=target_columns + ['vm_id'])

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows -- confirm a random (rather than
# chronological) split is what is wanted for this time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split.
for split_name, split_X, split_y in (
    ("Training", X_train, y_train),
    ("Testing", X_test, y_test),
):
    print(f"{split_name} set shape:", split_X.shape, split_y.shape)

In [None]:
print("Feature names:", X_train.columns.tolist())

# Training a random forest regressor model

In [None]:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train one RandomForestRegressor per target column.
# models_rf keeps every fitted model (keyed by target name) so it can be
# inspected or reused later; predictions_rf holds the matching test-set
# predictions.
models_rf = {}
predictions_rf = {}

for target in y_train.columns:
    model_rf = RandomForestRegressor(n_estimators=10, random_state=42)
    model_rf.fit(X_train, y_train[target])
    models_rf[target] = model_rf
    predictions_rf[target] = model_rf.predict(X_test)

# Evaluate each model with RMSE on the held-out test set.
for target, predicted in predictions_rf.items():
    rmse = np.sqrt(mean_squared_error(y_test[target], predicted))
    print(f"RMSE for {target}: {rmse}")


----------------------