In [1]:
#importing required packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm
import seaborn as sns
from sklearn.metrics import r2_score
import math

import warnings
warnings.filterwarnings("ignore")



In [2]:
src_dest_df = pd.read_parquet('src_dest_df.parquet')
src_dest_df_1 = src_dest_df.loc[src_dest_df.cloud_geo_iso1 != 'AWS.ap-east-1'] 
# removing this from training so that you can use it for testing/validation later 

# Drop the non-transformed columns
src_dest_df_1 = src_dest_df_1.drop(columns = ['packet_loss_percent'])
src_dest_df_1 = src_dest_df_1.drop(columns = ['timestamp'])

src_dest_df.timestamp = pd.to_datetime(src_dest_df.timestamp)
src_dest_df.shape
src_dest_df.dtypes
src_dest_df_1.head(10)




Unnamed: 0,cloud_geo_iso1,cloud_geo_iso2,latency_ms
66,AWS.eu-west-1,AWS.ap-northeast-2,124.2
67,AWS.eu-west-1,AWS.ap-southeast-2,131.8
68,AWS.eu-west-1,AWS.ap-northeast-2,123.75
69,AWS.eu-west-1,AWS.eu-west-2,6.35
70,AWS.eu-west-1,AWS.us-west-1,72.35
71,AWS.eu-west-1,AWS.eu-north-1,23.75
72,AWS.eu-west-1,AWS.eu-central-1,14.55
73,AWS.eu-west-1,AWS.us-east-1,36.4
74,AWS.eu-west-1,AWS.ca-central-1,39.55
75,AWS.eu-west-1,AWS.eu-west-3,10.75


In [3]:
aws_region_df = pd.read_pickle('region_geo_lookup_aws.pkl')
aws_region_df.shape
aws_region_df.dtypes
aws_region_df

aws_region_df.region_name.unique()

array(['AWS.us-east-1', 'AWS.us-east-2', 'AWS.us-west-1', 'AWS.us-west-2',
       'AWS.eu-west-1', 'AWS.eu-west-2', 'AWS.eu-west-3',
       'AWS.eu-central-1', 'AWS.sa-east-1', 'AWS.ap-southeast-1',
       'AWS.ap-southeast-2', 'AWS.ap-northeast-1', 'AWS.ap-northeast-2',
       'AWS.ap-south-1', 'AWS.ca-central-1', 'AWS.eu-north-1',
       'AWS.me-south-1', 'AWS.ap-east-1'], dtype=object)

In [4]:
aws_region_df = pd.read_pickle('region_geo_lookup_aws.pkl')
aws_region_df.shape
aws_region_df.dtypes
aws_region_df

aws_region_df.region_name.unique()
print(aws_region_df)

           region_name cloud_service_provider                 city_name  \
0        AWS.us-east-1                    AWS                  Virginia   
1        AWS.us-east-2                    AWS                      Ohio   
2        AWS.us-west-1                    AWS                California   
3        AWS.us-west-2                    AWS                    Oregon   
4        AWS.eu-west-1                    AWS                   Ireland   
5        AWS.eu-west-2                    AWS                    London   
6        AWS.eu-west-3                    AWS                     Paris   
7     AWS.eu-central-1                    AWS                 Frankfurt   
8        AWS.sa-east-1                    AWS                 Sao Paulo   
9   AWS.ap-southeast-1                    AWS                 Singapore   
10  AWS.ap-southeast-2                    AWS                    Sydney   
11  AWS.ap-northeast-1                    AWS                     Tokyo   
12  AWS.ap-northeast-2   

In [8]:
#####----Random forest------

#src_dest_df, aws_region_df, src_dest_df_1



# ======================================================

# 1) Predict the latency from AWS.ap-east-1 to all other AWS regions in the src_dest_df.parquet

# =======================================================

In [232]:
# Our dataframes: 
#src_dest_df (Original DF), 
# aws_region_df, 
# src_dest_df_1(Dataframe without 'AWS.ap-east-1')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = src_dest_df_1

In [233]:
#Checking if there are any null values in the dataframe
data.isnull().sum()

cloud_geo_iso1    0
cloud_geo_iso2    0
latency_ms        0
dtype: int64

In [234]:
#checking null values in test set
test = src_dest_df.loc[src_dest_df.cloud_geo_iso1 == 'AWS.ap-east-1'] 
test = test.drop(columns=['timestamp', 'packet_loss_percent'])
test.isnull().sum()

cloud_geo_iso1    0
cloud_geo_iso2    0
latency_ms        0
dtype: int64

In [235]:
X = data
y = data['latency_ms']


In [236]:
#Doing the train test split

seed = 50  # so that the result is reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.333, random_state = seed)

In [237]:
X_train.dtypes

cloud_geo_iso1     object
cloud_geo_iso2     object
latency_ms        float64
dtype: object

In [238]:
print(X_test)

            cloud_geo_iso1      cloud_geo_iso2  latency_ms
276556      AWS.eu-north-1       AWS.us-east-1       55.75
184734       AWS.us-west-1       AWS.eu-west-2       71.05
47788        AWS.eu-west-2  AWS.ap-northeast-1      107.60
373753  AWS.ap-northeast-1       AWS.eu-west-2      112.60
224156       AWS.us-west-2  AWS.ap-southeast-2       71.75
...                    ...                 ...         ...
271864      AWS.ap-south-1       AWS.sa-east-1      153.30
245358      AWS.ap-south-1  AWS.ap-northeast-1       65.75
373533  AWS.ap-northeast-1  AWS.ap-northeast-2       17.75
136126       AWS.us-east-1    AWS.ca-central-1       10.25
428091  AWS.ap-northeast-2    AWS.eu-central-1      139.75

[164840 rows x 3 columns]


In [239]:
print(y_train)

307734     35.50
331128     62.45
225332     65.40
226787     35.95
336945     46.75
           ...  
441699     89.00
153775     31.70
239565     71.75
103970    156.15
407282     94.15
Name: latency_ms, Length: 330175, dtype: float64


In [240]:
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns) 

In [241]:
# doing one-hot encoding to deal with categorical variables

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown = 'ignore'),features_to_encode),
                        remainder = "passthrough"
                        )

In [242]:
#applying random-forest
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(
                      min_samples_leaf=50,
                      n_estimators=150,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=-1,
                      random_state=seed,
                      max_features='auto')

In [243]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(col_trans, rf_regressor)
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['cloud_geo_iso1',
                                                   'cloud_geo_iso2'])])),
                ('randomforestregressor',
                 RandomForestRegressor(min_samples_leaf=50, n_estimators=150,
                                       n_jobs=-1, oob_score=True,
                                       random_state=50))])

In [248]:
#Doing the prediction on validation set

y_pred = pipe.predict(X_test)
print(y_pred)


[ 55.75        71.05       107.60339432 ...  17.74437093  10.25
 139.75      ]


In [245]:
# Doing the prediction on the test set (Our test set has no 'AWS.ap-east-1' region, 
# so we are testing our data by introducing new region)
y_pred = pipe.predict(test)
print(y_pred)
print(test['latency_ms'].to_list())


[156.53654943 155.39290217 153.86985689 155.51885838 155.51885838
 115.88488703 155.51885838 155.51885838 154.05898535 115.80011723
 115.86549501 115.42645111 114.99882357 114.90667001 155.51885838
 115.26898829 115.34897371 114.61342007 116.40876851 113.50346066
 115.42645111 153.06686397 155.51885838 114.61342007 154.29083141
 155.51885838 115.42645111 156.27491309 114.99882357 115.60453605
 153.80034068 155.51885838 155.51885838 153.9606071  114.62032293
 115.86549501 114.92010893 114.9247882  154.29083141 153.33788366
 154.29083141 153.22534867 116.40876851 115.88488703 115.26898829
 115.20307283 116.40876851 118.45782215 113.61071223 114.61342007
 153.86985689 155.99327253 114.54001076 153.54968517 155.51885838
 115.42645111 155.51885838 117.03326442 115.60453605 116.35350097
 156.03178791 154.29083141 155.51885838 156.72267873 153.33788366
 153.9606071 ]
[156.55, 154.6, 153.85, 155.6, 155.0, 115.9, 155.4, 155.1, 154.05, 115.8, 115.85, 115.45, 115.0, 114.85, 155.45, 115.25, 115.35

In [44]:
# Finding the accuracy of the model for test on 'AWS.ap-east-1' region
from sklearn import metrics
from sklearn.metrics import r2_score

print('R squared:', r2_score(test['latency_ms'], y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(test['latency_ms'], y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(test['latency_ms'], y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test['latency_ms'], y_pred)))

R squared: 0.9998486294774691
Mean Absolute Error: 0.10602017901887491
Mean Squared Error: 0.05847507885412941
Root Mean Squared Error: 0.24181620883251273


# =========================================================