In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


In [2]:
merged_df = pd.read_csv('data/cleaned_latency_data.csv')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7954 entries, 0 to 7953
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   timestamp             7954 non-null   object 
 1   sat_id                7954 non-null   int64  
 2   sat_name              7954 non-null   object 
 3   latency_ms            7954 non-null   float64
 4   station_id            7954 non-null   object 
 5   station_name          7954 non-null   object 
 6   latitude              7954 non-null   float64
 7   longitude             7954 non-null   float64
 8   elevation_m           7954 non-null   int64  
 9   antenna_gain_dBi      7954 non-null   int64  
 10  f107_flux             7954 non-null   int64  
 11  ap_index              7954 non-null   int64  
 12  day_of_week           7954 non-null   int64  
 13  day_of_month          7954 non-null   int64  
 14  month                 7954 non-null   int64  
 15  hour                 

In [13]:
# Define Features and Target
X = merged_df.drop(columns=['latency_ms', 'timestamp', 'sat_name', 'station_id', 'station_name', 'sat_id'])
y = merged_df['latency_ms']
y

0       8.866683
1       8.097588
2       7.479486
3       7.470059
4       4.922811
          ...   
7949    6.760311
7950    5.507390
7951    3.447139
7952    6.584457
7953    6.669230
Name: latency_ms, Length: 7954, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)


In [15]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Random Forest Baseline Performance:')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R2 Score: {r2:.2f}')

Random Forest Baseline Performance:
Mean Squared Error (MSE): 3.92
R2 Score: -0.02


In [12]:
importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
importances_sorted = importances.sort_values(ascending=False)

print('Feature Importances:\n', importances_sorted)

Feature Importances:
 sat_id              0.640985
hour                0.187833
longitude           0.039877
latitude            0.037451
elevation_m         0.029264
antenna_gain_dBi    0.029213
days_since_start    0.011755
day_of_month        0.005716
ap_index            0.004732
f107_flux           0.004706
day_of_week         0.004558
is_weekend          0.003911
month               0.000000
dtype: float64


In [17]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)


In [18]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print('Linear Regression Baseline Performance:')
print(f'Mean Squared Error (MSE): {mse_lr:.2f}')
print(f'R2 Score: {r2_lr:2f}')

Linear Regression Baseline Performance:
Mean Squared Error (MSE): 3.83
R2 Score: -0.000418


In [27]:
merged_df

Unnamed: 0,timestamp,sat_id,sat_name,latency_ms,station_id,station_name,latitude,longitude,elevation_m,antenna_gain_dBi,f107_flux,ap_index,day_of_week,day_of_month,month,hour,is_weekend,days_since_start,anomaly_flag,is_anomaly,rolling_mean_latency
0,2025-07-21 03:42:06.300576,44714,STARLINK-1008,8.866683,STAT003,"Madrid, Spain",40.4314,-3.7026,650,50,145,5,0,21,7,3,0,14,1,False,8.866683
1,2025-07-21 12:09:08.491392,44716,STARLINK-1010,8.097588,STAT005,"Cape Town, South Africa",-33.9249,18.4241,15,46,145,5,0,21,7,12,0,14,1,False,8.482135
2,2025-07-20 15:05:45.419424,44717,STARLINK-1011,7.479486,STAT005,"Cape Town, South Africa",-33.9249,18.4241,15,46,150,7,6,20,7,15,1,13,1,False,8.147919
3,2025-07-21 03:10:14.918304,44718,STARLINK-1012,7.470059,STAT004,"Seongnam, South Korea",37.4138,127.5183,80,47,145,5,0,21,7,3,0,14,1,False,7.978454
4,2025-07-21 14:00:01.000224,44719,STARLINK-1013,4.922811,STAT005,"Cape Town, South Africa",-33.9249,18.4241,15,46,145,5,0,21,7,14,0,14,1,False,7.367325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7949,2025-07-21 12:00:02.999808,64778,STARLINK-34537,6.760311,STAT005,"Cape Town, South Africa",-33.9249,18.4241,15,46,145,5,0,21,7,12,0,14,1,False,6.192166
7950,2025-07-21 06:00:00.999648,64779,STARLINK-34585,5.507390,STAT002,"Hawthorne, CA",33.9226,-118.3345,50,48,145,5,0,21,7,6,0,14,1,False,6.191747
7951,2025-07-21 12:00:02.999808,64780,STARLINK-34491,3.447139,STAT004,"Seongnam, South Korea",37.4138,127.5183,80,47,145,5,0,21,7,12,0,14,1,False,6.140010
7952,2025-07-21 06:00:00.999648,64781,STARLINK-34582,6.584457,STAT003,"Madrid, Spain",40.4314,-3.7026,650,50,145,5,0,21,7,6,0,14,1,False,6.147456


In [3]:
selected_features = [
    'hour',
    'msg_count_hour',
    'msg_count_sat_hour',
    'rolling_latency_50',
    'inter_arrival_time',
    'cumulative_msg_count',
    'f107_flux',
    'ap_index'
]

X = merged_df[selected_features]
y = merged_df['latency_ms']


In [4]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Optional: Get feature names
feature_names = poly.get_feature_names_out(selected_features)


In [5]:
from sklearn.model_selection import cross_val_score

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

scores = cross_val_score(rf_model, X_poly, y, cv=5, scoring='r2')

print("Cross-Validated R² Scores:", scores)
print(f"Average R²: {scores.mean():.2f}")


Cross-Validated R² Scores: [-0.27005184 -0.12205669 -0.08372133 -0.05400541 -0.06145978]
Average R²: -0.12


In [6]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0)
scores_ridge = cross_val_score(ridge_model, X_poly, y, cv=5, scoring='r2')

print("Ridge Regression Cross-Validated R² Scores:", scores_ridge)
print(f"Average R² (Ridge): {scores_ridge.mean():.2f}")


Ridge Regression Cross-Validated R² Scores: [-4.46740329e+08  2.11566230e-02 -1.12448873e-02  2.58581149e-02
  8.40357975e-03]
Average R² (Ridge): -89348065.74


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
