# Optimise_Task 2

In [1]:
!pip install numpy pandas matplotlib seaborn scikit-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import geopandas as gpd
import folium
import osmnx as ox
from shapely.geometry import Point

In [3]:
# Load the raw session export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run on another machine without editing it.
df = pd.read_csv(filepath_or_buffer=r'D:\BIG_DATA\external_data\optimise_task2.csv')

In [4]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,gpsLat,gpsLong,type,physicalPosition,session.averagePower,session.creationTime,session.creationTime.1,session.type,session.position,carChargeParameter.maxPower,carChargeParameter.current,numberStacks,chargerId
0,0,43.075102,13.842823,CCS2_400,4.0,,2024-07-14T06:01:27.461303+00:00,2024-07-14T06:01:27.461303+00:00,,2,0.0,500.0,4.0,13418.0
1,1,44.51834,11.21432,CCS2_400,4.0,,2024-07-14T06:01:29.119003+00:00,2024-07-14T06:01:29.119003+00:00,,2,0.0,200.0,4.0,8118.0
2,2,41.891882,12.758657,CCS2_400,1.0,,2024-07-14T06:01:58.304994+00:00,2024-07-14T06:01:58.304994+00:00,,1,0.0,500.0,4.0,13448.0
3,3,46.206436,13.048325,CCS2_400,1.0,,2024-07-14T06:03:00.278437+00:00,2024-07-14T06:03:00.278437+00:00,,1,0.0,500.0,4.0,19528.0
4,4,45.7293,9.02912,CCS2_400,4.0,,2024-07-14T06:03:25.545657+00:00,2024-07-14T06:03:25.545657+00:00,,2,129000.0,300.0,4.0,9839.0


In [5]:
# Drop the 'Unnamed: 0' column (presumably a row index written out by an
# earlier CSV export -- it mirrors the row number in the preview above).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op instead of
# raising KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Count the missing values in each column of the freshly loaded data
print(df.isna().sum())

gpsLat                           464
gpsLong                          464
type                               0
physicalPosition                  35
session.averagePower           13442
session.creationTime               0
session.creationTime.1             0
session.type                   13442
session.position                   0
carChargeParameter.maxPower       80
carChargeParameter.current        80
numberStacks                    1231
chargerId                       1231
dtype: int64


In [7]:
# Remove every column that contains nothing but NaN
df = df.dropna(axis='columns', how='all')

In [8]:
# Re-count missing values now that the all-NaN columns are gone
print(df.isna().sum())

gpsLat                          464
gpsLong                         464
type                              0
physicalPosition                 35
session.creationTime              0
session.creationTime.1            0
session.position                  0
carChargeParameter.maxPower      80
carChargeParameter.current       80
numberStacks                   1231
chargerId                      1231
dtype: int64


In [9]:
# Keep only rows that have both a latitude and a longitude
df = df.dropna(subset=['gpsLat', 'gpsLong'], how='any')

In [10]:
# Re-count missing values after removing rows without GPS coordinates
print(df.isna().sum())

gpsLat                           0
gpsLong                          0
type                             0
physicalPosition                35
session.creationTime             0
session.creationTime.1           0
session.position                 0
carChargeParameter.maxPower     76
carChargeParameter.current      76
numberStacks                   767
chargerId                      767
dtype: int64


In [11]:
# Keep only rows that have a value for 'carChargeParameter.current'
df = df.dropna(subset=['carChargeParameter.current'], how='any')

In [12]:
# Re-count missing values after removing rows without a current reading
print(df.isna().sum())

gpsLat                           0
gpsLong                          0
type                             0
physicalPosition                35
session.creationTime             0
session.creationTime.1           0
session.position                 0
carChargeParameter.maxPower      0
carChargeParameter.current       0
numberStacks                   760
chargerId                      760
dtype: int64


In [13]:
# Keep only rows that have a value for 'physicalPosition'
df = df.dropna(subset=['physicalPosition'], how='any')

In [14]:
# Keep only rows that have both 'numberStacks' and 'chargerId'
df = df.dropna(subset=['numberStacks', 'chargerId'], how='any')

In [15]:
# Final missing-value count after all row and column drops
print(df.isna().sum())

gpsLat                         0
gpsLong                        0
type                           0
physicalPosition               0
session.creationTime           0
session.creationTime.1         0
session.position               0
carChargeParameter.maxPower    0
carChargeParameter.current     0
numberStacks                   0
chargerId                      0
dtype: int64


In [16]:
# Remove 'session.creationTime.1'; in the preview above it carries the same
# values as 'session.creationTime'.
df = df.drop(columns=['session.creationTime.1'])

In [17]:
# Parse 'session.creationTime' from text into pandas datetimes
df = df.assign(**{'session.creationTime': pd.to_datetime(df['session.creationTime'])})

In [18]:
# Calendar month (1-12) of each session; the year is deliberately discarded,
# so the same month of different years is pooled together.
df['month'] = df['session.creationTime'].dt.month

# Mean charging current for every (charger, month) pair
monthly_average = (
    df.groupby(['chargerId', 'month'])['carChargeParameter.current']
    .mean()
    .reset_index()
    .rename(columns={'carChargeParameter.current': 'average_current_per_month'})
)

# One row per charger: take the first recorded value of each attribute,
# on the assumption that these attributes are constant for a given charger.
# NOTE(review): 'carChargeParameter.maxPower' is 0.0 in several sessions in the
# preview above, so taking the first value may depend on row order -- confirm
# that 'first' is the intended aggregation for that column.
charger_data = (
    df.groupby('chargerId')[[
        'gpsLat',
        'gpsLong',
        'type',
        'physicalPosition',
        'session.position',
        'carChargeParameter.maxPower',
        'numberStacks',
    ]]
    .first()
    .reset_index()
)

In [19]:
# Attach the per-month averages to the charger attributes
# (left join: one output row per charger and month).
final_dataset = charger_data.merge(monthly_average, how='left', on='chargerId')

In [20]:
# Preview the first five rows of the charger-by-month table
final_dataset.head(n=5)

Unnamed: 0,chargerId,gpsLat,gpsLong,type,physicalPosition,session.position,carChargeParameter.maxPower,numberStacks,month,average_current_per_month
0,6590.0,44.66229,10.85797,CCS2_400,1.0,1,130000.0,4.0,7,479.057143
1,6590.0,44.66229,10.85797,CCS2_400,1.0,1,130000.0,4.0,8,449.693878
2,6708.0,42.49927,12.31106,CCS2_400,1.0,1,326000.0,4.0,7,429.75
3,6708.0,42.49927,12.31106,CCS2_400,1.0,1,326000.0,4.0,8,409.259259
4,6709.0,44.66229,10.85801,CCS2_400,1.0,1,326000.0,4.0,7,435.372549


In [21]:
# Calculate the average current per charger across all months.
# This is the unweighted mean of the monthly means, so every month counts
# equally regardless of how many sessions it contains.
#
# The figures are derived from `monthly_average` and `charger_data` rather
# than from `final_dataset`, because this cell overwrites `final_dataset` with
# a table that no longer has 'average_current_per_month'. Reading from it made
# the cell fail with a KeyError on a second run; the result is unchanged.
average_current = (
    monthly_average.groupby('chargerId')['average_current_per_month']
    .mean()
    .reset_index()
    .rename(columns={'average_current_per_month': 'average_current_all_months'})
)

# Merge the average current data with the one-row-per-charger attributes
final_dataset = pd.merge(charger_data, average_current, on='chargerId', how='left')

In [22]:
# Show the first five rows of the per-charger dataset
final_dataset.head(n=5)

Unnamed: 0,chargerId,gpsLat,gpsLong,type,physicalPosition,session.position,carChargeParameter.maxPower,numberStacks,average_current_all_months
0,6590.0,44.66229,10.85797,CCS2_400,1.0,1,130000.0,4.0,464.37551
1,6708.0,42.49927,12.31106,CCS2_400,1.0,1,326000.0,4.0,419.50463
2,6709.0,44.66229,10.85801,CCS2_400,1.0,1,326000.0,4.0,459.573484
3,6710.0,42.49929,12.31107,CCS2_400,4.0,2,326000.0,4.0,472.371078
4,6724.0,45.321012,9.375258,CCS2_400,1.0,1,129000.0,4.0,395.642547


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the per-charger dataset
final_dataset.describe()

Unnamed: 0,chargerId,gpsLat,gpsLong,physicalPosition,session.position,carChargeParameter.maxPower,numberStacks,average_current_all_months
count,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0
mean,20667.908676,43.762222,11.654207,2.383562,1.461187,112775.068493,3.972603,447.280451
std,20253.174315,1.601791,2.183586,1.4989,0.499633,125837.364065,0.233006,59.855966
min,6590.0,40.837697,8.44112,1.0,1.0,0.0,1.0,0.0
25%,9919.5,42.202297,9.39379,1.0,1.0,0.0,4.0,424.291074
50%,16938.0,44.392329,11.57079,1.0,1.0,50000.0,4.0,446.653857
75%,19527.5,45.074655,13.258022,4.0,2.0,250000.0,4.0,470.086953
max,83836.0,46.499058,16.777855,4.0,2.0,337000.0,4.0,799.0


In [24]:
# Inspect the dtype of each column in the per-charger dataset
final_dataset.dtypes

chargerId                      float64
gpsLat                         float64
gpsLong                        float64
type                            object
physicalPosition               float64
session.position                 int64
carChargeParameter.maxPower    float64
numberStacks                   float64
average_current_all_months     float64
dtype: object