# hypercarge_locations 

In [57]:
# Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
# Load the raw charger-location export.
# Forward slashes keep this relative path valid on Windows, Linux and macOS alike;
# the previous backslash-separated literal only resolved on Windows.
hypercarge_location = pd.read_csv('dataset/full/hypercarge_locations.csv')

# Preview the first few rows
hypercarge_location.head()

Unnamed: 0,chargerId,serialNumber,sshurl,sshPort,gpsLat,gpsLong,numberStacks,chassis,emergencyStop,doorContactSwitch,...,corporationName,operatorName,hasVollmacht,hasCommissioningProtocol,surroundingChargers,isFavourite,sessionLiveViewEnabled,finishedEol,lendeeId,lendeeName
0,20800,23BZ1509B,10.245.161.30,,45.04139,11.710673,4,HYC_300,False,True,...,Free To X S.p.A.,,False,False,"[{'chargerId': 6772, 'gpsLat': 44.66244, 'gpsL...",False,False,True,,
1,17737,22BZ5330B,10.245.22.116,,42.029548,11.962168,4,HYC_300,False,True,...,Free To X S.p.A.,,False,False,"[{'chargerId': 6771, 'gpsLat': 42.49917, 'gpsL...",False,False,True,,
2,83776,24BZ1716B,10.246.181.216,,43.8127,11.154,4,HYC_300,False,True,...,Free To X S.p.A.,,False,False,"[{'chargerId': 6772, 'gpsLat': 44.66244, 'gpsL...",False,,True,,
3,19519,23BZ1033B,10.245.29.175,,42.034378,12.648893,4,HYC_300,False,True,...,Free To X S.p.A.,,False,False,"[{'chargerId': 6771, 'gpsLat': 42.49917, 'gpsL...",False,False,True,,
4,83781,24BZ1721B,10.246.186.62,,,,4,HYC_300,False,True,...,Free To X S.p.A.,,False,False,,False,,True,,


In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows gpsLat min 0.0 / max 57.0 and gpsLong min -50.0,
# which look like placeholder coordinates rather than real sites — verify before using GPS data.
hypercarge_location.describe()

Unnamed: 0,chargerId,sshPort,gpsLat,gpsLong,numberStacks,simIccid,serviceExpirationDate,softwareExpirationDate,locationZipCode,calibrationLawOrdered,...,defaultSoftwareVersion,officialEichrechtCompletion,endClientId,distributorId,corporationId,operatorId,endClientName,operatorName,lendeeId,lendeeName
count,374.0,0.0,259.0,259.0,374.0,372.0,0.0,0.0,153.0,0.0,...,0.0,0.0,0.0,374.0,374.0,0.0,0.0,0.0,0.0,0.0
mean,43247.780749,,41.117911,6.1439,3.823529,8.94502e+18,,,35724.941176,,...,,,,184.350267,186.018717,,,,,
std,33380.514195,,9.314472,15.581104,0.42177,7217696.0,,,24216.932744,,...,,,,84.130167,85.055996,,,,,
min,6590.0,,0.0,-50.0,1.0,8.94502e+18,,,10.0,,...,,,,180.0,180.0,,,,,
25%,13418.25,,41.849835,9.046675,4.0,8.94502e+18,,,20045.0,,...,,,,180.0,180.0,,,,,
50%,19747.5,,43.818792,11.419845,4.0,8.94502e+18,,,31032.0,,...,,,,180.0,180.0,,,,,
75%,83748.75,,44.962939,12.903881,4.0,8.94502e+18,,,50019.0,,...,,,,180.0,180.0,,,,,
max,83842.0,,57.0,16.777855,4.0,8.94502e+18,,,83100.0,,...,,,,1807.0,1807.0,,,,,


## Handling missing values

In [60]:
# Per-column tally of missing entries (isna is the canonical name of isnull)
hypercarge_location.isna().sum()

chargerId                   0
serialNumber                0
sshurl                      0
sshPort                   374
gpsLat                    115
                         ... 
isFavourite                 0
sessionLiveViewEnabled    160
finishedEol                 0
lendeeId                  374
lendeeName                374
Length: 74, dtype: int64

In [61]:
# Count rows that are exact duplicates of an earlier row (all columns are compared)
hypercarge_location.duplicated().sum()

0

The dataset has 374 rows, so a feature with 374 missing values is completely empty. I deleted all such features.

In [62]:
# Count missing values per column of the raw frame
missing_value_count = hypercarge_location.isnull().sum()

# A column is completely empty when its missing count equals the number of rows.
# Deriving the row count from the frame (instead of hard-coding 374) keeps this
# cell correct if the export is refreshed with a different number of chargers.
total_rows = len(hypercarge_location)
columns_to_drop = missing_value_count[missing_value_count == total_rows].index

# Drop those columns into a new frame; the raw frame stays untouched
hypercarge_location_cleaned = hypercarge_location.drop(columns=columns_to_drop)

# Check the result
print(f"Dropped columns: {list(columns_to_drop)}")
print(hypercarge_location_cleaned.head())

Dropped columns: ['sshPort', 'serviceExpirationDate', 'softwareExpirationDate', 'calibrationLawOrdered', 'lastMaintenance', 'defaultSoftwareVersion', 'officialEichrechtCompletion', 'endClientId', 'operatorId', 'endClientName', 'operatorName', 'lendeeId', 'lendeeName']
   chargerId serialNumber          sshurl     gpsLat    gpsLong  numberStacks  \
0      20800    23BZ1509B   10.245.161.30  45.041390  11.710673             4   
1      17737    22BZ5330B   10.245.22.116  42.029548  11.962168             4   
2      83776    24BZ1716B  10.246.181.216  43.812700  11.154000             4   
3      19519    23BZ1033B   10.245.29.175  42.034378  12.648893             4   
4      83781    24BZ1721B   10.246.186.62        NaN        NaN             4   

   chassis  emergencyStop  doorContactSwitch standAloneOrBackend  ...  \
0  HYC_300          False               True             Backend  ...   
1  HYC_300          False               True             Backend  ...   
2  HYC_300          False

The new dataset, "hypercarge_location_cleaned", is the result of deleting all features that have 374 missing values.

In [63]:
# Missing-value count per remaining column, restricted to columns that
# actually contain at least one missing entry
missing_values = (
    hypercarge_location_cleaned
    .isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

# Show the affected features with their counts
print(missing_values)

gpsLat                    115
gpsLong                   115
simIccid                    2
commissioningDate         107
chargePointIdentity       113
customerIccid             336
warrantyExpiration        160
websocketUrl              114
locationStreet            221
locationTown              221
locationZipCode           221
locationProvince          221
locationCountry           147
locationUpdateNote        252
status_hycErrorCode        85
status_text               213
simActivationDate          35
simLastRequest             35
hymaintVersion             48
surroundingChargers       115
sessionLiveViewEnabled    160
dtype: int64


There are still a lot of features with too many missing values. Continue dropping features with > 200 missing values.

In [64]:
# Drop every column of the raw frame that has more than MISSING_THRESHOLD missing values.
# (The previous comment said "exactly 374", which no longer matched the condition.)
MISSING_THRESHOLD = 200

# Recompute the per-column counts here so the cell does not silently depend on a
# variable left over from an earlier cell.
missing_value_count = hypercarge_location.isnull().sum()
columns_to_drop = missing_value_count[missing_value_count > MISSING_THRESHOLD].index

# Start again from the raw frame: the completely empty columns also exceed the
# threshold, so this single drop supersedes the earlier all-empty drop.
hypercarge_location_cleaned = hypercarge_location.drop(columns=columns_to_drop)

# Check the result
print(f"Dropped columns: {list(columns_to_drop)}")
print(hypercarge_location_cleaned.head())

Dropped columns: ['sshPort', 'customerIccid', 'serviceExpirationDate', 'softwareExpirationDate', 'locationStreet', 'locationTown', 'locationZipCode', 'locationProvince', 'locationUpdateNote', 'calibrationLawOrdered', 'status_text', 'lastMaintenance', 'defaultSoftwareVersion', 'officialEichrechtCompletion', 'endClientId', 'operatorId', 'endClientName', 'operatorName', 'lendeeId', 'lendeeName']
   chargerId serialNumber          sshurl     gpsLat    gpsLong  numberStacks  \
0      20800    23BZ1509B   10.245.161.30  45.041390  11.710673             4   
1      17737    22BZ5330B   10.245.22.116  42.029548  11.962168             4   
2      83776    24BZ1716B  10.246.181.216  43.812700  11.154000             4   
3      19519    23BZ1033B   10.245.29.175  42.034378  12.648893             4   
4      83781    24BZ1721B   10.246.186.62        NaN        NaN             4   

   chassis  emergencyStop  doorContactSwitch standAloneOrBackend  ...  \
0  HYC_300          False               True

In [65]:
# Tally missing entries per column after the threshold-based drop
per_column_nulls = hypercarge_location_cleaned.isnull().sum()

# Keep only the columns that still contain gaps
missing_values = per_column_nulls[per_column_nulls.gt(0)]

# Show the affected features with their counts
print(missing_values)

gpsLat                    115
gpsLong                   115
simIccid                    2
commissioningDate         107
chargePointIdentity       113
warrantyExpiration        160
websocketUrl              114
locationCountry           147
status_hycErrorCode        85
simActivationDate          35
simLastRequest             35
hymaintVersion             48
surroundingChargers       115
sessionLiveViewEnabled    160
dtype: int64


simActivationDate and simLastRequest both have 35 missing values. I want to check whether simLastRequest is also NaN whenever simActivationDate is NaN.

In [66]:
# Rows whose SIM activation date is missing
activation_date_missing = hypercarge_location_cleaned['simActivationDate'].isnull()

# Within those rows, tally whether simLastRequest is missing as well
nan_check = (
    hypercarge_location_cleaned
    .loc[activation_date_missing, 'simLastRequest']
    .isnull()
    .value_counts()
)

# Display the results
print(nan_check)

simLastRequest
True    35
Name: count, dtype: int64


There is also a Boolean feature, simActivated. I want to check whether both simActivationDate and simLastRequest are NaN when simActivated is False.

In [67]:
# Subset of chargers whose SIM is flagged as not activated
sim_not_activated = hypercarge_location_cleaned['simActivated'].eq(False)
nan_check_sim_activated_false = hypercarge_location_cleaned.loc[sim_not_activated]

# Within that subset, flag rows where both SIM timestamp columns are missing
both_sim_dates_missing = (
    nan_check_sim_activated_false[['simActivationDate', 'simLastRequest']]
    .isnull()
    .all(axis=1)
)
count_nan_rows = int(both_sim_dates_missing.sum())

# Display the result
print("Number of rows where simActivated is False and both simActivationDate and simLastRequest are NaN:", count_nan_rows)

Number of rows where simActivated is False and both simActivationDate and simLastRequest are NaN: 35


Since both simActivationDate and simLastRequest are NaN when simActivated is False, their utility in the analysis may be limited => Drop these 2 features.

In [68]:
# Remove the two SIM timestamp columns; reassigning the result (rather than
# mutating with inplace=True) makes it explicit which frame this step produces
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(
    columns=['simActivationDate', 'simLastRequest']
)

I want to check whether simIccid is an important feature. First, let's see how many distinct values it has.

In [69]:
# Number of distinct SIM ICCIDs (missing values are not counted).
# NOTE(review): simIccid appears in the describe() output as a float (8.94502e+18),
# so ICCIDs that differ only in their low-order digits may have collapsed to the same
# value and this count may be understated — confirm by reading the column as a string.
sim_iccid = hypercarge_location_cleaned['simIccid']
unique_iccid_values_count = sim_iccid.nunique()

# Display the count of unique values
print("Number of unique values in simIccid:", unique_iccid_values_count)

Number of unique values in simIccid: 107


There are 107 distinct values, which is too many for this identifier to be useful as a categorical feature, so it should be fine to delete it.

In [70]:
# Remove the simIccid feature from the dataset (reassign instead of inplace=True)
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(columns=['simIccid'])

Let's see whether chargePointIdentity is worth analysing.

In [71]:
# Number of distinct charge-point identities (missing values are not counted)
charge_point_identity = hypercarge_location_cleaned['chargePointIdentity']
unique_charge_point_identity_count = charge_point_identity.nunique()

# Display the count of unique values
print("Number of unique values in chargePointIdentity:", unique_charge_point_identity_count)

Number of unique values in chargePointIdentity: 258


This feature also has too many distinct IDs. Since we already have an ID for each charger (chargerId), I don't think it is important to include this feature in the analysis.

In [72]:
# Remove the chargePointIdentity feature from the dataset (reassign instead of inplace=True)
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(
    columns=['chargePointIdentity']
)

warrantyExpiration (160 missing), websocketUrl (114), commissioningDate (107) and hymaintVersion (48) all have many missing values, and they describe operational and maintenance aspects rather than directly influencing customer behaviour. It is reasonable to delete these features.

In [73]:
# Remove the operational/maintenance features: warrantyExpiration, websocketUrl,
# hymaintVersion and commissioningDate (reassign instead of inplace=True)
operational_features = ['warrantyExpiration', 'websocketUrl', 'hymaintVersion', 'commissioningDate']
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(columns=operational_features)

Diagnose status_hycErrorCode

In [74]:
# Frequency of each distinct status_hycErrorCode value (missing values excluded)
hyc_error_code = hypercarge_location_cleaned['status_hycErrorCode']
unique_hyc_error_codes = hyc_error_code.value_counts()

# One entry per distinct value, so the length is the number of unique values
unique_hyc_error_codes_count = len(unique_hyc_error_codes)

print("Number of unique values in status_hycErrorCode:", unique_hyc_error_codes_count)
print("Unique values and their counts:\n", unique_hyc_error_codes)

Number of unique values in status_hycErrorCode: 2
Unique values and their counts:
 status_hycErrorCode
NoError        283
Door Opened      6
Name: count, dtype: int64


In [75]:
# Label missing status_hycErrorCode entries as "Unknown" so they form their own category
hypercarge_location_cleaned = hypercarge_location_cleaned.fillna(
    {'status_hycErrorCode': 'Unknown'}
)

Diagnose sessionLiveViewEnabled

In [76]:
# Frequency of each distinct sessionLiveViewEnabled value (missing values excluded)
session_live_view = hypercarge_location_cleaned['sessionLiveViewEnabled']
unique_sessionLiveViewEnabled = session_live_view.value_counts()

# One entry per distinct value, so the length is the number of unique values
unique_sessionLiveViewEnabled_count = len(unique_sessionLiveViewEnabled)

print("Number of unique values in sessionLiveViewEnabled:", unique_sessionLiveViewEnabled_count)
print("Unique values and their counts:\n", unique_sessionLiveViewEnabled)

Number of unique values in sessionLiveViewEnabled: 1
Unique values and their counts:
 sessionLiveViewEnabled
False    214
Name: count, dtype: int64


Since every recorded value is False, it could be reasonable to treat a missing value as True in this context.

In [77]:
# Cast to the nullable boolean dtype first, then replace the missing entries with True
hypercarge_location_cleaned['sessionLiveViewEnabled'] = (
    hypercarge_location_cleaned['sessionLiveViewEnabled']
    .astype('boolean')
    .fillna(True)
)

Since FreeX currently only operates in Italy => safely delete locationCountry

In [80]:
# Remove locationCountry (reassign instead of inplace=True)
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(columns=['locationCountry'])

Diagnose gpsLat gpsLong and surroundingChargers:
They all have 115 missing values

In [82]:
# Rows in which all three location-related features are missing at the same time
location_features = ['gpsLat', 'gpsLong', 'surroundingChargers']
all_location_missing = hypercarge_location_cleaned[location_features].isnull().all(axis=1)
missing_combination = hypercarge_location_cleaned[all_location_missing]

# Number of rows that meet this condition
count_missing_combination = len(missing_combination)

# Display the result
print("Number of rows where gpsLat, gpsLong, and surroundingChargers are all NaN:", count_missing_combination)

Number of rows where gpsLat, gpsLong, and surroundingChargers are all NaN: 115


We don't know how to fill the NaN values for these features. For now, keep them as they are.

In [81]:
# Final check: which features still contain missing values, and how many
remaining_nulls = hypercarge_location_cleaned.isnull().sum()

# Counts are never negative, so "non-zero" selects exactly the columns with gaps
missing_values = remaining_nulls[remaining_nulls != 0]

# Show the affected features with their counts
print(missing_values)

gpsLat                 115
gpsLong                115
surroundingChargers    115
dtype: int64


## Diagnose single feature

In [87]:
# Data type of every feature in the cleaned dataset
feature_types = hypercarge_location_cleaned.dtypes

# Distinct (non-missing) value count of every feature, keyed by feature name
unique_value_counts = hypercarge_location_cleaned.nunique().to_dict()

# One row per feature: name, dtype and number of distinct values
combined_info = pd.DataFrame({
    'Feature': feature_types.index,
    'Type': feature_types.values,
    'Unique Value Count': [unique_value_counts[name] for name in feature_types.index],
})

# Display the combined DataFrame
print(combined_info)

                      Feature     Type  Unique Value Count
0                   chargerId    int64                 374
1                serialNumber   object                 374
2                      sshurl   object                 374
3                      gpsLat  float64                 232
4                     gpsLong  float64                 232
5                numberStacks    int64                   3
6                     chassis   object                   3
7               emergencyStop     bool                   1
8           doorContactSwitch     bool                   2
9         standAloneOrBackend   object                   1
10                   isPublic     bool                   1
11    prohibitSoftwareupdates     bool                   1
12         needsSpecialAccess     bool                   1
13             notServiceable     bool                   1
14              deadManSwitch     bool                   2
15                  hwVersion    int64                  

List of deleted features and reason(s):
1. Technical specifications that do not directly influence customer behaviour: serialNumber, sshurl, outletList, allNotes
2. Features with only one distinct value, which therefore add nothing to the analysis: standAloneOrBackend, isPublic, prohibitSoftwareupdates, needsSpecialAccess, notServiceable, hwVersion, isRemoteLocation, status_position, status_isPrivate, excludeFromStatistics, isUtilityExecutionBlocked, creditCardTerminalActive, enableClientWebinterface, hasVollmacht, hasCommissioningProtocol, finishedEol


In [88]:
# Technical specifications that do not directly influence customer behaviour
technical_features = ['serialNumber', 'sshurl', 'outletList', 'allNotes']

# Features listed as having a single distinct value.
# NOTE(review): emergencyStop also shows a single unique value in the summary
# output but is not in this list — confirm whether keeping it is intentional.
single_value_features = [
    'standAloneOrBackend', 'isPublic', 'prohibitSoftwareupdates',
    'needsSpecialAccess', 'notServiceable', 'hwVersion',
    'isRemoteLocation', 'status_position', 'status_isPrivate',
    'excludeFromStatistics', 'isUtilityExecutionBlocked',
    'creditCardTerminalActive', 'enableClientWebinterface',
    'hasVollmacht', 'hasCommissioningProtocol', 'finishedEol',
]

features_to_delete = technical_features + single_value_features

# Remove the listed features (reassign instead of inplace=True)
hypercarge_location_cleaned = hypercarge_location_cleaned.drop(columns=features_to_delete)

In [89]:
# Re-inspect the remaining features after the deletion above
feature_types = hypercarge_location_cleaned.dtypes

# Distinct (non-missing) value count per feature, collected in column order
unique_value_counts = {}
for feature in hypercarge_location_cleaned.columns:
    unique_value_counts[feature] = hypercarge_location_cleaned[feature].nunique()

# Assemble the overview table: one row per feature
overview_columns = {
    'Feature': feature_types.index,
    'Type': feature_types.values,
    'Unique Value Count': [unique_value_counts[feature] for feature in feature_types.index],
}
combined_info = pd.DataFrame(overview_columns)

# Display the combined DataFrame
print(combined_info)

                   Feature     Type  Unique Value Count
0                chargerId    int64                 374
1                   gpsLat  float64                 232
2                  gpsLong  float64                 232
3             numberStacks    int64                   3
4                  chassis   object                   3
5            emergencyStop     bool                   1
6        doorContactSwitch     bool                   2
7            deadManSwitch     bool                   2
8            telemetrySave    int64                   3
9         hymaintInstalled     bool                   2
10          lastSignOfLife   object                 142
11        status_errorcode   object                   3
12     status_hycErrorCode   object                   3
13           status_status   object                   5
14       status_updatetime   object                 362
15            simActivated     bool                   2
16                isActive     bool             

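1. Do some chargers share the same gpsLat and gpsLong? (374 - 232 = 142 rows without their own unique value; 115 of those are missing values, which leaves up to 142 - 115 = 27 chargers that repeat an existing coordinate.)
2. Do some chargers share the same surroundingChargers? (374 - 235 = 139; after removing the 115 missing values, up to 139 - 115 = 24 chargers repeat an existing value.)
(These counts take the missing values of gpsLat, gpsLong and surroundingChargers into consideration.)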