In [32]:
# Mount Google Drive under /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Membaca data dari file CSV di Google Drive

In [33]:
import pandas as pd

# Read the electric-vehicle population CSV from the mounted Google Drive folder
file_path = "/content/drive/My Drive/GOOGLE COLAB/DATA SCIENCE/Week 10/Electric_Vehicle_Population_Data.csv"
data = pd.read_csv(file_path)

# Preview the first five rows to sanity-check the load
print(data.head(5))

   VIN (1-10)    County      City State  Postal Code  Model Year    Make  \
0  2T3YL4DV0E      King  Bellevue    WA      98005.0        2014  TOYOTA   
1  5YJ3E1EB6K      King   Bothell    WA      98011.0        2019   TESLA   
2  5UX43EU02S  Thurston   Olympia    WA      98502.0        2025     BMW   
3  JTMAB3FV5R  Thurston   Olympia    WA      98513.0        2024  TOYOTA   
4  5YJYGDEE8M    Yakima     Selah    WA      98942.0        2021   TESLA   

        Model                   Electric Vehicle Type  \
0        RAV4          Battery Electric Vehicle (BEV)   
1     MODEL 3          Battery Electric Vehicle (BEV)   
2          X5  Plug-in Hybrid Electric Vehicle (PHEV)   
3  RAV4 PRIME  Plug-in Hybrid Electric Vehicle (PHEV)   
4     MODEL Y          Battery Electric Vehicle (BEV)   

   Clean Alternative Fuel Vehicle (CAFV) Eligibility  Electric Range  \
0            Clean Alternative Fuel Vehicle Eligible           103.0   
1            Clean Alternative Fuel Vehicle Eligible    

Menampilkan jumlah baris dan kolom dalam dataset


In [34]:
# Report the dataset dimensions as a (rows, columns) tuple
print("Jumlah Baris dan Kolom : ", data.shape)

Jumlah Baris dan Kolom :  (232230, 17)


Menampilkan nama kolom dan tipe data

In [35]:
# Show the dtype pandas inferred for every column
print("\nColumns and Data Types:")
print(data.dtypes)


Columns and Data Types:
VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                       float64
Base MSRP                                            float64
Legislative District                                 float64
DOL Vehicle ID                                         int64
Vehicle Location                                      object
Electric Utility                                      object

Menampilkan jumlah missing value pada setiap kolom

In [36]:
# Count, per column, the values that commonly stand in for missing data
def count_special_values(df):
    """Summarise missing-data markers for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` and three count columns:
        ``'NaN/NULL/None'`` (cells flagged by ``isna``), ``'Empty String'``
        (cells equal to ``''``) and ``'Special Indicator (-9999)'`` (cells
        equal to ``-9999``).
    """
    marker_counts = {
        # NaN / None cells
        'NaN/NULL/None': df.isna().sum(),
        # Cells holding an empty string
        'Empty String': df.eq('').sum(),
        # Cells holding the -9999 sentinel
        'Special Indicator (-9999)': df.eq(-9999).sum(),
    }
    return pd.DataFrame(marker_counts)

# Print the per-column counts of missing-data markers for the loaded dataset
print(count_special_values(data))


                                                   NaN/NULL/None  \
VIN (1-10)                                                     0   
County                                                         4   
City                                                           4   
State                                                          0   
Postal Code                                                    4   
Model Year                                                     0   
Make                                                           0   
Model                                                          0   
Electric Vehicle Type                                          0   
Clean Alternative Fuel Vehicle (CAFV) Eligibility              0   
Electric Range                                                27   
Base MSRP                                                     27   
Legislative District                                         481   
DOL Vehicle ID                                  

Fungsi untuk menangani data yang memiliki missing value

In [38]:
# Count the NaN cells in every column; the imputation loops below read this Series
nan_count = data.isnull().sum()

# Decide whether a column's gaps are filled with its mode or with its mean
def get_replacement_value(column):
    """Return the value used to fill the missing entries of ``column``.

    Text-like columns (object, pandas string or categorical dtype) are filled
    with their most frequent value; every other dtype is filled with the mean
    of its non-missing values.

    Parameters
    ----------
    column : pandas.Series
        Column to inspect.

    Returns
    -------
    object or None
        The replacement value, or ``None`` when the column contains no
        non-missing value to derive one from.
    """
    non_missing = column.dropna()

    # Nothing to learn from a column that is entirely NaN
    if non_missing.empty:
        return None

    # Checking only ``dtype == 'object'`` misses pandas' dedicated string and
    # categorical dtypes, which would then reach ``.mean()`` and raise TypeError.
    is_text_like = (
        pd.api.types.is_object_dtype(column.dtype)
        or pd.api.types.is_string_dtype(column.dtype)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )
    if is_text_like:
        modes = non_missing.mode()
        # Positional access: take the first mode regardless of index labels
        return modes.iloc[0] if not modes.empty else None

    # Remaining dtypes: use the mean of the observed values
    return non_missing.mean()

# Columns that are always filled with their mode, handled in a second pass
mode_only_columns = ['Postal Code', 'Legislative District']

# First pass: every other column that has NaN gets the value chosen by
# get_replacement_value (mode for object columns, mean otherwise)
for column, missing_total in nan_count.items():
    if missing_total <= 0 or column in mode_only_columns:
        continue

    replacement_value = get_replacement_value(data[column])
    if replacement_value is None:  # no usable replacement, leave the column as is
        continue

    data[column] = data[column].fillna(replacement_value)
    print(f"Mengganti NaN di kolom '{column}' dengan {replacement_value}")

# Second pass: 'Postal Code' and 'Legislative District' are filled with their mode
for column in mode_only_columns:
    if column not in data.columns or nan_count[column] <= 0:
        continue

    mode_value = data[column].mode().dropna()
    replacement_value = None if mode_value.empty else mode_value[0]
    if replacement_value is None:
        continue

    data[column] = data[column].fillna(replacement_value)
    print(f"Mengganti NaN di kolom '{column}' dengan {replacement_value}")

# Persist the imputed data to a new CSV file (without the index column)
output_file_path = 'PreProcessed_Electric_Vehicle_Data.csv'
data.to_csv(output_file_path, index=False)

# Confirm where the processed data was written
print(f"\nData yang telah diproses disimpan di: {output_file_path}")



Data yang telah diproses disimpan di: PreProcessed_Electric_Vehicle_Data.csv


Cek data setelah missing value-nya diatasi

In [39]:
# Check whether the data is now free of missing values
import pandas as pd

# Reload the imputed dataset from the CSV written by the previous step
file_path = 'PreProcessed_Electric_Vehicle_Data.csv'
data2 = pd.read_csv(file_path)

In [40]:
# Count the NaN cells in every column of the reloaded data
# (this rebinds nan_count, which previously held the counts for `data`)
nan_count = data2.isna().sum()

# Show the result
print(nan_count)

VIN (1-10)                                           0
County                                               0
City                                                 0
State                                                0
Postal Code                                          0
Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Base MSRP                                            0
Legislative District                                 0
DOL Vehicle ID                                       0
Vehicle Location                                     0
Electric Utility                                     0
2020 Census Tract                                    0
dtype: int64


Pengecekan jumlah nilai unik pada masing-masing kolom

In [41]:

# Print how many distinct values each column holds
print("\nJumlah nilai unik per kolom:")
print(data2.nunique())

# Print the column labels of the reloaded frame
print("\nNama kolom:")
print(data2.columns)


Jumlah nilai unik per kolom:
VIN (1-10)                                            13560
County                                                  209
City                                                    786
State                                                    49
Postal Code                                             950
Model Year                                               21
Make                                                     46
Model                                                   170
Electric Vehicle Type                                     2
Clean Alternative Fuel Vehicle (CAFV) Eligibility         3
Electric Range                                          110
Base MSRP                                                32
Legislative District                                     49
DOL Vehicle ID                                       232230
Vehicle Location                                        948
Electric Utility                                         76
2020 Censu

In [42]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

In [43]:
# Re-read the imputed dataset so the encoding steps below start from the saved CSV
# (this rebinds data2, replacing the frame loaded earlier from the same file)
file_path = 'PreProcessed_Electric_Vehicle_Data.csv'
data2 = pd.read_csv(file_path)

In [44]:
# Preview the first three rows before any encoding is applied
print(data2.head(3))

   VIN (1-10)    County      City State  Postal Code  Model Year    Make  \
0  2T3YL4DV0E      King  Bellevue    WA      98005.0        2014  TOYOTA   
1  5YJ3E1EB6K      King   Bothell    WA      98011.0        2019   TESLA   
2  5UX43EU02S  Thurston   Olympia    WA      98502.0        2025     BMW   

     Model                   Electric Vehicle Type  \
0     RAV4          Battery Electric Vehicle (BEV)   
1  MODEL 3          Battery Electric Vehicle (BEV)   
2       X5  Plug-in Hybrid Electric Vehicle (PHEV)   

  Clean Alternative Fuel Vehicle (CAFV) Eligibility  Electric Range  \
0           Clean Alternative Fuel Vehicle Eligible           103.0   
1           Clean Alternative Fuel Vehicle Eligible           220.0   
2           Clean Alternative Fuel Vehicle Eligible            40.0   

   Base MSRP  Legislative District  DOL Vehicle ID  \
0        0.0                  41.0       186450183   
1        0.0                   1.0       478093654   
2        0.0                  3

Menentukan feature mana yang merupakan Nominal dan Ordinal

In [45]:
# List each column's dtype to help decide which features need encoding
print("\nColumns and Data Types:")
print(data2.dtypes)


Columns and Data Types:
VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                       float64
Base MSRP                                            float64
Legislative District                                 float64
DOL Vehicle ID                                         int64
Vehicle Location                                      object
Electric Utility                                      object

In [46]:
# --- One-Hot Encoding for Electric Vehicle Type and Clean Alternative Fuel Vehicle (CAFV) Eligibility ---
# sparse_output=False makes fit_transform return a dense array instead of a sparse matrix
onehot_encoder = OneHotEncoder(sparse_output=False)

In [47]:
# One-hot encode the 'Electric Vehicle Type' column
electric_vehicle_type_encoded = onehot_encoder.fit_transform(data2[['Electric Vehicle Type']])

# Label each indicator column with its category and reuse data2's index, so a
# column-wise pd.concat with data2 lines the rows up even when data2 does not
# carry the default RangeIndex (a fresh RangeIndex would misalign and yield NaN).
electric_vehicle_type_df = pd.DataFrame(
    electric_vehicle_type_encoded,
    columns=onehot_encoder.categories_[0],
    index=data2.index,
)

In [48]:
# One-hot encode the 'Clean Alternative Fuel Vehicle (CAFV) Eligibility' column
# (re-fits the shared encoder, so categories_ now describes this column)
cafv_eligibility_encoded = onehot_encoder.fit_transform(data2[['Clean Alternative Fuel Vehicle (CAFV) Eligibility']])

# Label each indicator column with its category and reuse data2's index, so a
# column-wise pd.concat with data2 lines the rows up even when data2 does not
# carry the default RangeIndex (a fresh RangeIndex would misalign and yield NaN).
cafv_eligibility_df = pd.DataFrame(
    cafv_eligibility_encoded,
    columns=onehot_encoder.categories_[0],
    index=data2.index,
)

In [49]:
# Drop the two original categorical columns, then append their one-hot indicator
# columns side by side (axis=1 concatenation aligns rows on the index).
# NOTE(review): this cell rebinds data2, so running it a second time fails on the
# drop because the original columns are already gone.
data2 = data2.drop(['Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'], axis=1)
data2 = pd.concat([data2, electric_vehicle_type_df, cafv_eligibility_df], axis=1)

In [50]:
# --- Label Encoding for categorical columns ---
# Encoder instance used by the label-encoding loop that follows
label_encoder = LabelEncoder()

In [51]:
# Columns to replace with integer codes
label_columns = ['County', 'City', 'State', 'Make', 'Model', 'Vehicle Location', 'Electric Utility', '2020 Census Tract']

# Fit a dedicated encoder per column and keep it in label_encoders, so any code
# can be mapped back with label_encoders[col].inverse_transform(...).
# Re-fitting one shared encoder overwrites its classes_ on every iteration and
# leaves only the last column's mapping recoverable.
label_encoders = {}
for col in label_columns:
    # Rebinding label_encoder keeps the name pointing at the most recently
    # fitted encoder, as it did when a single instance was re-fitted.
    label_encoder = LabelEncoder()
    data2[col] = label_encoder.fit_transform(data2[col])
    label_encoders[col] = label_encoder

In [52]:
# --- Ordinal Encoding for Model Year ---
# Encode 'Model Year' with scikit-learn's OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

# Apply the ordinal encoding to the 'Model Year' column; the double brackets pass
# a one-column DataFrame to fit_transform, and the result overwrites the column
data2['Model Year'] = ordinal_encoder.fit_transform(data2[['Model Year']])

In [53]:
# Preview the first three rows after all encodings have been applied
print("\nEncoded Data (First few rows):")
print(data2.head(3))


Encoded Data (First few rows):
   VIN (1-10)  County  City  State  Postal Code  Model Year  Make  Model  \
0  2T3YL4DV0E      91    47     46      98005.0         9.0    41    128   
1  5YJ3E1EB6K      91    64     46      98011.0        14.0    39     97   
2  5UX43EU02S     188   499     46      98502.0        20.0     5    163   

   Electric Range  Base MSRP  Legislative District  DOL Vehicle ID  \
0           103.0        0.0                  41.0       186450183   
1           220.0        0.0                   1.0       478093654   
2            40.0        0.0                  35.0       274800718   

   Vehicle Location  Electric Utility  2020 Census Tract  \
0               383                74               1025   
1               400                74                991   
2               637                73               2027   

   Battery Electric Vehicle (BEV)  Plug-in Hybrid Electric Vehicle (PHEV)  \
0                             1.0                               

In [54]:
# --- Save the processed data to a new CSV file ---
# index=False leaves the row index out of the written file;
# output_file_path is rebound here to the second-stage output name
output_file_path = 'PreProcessed2nd_Electric_Vehicle_Data.csv'
data2.to_csv(output_file_path, index=False)
print(f"\nData saved to {output_file_path}")


Data saved to PreProcessed2nd_Electric_Vehicle_Data.csv
