In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Load the startup dataset from the CSV file in the working directory
data_startup = pd.read_csv(filepath_or_buffer='50_Startups.csv')

# Preview the first five rows to confirm the file was parsed correctly
data_startup.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
# One-hot encode the categorical State column.
# fit_transform returns a sparse matrix by default, so densify it with toarray().
encoder = OneHotEncoder()
state_encoded = encoder.fit_transform(data_startup[['State']]).toarray()

# Wrap the encoded array in a DataFrame named after the generated feature names.
# Reuse the source frame's index: pd.concat(axis=1) aligns on index labels, so a
# fresh RangeIndex would misalign rows (and introduce NaNs) if data_startup ever
# carried a non-default index, e.g. after filtering or sorting.
state_encoded_df = pd.DataFrame(
    state_encoded,
    columns=encoder.get_feature_names_out(['State']),
    index=data_startup.index,
)

# Append the indicator columns and remove the original State column.
# A chained, non-mutating drop is used instead of inplace=True.
data_startup = pd.concat([data_startup, state_encoded_df], axis=1).drop(columns='State')

# Preview the data after one-hot encoding
data_startup.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,166187.94,0.0,1.0,0.0


In [3]:
# Derive the Tax column: 5% of (Profit + Marketing Spend + Administration).
# The terms are added in the same order as a plain `+` chain would add them.
taxable_base = (
    data_startup['Profit']
    .add(data_startup['Marketing Spend'])
    .add(data_startup['Administration'])
)
data_startup['Tax'] = taxable_base.mul(0.05)

# Preview the data with the new Tax column
data_startup.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York,Tax
0,165349.2,136897.8,471784.1,192261.83,0.0,0.0,1.0,40047.1865
1,162597.7,151377.59,443898.53,191792.06,1.0,0.0,0.0,39353.409
2,153441.51,101145.55,407934.54,191050.39,0.0,1.0,0.0,35006.524
3,144372.41,118671.85,383199.62,182901.99,0.0,0.0,1.0,34238.673
4,142107.34,91391.77,366168.42,166187.94,0.0,1.0,0.0,31187.4065


In [4]:
# Numeric columns to standardise (the one-hot State indicators are left as-is)
columns_to_scale = [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'Profit',
    'Tax',
]

# Fit a StandardScaler on those columns and compute their scaled values
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_startup[columns_to_scale])

# Write the scaled values back over the original columns
data_startup[columns_to_scale] = scaled_values

# Show the first rows of the pre-processed dataset
print(data_startup.head())


   R&D Spend  Administration  Marketing Spend    Profit  State_California  \
0   1.969800        0.537431         2.181258  1.983399               0.0   
1   1.908599        1.089039         1.936193  1.971812               1.0   
2   1.704943       -0.824552         1.620132  1.953518               0.0   
3   1.503224       -0.156887         1.402755  1.752531               0.0   
4   1.452843       -1.196122         1.253080  1.340266               0.0   

   State_Florida  State_New York       Tax  
0            0.0             1.0  2.244597  
1            0.0             0.0  2.151520  
2            1.0             0.0  1.568348  
3            0.0             1.0  1.465334  
4            1.0             0.0  1.055980  
