# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# --- Data loading ---
# Read the laptop price dataset from a CSV file in the working directory.
data_model = pd.read_csv('laptopPrice (1).csv')

# --- Exploratory data analysis ---
# info() prints its own summary. The other results are wrapped in print()
# because a bare expression is silently discarded when this code runs as a
# script (a notebook only echoes the last expression of a cell).
data_model.info()
print(data_model.describe())
print(data_model.isnull().sum())      # missing values per column
print(data_model.duplicated().sum())  # number of fully duplicated rows

# --- Outlier detection on 'Price' (IQR rule) ---
Q1 = data_model['Price'].quantile(0.25)
Q3 = data_model['Price'].quantile(0.75)

IQR = Q3 - Q1

# Fences placed 1.5 * IQR below the first and above the third quartile.
lower_data = Q1 - 1.5 * IQR
upper_data = Q3 + 1.5 * IQR

# Rows whose price falls outside the fences.
oulierts_detector = data_model[
    (data_model['Price'] < lower_data) | (data_model['Price'] > upper_data)
]

print(oulierts_detector)
plt.boxplot(data_model['Price'])
plt.show()


# --- PREPROCESSING ---

# Duplicate handling: drop fully duplicated rows. The copy detaches the result
# from data_model.
cleaning_model = data_model.drop_duplicates().copy()
print(cleaning_model.duplicated().sum())  # sanity check: duplicates remaining

# --- SPLIT DATASET ---
# 'Price' is the label, so it is removed from the feature frame.
x = cleaning_model.drop(columns=['Price'])
y = cleaning_model['Price']

# 70 % training / 30 % test, with a fixed seed for a reproducible split.
x_latih, x_uji, y_latih, y_uji = train_test_split(x, y, test_size=0.3, random_state=42)

# Outlier handling by clipping.
# This is done AFTER the split, with IQR bounds computed from the training
# target only, and applied to the training target only. Clipping before the
# split would leak test-set statistics into preprocessing and would alter the
# very test labels the evaluation metrics are computed against.
q1_latih = y_latih.quantile(0.25)
q3_latih = y_latih.quantile(0.75)
iqr_latih = q3_latih - q1_latih
y_latih = y_latih.clip(q1_latih - 1.5 * iqr_latih, q3_latih + 1.5 * iqr_latih)

# Box plot of the clipped training target.
plt.boxplot(y_latih)
plt.show()

# --- Feature encoding with one-hot encoding ---

# Single source of truth for the columns that are one-hot encoded. The same
# list selects them for the encoder and removes them from the remaining
# features, so the two can never drift apart.
kolom_kategorikal = [
    'brand', 'processor_brand', 'processor_name', 'processor_gnrtn',
    'ram_gb', 'ram_type', 'ssd', 'hdd', 'os', 'os_bit',
    'graphic_card_gb', 'weight', 'warranty', 'Touchscreen', 'msoffice',
    'rating',
]

# The encoder is fitted on the training split only and then applied to the
# test split. handle_unknown='ignore' keeps transform() from failing on
# categories that appear only in the test split.
encode_model = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
x_latih_encoded = encode_model.fit_transform(x_latih[kolom_kategorikal])
x_uji_encoded = encode_model.transform(x_uji[kolom_kategorikal])

# Labelled preview of the encoded training matrix.
encode_df = pd.DataFrame(
    x_latih_encoded, columns=encode_model.get_feature_names_out()
)
print(encode_df.head())


# Separate the remaining columns.
# NOTE(review): these are assumed to be numeric, since they are stacked and
# standardized as numbers further down -- confirm against the dataset.
x_latih_numerik = x_latih.drop(columns=kolom_kategorikal)
x_uji_numerik = x_uji.drop(columns=kolom_kategorikal)

# Combine the remaining columns with the encoded categorical columns.
# The remaining columns come first, followed by the one-hot columns.
x_latih_final = np.hstack([x_latih_numerik, x_latih_encoded])
x_uji_final = np.hstack([x_uji_numerik, x_uji_encoded])


# --- SCALING DATA ---
# Distribution of the first feature column BEFORE standardization. This is the
# same column that is plotted after scaling below, so the two figures are
# directly comparable. (The target 'Price' is never standardized, so plotting
# it here would not show the effect of the scaler.)
plt.figure(figsize=(14,7))
sns.histplot(x_latih_final[:, 0], kde=True)
plt.title('Sebelum Standarisasi')
plt.show()

# Standardize the features. The scaler is fitted on the training matrix only
# and then reused for the test matrix.
scaler = StandardScaler()

x_latih_sdtr = scaler.fit_transform(x_latih_final)
x_uji_stdr = scaler.transform(x_uji_final)

# dtype of the stacked feature matrices that were fed to the scaler.
print(x_latih_final.dtype)
print(x_uji_final.dtype)

# Distribution of the first feature column AFTER standardization:
# training split on the left, test split on the right.
plt.figure(figsize=(14,7))
plt.subplot(1,2,1)
sns.histplot(x_latih_sdtr[:,0], kde=True)
plt.title('data latih yang telah di standarisasi')
plt.subplot(1,2,2)
sns.histplot(x_uji_stdr[:,0], kde=True)
plt.title('data uji yang telah di standarisasi')
plt.show()

# Overall mean and standard deviation across every cell of the scaled
# training matrix.
print("Mean setelah scaling:", x_latih_sdtr.mean())
print("Std setelah scaling :", x_latih_sdtr.std())

# --- MODEL DEVELOPMENT ---

# Gradient-boosted regressor (LightGBM) with default hyper-parameters,
# trained on the standardized training features.
model = LGBMRegressor().fit(x_latih_sdtr, y_latih)

# --- Model evaluation on the held-out test split ---
y_pred = model.predict(x_uji_stdr)

# Each metric takes (true values, predictions), in that order.
mae, mse, r2 = (
    metrik(y_uji, y_pred)
    for metrik in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report every score with its label.
for label, skor in (("MAE: ", mae), ("MSE: ", mse), ("R2 Score: ", r2)):
    print(label, skor)


