In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import seaborn as sns

In [None]:
# Item master data: drop exact duplicate rows and normalise pharmForm to upper case.
item = (
    pd.read_csv("DMC_2017_task/items.csv", sep="|")
    .drop_duplicates(keep='first')
    .assign(pharmForm=lambda frame: frame['pharmForm'].str.upper())
)

# Transaction log used for training.
train = pd.read_csv("DMC_2017_task/train.csv", sep="|")

# Remove the row id and the 'revenue' column, then attach the item attributes by product id.
mytrain = train.drop(columns=['lineID', 'revenue']).merge(item, on='pid')

In [None]:
# Columns kept for modelling: 'price' is the regression target, the others are the inputs.
features_use = ['rrp', 'competitorPrice', 'click', 'basket', 'order', 'price']
# Take an explicit copy so that later edits to mytrain2 act on an independent frame
# rather than on a column subset of mytrain (avoids pandas' SettingWithCopyWarning).
mytrain2 = mytrain[features_use].copy()

In [None]:
# Row count of the merged frame before de-duplication.
print(f"len of mytrain is {len(mytrain)}")

# Rebind instead of using inplace=True: mytrain2 was derived from mytrain by column
# selection, and mutating it in place makes pandas emit SettingWithCopyWarning.
mytrain2 = mytrain2.drop_duplicates(keep='first')
print(f"len of mytrain after drop duplicates is {len(mytrain2)}")
mytrain2.info()

In [None]:
# Disabled experiment, kept for reference (it refers to a frame named mytrain1 that is not defined in this view):
# #Drop the campaignIndex column
# #Fill in the missing values
# mytrain2 = mytrain.drop(['campaignIndex'], axis=1)
# object_columns = [col for col in mytrain1.columns if mytrain1[col].dtype == 'O']
# label_encoder = LabelEncoder()
# for col in object_columns:
#     mytrain2[col] = label_encoder.fit_transform(mytrain1[col])
# Replace every missing value in the selected columns with 0.
mytrain2 = mytrain2.fillna(0)
# mytrain1.info()

In [None]:
# features_use = ['rrp', 'competitorPrice', 'click', 'basket', 'order', 'price']
# mytrain2 = mytrain1[features_use] 

In [None]:
# Display the prepared modelling frame (rendered by the notebook as the cell's last expression).
mytrain2

In [None]:
# Hold out 20% of the rows for testing; 'price' is the target and every other column is a feature.
X_train, X_test, Y_train, Y_test = train_test_split(
    mytrain2.drop(columns=['price']),
    mytrain2['price'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: ordinary least-squares regression, scored on the held-out split.
reg = LinearRegression()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y_pred)}"
)

In [None]:
# Unconstrained decision tree. random_state is fixed (as in every other model cell) because
# scikit-learn permutes the candidate features at each split, so an unseeded tree can give
# different metrics from one run to the next.
dtr = DecisionTreeRegressor(random_state=42).fit(X_train, Y_train)
Y_pred = dtr.predict(X_test)
print(f"MSE: {mean_squared_error(Y_test, Y_pred):.4f} \n MAE: {mean_absolute_error(Y_test, Y_pred):.4f} \n R^2 {r2_score(Y_test, Y_pred)}")

In [None]:
# Decision tree that only splits when the impurity drops by at least 0.0005 (seeded).
dtr = DecisionTreeRegressor(
    min_impurity_decrease=0.0005,
    random_state=42,
)
dtr.fit(X_train, Y_train)
Y_pred = dtr.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y_pred)}"
)

In [None]:
# Random forest with default hyper-parameters (seeded), scored on the held-out split.
ranfor_re1 = RandomForestRegressor(random_state=42).fit(X_train, Y_train)
Y_pred = ranfor_re1.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y_pred)}"
)

In [None]:
# Random forest variant 2: depth capped at 50, at least 2 samples per leaf and 4 per split.
ranfor_re2 = RandomForestRegressor(
    max_depth=50,
    min_samples_leaf=2,
    min_samples_split=4,
    random_state=42,
).fit(X_train, Y_train)
Y2_pred = ranfor_re2.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y2_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y2_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y2_pred)}"
)

In [None]:
# Random forest variant 3: same leaf/split limits as variant 2, depth capped at 30.
ranfor_re3 = RandomForestRegressor(
    max_depth=30,
    min_samples_leaf=2,
    min_samples_split=4,
    random_state=42,
).fit(X_train, Y_train)
Y3_pred = ranfor_re3.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y3_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y3_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y3_pred)}"
)

In [None]:
# Random forest variant 4: depth capped at 25, at least 4 samples per leaf and 6 per split.
ranfor_re4 = RandomForestRegressor(
    max_depth=25,
    min_samples_leaf=4,
    min_samples_split=6,
    random_state=42,
).fit(X_train, Y_train)
Y4_pred = ranfor_re4.predict(X_test)
print(
    f"MSE: {mean_squared_error(Y_test, Y4_pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, Y4_pred):.4f} \n"
    f" R^2 {r2_score(Y_test, Y4_pred)}"
)

RandomForestClassifier

In [None]:
num_bins = 800  # number of evenly spaced bin edges used to discretise the price
# Price range observed in the training targets.
min_Y, max_Y = min(Y_train), max(Y_train)
bins = np.linspace(min_Y, max_Y, num_bins)  # edges spanning [min_Y, max_Y]
Y_train_discrete = np.digitize(Y_train, bins)  # class label = index of the bin each price falls into


In [None]:
# Show the largest training price, which is the upper end of the binning range.
max_Y

In [None]:
# Classify the discretised price, then map each predicted class back to a price so the
# result can be scored with the same regression metrics as the other models.
ranfor_cl1 = RandomForestClassifier(n_estimators = 25, max_depth= 25, random_state=42, min_samples_split=8, min_samples_leaf=4)
ranfor_cl1.fit(X_train, Y_train_discrete)
cl1_class = np.asarray(ranfor_cl1.predict(X_test), dtype=int)
# np.digitize gave class k to prices in [bins[k-1], bins[k]); the top class only holds the
# maximum itself, so its upper edge is clamped to the last bin edge. Decoding with the real
# bin edges keeps the min_Y offset and the true bin width (num_bins edges span num_bins - 1
# intervals), both of which the former hand-written formula got wrong.
Y_cl1_pred = (bins[cl1_class - 1] + bins[np.minimum(cl1_class, len(bins) - 1)]) / 2
print(f"MSE: {mean_squared_error(Y_test, Y_cl1_pred):.4f} \n MAE: {mean_absolute_error(Y_test, Y_cl1_pred):.4f} \n R^2 {r2_score(Y_test, Y_cl1_pred)}")

In [None]:
# Larger classifier variant (50 trees, each fitted on a bootstrap sample of 50000 rows); the
# predicted classes are mapped back to prices and scored with the regression metrics.
ranfor_cl2 = RandomForestClassifier(n_estimators = 50,max_depth= 50, random_state=42, min_samples_split=4, min_samples_leaf=2, bootstrap=True, max_samples=50000)
ranfor_cl2.fit(X_train, Y_train_discrete)
cl2_class = np.asarray(ranfor_cl2.predict(X_test), dtype=int)
# np.digitize gave class k to prices in [bins[k-1], bins[k]); the top class only holds the
# maximum itself, so its upper edge is clamped to the last bin edge. Decoding with the real
# bin edges keeps the min_Y offset and the true bin width (num_bins edges span num_bins - 1
# intervals), both of which the former hand-written formula got wrong.
Y_cl2_pred = (bins[cl2_class - 1] + bins[np.minimum(cl2_class, len(bins) - 1)]) / 2
print(f"MSE: {mean_squared_error(Y_test, Y_cl2_pred):.4f} \n MAE: {mean_absolute_error(Y_test, Y_cl2_pred):.4f} \n R^2 {r2_score(Y_test, Y_cl2_pred)}")

In [None]:
import tensorflow as tf

# Check whether TensorFlow can see a GPU (CUDA). The device list is queried once through the
# stable tf.config API instead of twice (once via the experimental alias).
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# Print the details of each GPU, if any were found.
if physical_devices:
    for device in physical_devices:
        print("Name:", device.name, "Type:", device.device_type)
else:
    print("No GPU devices found.")


In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feed-forward regressor: four ReLU Dense layers (512 -> 126 -> 64 -> 32), each followed by
# LayerNormalization, with 20% Dropout after the first three, and a single linear output unit.
model = keras.Sequential()

# Input block: one input per feature column of X_train.
model.add(keras.layers.Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(keras.layers.LayerNormalization())
model.add(keras.layers.Dropout(0.2))

model.add(keras.layers.Dense(126, activation='relu'))
model.add(keras.layers.LayerNormalization())
model.add(keras.layers.Dropout(0.2))

model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.LayerNormalization())
model.add(keras.layers.Dropout(0.2))

# Last hidden block has no dropout.
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.LayerNormalization())

# Single output: the predicted price.
model.add(keras.layers.Dense(1))

# Compile the model: Adam optimiser, mean-squared-error loss.
model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())


In [None]:
# Number of feature columns, shown as a one-element tuple (note the trailing comma);
# this is the same value passed as input_shape to the first Dense layer.
X_train.shape[1],

In [None]:
# Train for 100 epochs in batches of 2048 rows; the held-out test split doubles as the
# validation set that Keras reports on after every epoch.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=100,
    batch_size=2048,
)

In [None]:
# Persist the trained network to the path 'model/NN1.3'.
# NOTE(review): the path has no .keras/.h5 extension; newer Keras releases may reject
# extension-less paths — confirm against the installed TensorFlow/Keras version.
model.save('model/NN1.3')

In [None]:
# Print the layer-by-layer architecture of the network.
model.summary()

In [None]:
# Evaluate the model on the test data: predict prices for the held-out features.
pred = model.predict(X_test)
# print("Mean Squared Error:", mse)
# print("Mean Absolute Error:", mae)

In [None]:
# Score the neural-network predictions with the same three metrics used for the other models.
print(
    f"MSE: {mean_squared_error(Y_test, pred):.4f} \n"
    f" MAE: {mean_absolute_error(Y_test, pred):.4f} \n"
    f" R^2 {r2_score(Y_test, pred)}"
)

In [None]:
# NOTE(review): this repeats the metrics report of the preceding cell on the same `pred`;
# it looks like a leftover duplicate — confirm whether it can be removed.
print(f"MSE: {mean_squared_error(Y_test, pred):.4f} \n MAE: {mean_absolute_error(Y_test, pred):.4f} \n R^2 {r2_score(Y_test, pred)}")