In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    ExtraTreesRegressor,
)

from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
import pickle

In [2]:
# Load the processed dataset. The CSV carries an "Unnamed: 0" column
# (presumably a saved index), which is discarded right after reading.
filename = "../../data/processed/final-df.csv"
df = pd.read_csv(filename).drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,Touchscreen,IPS,PPI,Cpu Brand,HDD,SSD,Gpu Brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,272.209524,Intel Core i5,0,128,Intel,MacOS
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,153.117857,Intel Core i5,0,0,Intel,MacOS
2,HP,Notebook,8,1.86,30636.0,0,0,174.05705,Intel Core i5,0,256,Intel,No OS / Chrome OS / Android
3,Apple,Ultrabook,16,1.83,135195.336,0,1,264.476309,Intel Core i7,0,512,AMD,MacOS
4,Apple,Ultrabook,8,1.37,96095.808,0,1,272.209524,Intel Core i5,0,256,Intel,MacOS


In [3]:
# The target is the natural log of Price; every remaining column is a feature.
y = np.log(df["Price"])
X = df.drop(columns=["Price"])

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split stable.
split = train_test_split(X, y, test_size=0.15, random_state=2)
X_train, X_test, Y_train, Y_test = split

In [4]:
# Baseline model: ordinary least squares on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded (first level dropped,
# unseen categories ignored at predict time); all other columns pass through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = LinearRegression()

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Both metrics are computed on the log scale because the target was logged.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8085849153189795
MAE 0.20885493278890085


In [5]:
# Ridge regression (L2 penalty, alpha=10) on the log-price target.
# handle_unknown="ignore" is set here like in the other model cells, so that a
# category unseen during fit is encoded as all zeros instead of raising an
# error at predict time.
step1 = ColumnTransformer(
    transformers=[
        (
            "col_tnf",
            OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore"),
            [0, 1, 7, 10, 11],
        )
    ],
    remainder="passthrough",
)

step2 = Ridge(alpha=10)
pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8120148508584046
MAE 0.21035102823966753


In [6]:
# Lasso regression (L1 penalty, alpha=0.001) on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = Lasso(alpha=0.001)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8077332890306506
MAE 0.21147148574247662


In [7]:
# k-nearest-neighbours regression (k=3) on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
# NOTE(review): the passthrough columns reach the regressor unscaled, so
# neighbour distances are likely dominated by the widest-ranged columns --
# consider adding a scaler before relying on this model.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8100380147383134
MAE 0.19071529891720201


In [8]:
# Single decision tree capped at depth 8, fit on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
step1 = ColumnTransformer(
    transformers=[
        (
            "col_tnf",
            OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore"),
            [0, 1, 7, 10, 11],
        )
    ],
    remainder="passthrough",
)

# Seeded so that tie-breaking between equally good splits -- and therefore the
# reported scores -- is repeatable across runs. The value 3 matches the seed
# used for the forest models in this notebook. Scores recorded from an earlier
# unseeded run may differ slightly.
step2 = DecisionTreeRegressor(max_depth=8, random_state=3)
pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8319049352444134
MAE 0.1847732978970104


In [9]:
# Support vector regression with an RBF kernel on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
# NOTE(review): the passthrough columns reach the kernel unscaled -- consider
# adding a scaler before relying on this model.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = SVR(kernel="rbf", C=10000, epsilon=0.1)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.809037049294698
MAE 0.2029010953980002


In [10]:
# Random forest on the log-price target: 100 trees of depth <= 15, each grown
# on half of the training rows and considering 75% of the features per split.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    max_features=0.75,
    max_samples=0.5,
    random_state=3,
)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8896622683486467
MAE 0.1580559671482713


In [11]:
# Extra-trees ensemble on the log-price target: 100 trees of depth <= 15.
# bootstrap=True is set explicitly alongside max_samples=0.5, so each tree is
# fit on a bootstrap sample drawn from the training rows.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=15,
    max_features=0.75,
    bootstrap=True,
    max_samples=0.5,
    random_state=3,
)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8862804050684832
MAE 0.16180658368860606


In [12]:
# AdaBoost regression (15 boosting rounds) on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
step1 = ColumnTransformer(
    transformers=[
        (
            "col_tnf",
            OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore"),
            [0, 1, 7, 10, 11],
        )
    ],
    remainder="passthrough",
)

# Seeded so the reported scores are repeatable across runs. The value 3 matches
# the seed used for the forest models in this notebook. Scores recorded from an
# earlier unseeded run may differ slightly.
step2 = AdaBoostRegressor(n_estimators=15, learning_rate=1.0, random_state=3)
pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.7938735481736319
MAE 0.2342458538362547


In [13]:
# Gradient-boosted trees (500 stages) on the log-price target.
# Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass through.
step1 = ColumnTransformer(
    transformers=[
        (
            "col_tnf",
            OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore"),
            [0, 1, 7, 10, 11],
        )
    ],
    remainder="passthrough",
)

# Seeded so that the trees -- and therefore the reported scores -- are
# repeatable across runs. The value 3 matches the seed used for the forest
# models in this notebook. Scores recorded from an earlier unseeded run may
# differ slightly.
step2 = GradientBoostingRegressor(n_estimators=500, random_state=3)
pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8784117022146991
MAE 0.16009157550844522


In [14]:
# XGBoost regression (45 trees, depth 5, learning rate 0.5) on the log-price
# target. Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass
# through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)
step2 = XGBRegressor(n_estimators=45, max_depth=5, learning_rate=0.5)

pipe = Pipeline(steps=[("step1", step1), ("step2", step2)])
Y_pred = pipe.fit(X_train, Y_train).predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8985995209202559
MAE 0.15511764161025268


In [15]:
# Weighted voting ensemble of four tree-based regressors on the log-price
# target. Columns 0, 1, 7, 10 and 11 of X are one-hot encoded; the rest pass
# through.
step1 = ColumnTransformer(
    transformers=[
        (
            "col_tnf",
            OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore"),
            [0, 1, 7, 10, 11],
        )
    ],
    remainder="passthrough",
)

rf = RandomForestRegressor(
    n_estimators=350, random_state=3, max_samples=0.5, max_features=0.75, max_depth=15
)

# max_features=0.5 makes each split consider a random half of the features, so
# the booster is seeded (same seed as rf/et) to keep the ensemble repeatable.
# Scores recorded from an earlier unseeded run may differ slightly.
gbdt = GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3)
xgb = XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)
et = ExtraTreesRegressor(
    n_estimators=100,
    random_state=3,
    bootstrap=True,
    max_samples=0.5,
    max_features=0.75,
    max_depth=10,
)

# The random forest's prediction is weighted 5x relative to each other member.
step2 = VotingRegressor(
    [("rf", rf), ("gbdt", gbdt), ("xgb", xgb), ("et", et)], weights=[5, 1, 1, 1]
)

pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8919077338617427
MAE 0.15763587053123496


In [16]:
# Preprocessing step consumed by the stacking pipeline in the following cell:
# columns 0, 1, 7, 10 and 11 of X are one-hot encoded (first level dropped,
# unseen categories ignored at predict time); all other columns pass through.
encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown="ignore")
step1 = ColumnTransformer(
    [("col_tnf", encoder, [0, 1, 7, 10, 11])],
    remainder="passthrough",
)

In [17]:
# Stacked ensemble on the log-price target: three tree-based base learners
# whose predictions are combined by a Ridge(alpha=100) final estimator.
# The resulting `pipe` is the object that gets pickled at the end of the
# notebook, so every base learner with random behaviour is seeded.
estimators = [
    (
        "rf",
        RandomForestRegressor(
            n_estimators=350,
            random_state=3,
            max_samples=0.5,
            max_features=0.75,
            max_depth=15,
        ),
    ),
    # max_features=0.5 samples features at random per split; seeded (same seed
    # as the random forest) so the saved model can be reproduced. Scores
    # recorded from an earlier unseeded run may differ slightly.
    (
        "gbdt",
        GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3),
    ),
    ("xgb", XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)),
]

step2 = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=100))
# step1 is the ColumnTransformer defined in the preceding cell.
pipe = Pipeline([("step1", step1), ("step2", step2)])
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)

# Metrics are on the log scale, matching the logged target.
print("R2 score", r2_score(Y_test, Y_pred))
print("MAE", mean_absolute_error(Y_test, Y_pred))

R2 score 0.8862251806136849
MAE 0.1644841792824271


In [18]:
# Persist the dataframe and the most recently fitted pipeline to disk.
# Context managers ensure each file handle is flushed and closed, even if
# pickling raises part-way through.
with open("../../models/df.pkl", "wb") as df_file:
    pickle.dump(df, df_file)

with open("../../models/pipe.pkl", "wb") as pipe_file:
    pickle.dump(pipe, pipe_file)