In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.utils.validation import check_is_fitted

In [75]:
df = pd.read_csv('/content/smartphonecleaned_data.csv')
df.head()

Unnamed: 0,brand_name,has_5g,processor_brand,processor_speed,Battery,RAM,Storage,screen_size,num_rear_cameras,os,primary_camera_rear,primary_camera_front,price
0,oneplus,True,snapdragon,3.2,5000,12,256,6.7,3,android,50.0,16.0,54999
1,oneplus,True,snapdragon,2.2,5000,6,128,6.6,3,android,64.0,16.0,19989
2,samsung,True,exynos,2.4,5000,4,64,6.6,3,android,50.0,13.0,16499
3,motorola,True,snapdragon,2.2,5000,6,128,6.6,3,android,50.0,16.0,14999
4,realme,True,dimensity,2.6,5000,6,128,6.7,3,android,108.0,16.0,24999


In [76]:
X = df.drop(columns=['price'])
y = df['price']

In [77]:
y_transformed = np.log1p(y)

#Ordinal encoding

In [96]:
columns_to_encode = ['brand_name','has_5g']

In [97]:
# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed', 'Battery', 'RAM', 'Storage', 'num_rear_cameras', 'primary_camera_rear', 'primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(handle_unknown='ignore'), ['os', 'processor_brand'])
    ],
    remainder='passthrough'
)

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 6 (<ipython-input-97-3b8da418da96>, line 7)

In [108]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [109]:
model_dict = {
    "linear_reg": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision_tree": DecisionTreeRegressor(),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "extra_trees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "gradient_boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "adaboost": AdaBoostRegressor(n_estimators=50, random_state=42),
    "mlp": MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "GBoosting" : GradientBoostingRegressor(n_estimators=400,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.7,
    random_state=42),
}

In [110]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [111]:
model_df = pd.DataFrame(model_output, columns=['model_name', 'r2_score', 'mae'])

In [112]:
model_df.sort_values(['mae'])

Unnamed: 0,model_name,r2_score,mae
7,gradient_boosting,0.911964,5263.823356
10,GBoosting,0.909808,5526.368872
6,extra_trees,0.903407,5750.340506
5,random_forest,0.903006,5922.810469
1,svr,0.823175,7467.921943
8,adaboost,0.854346,7731.774434
4,decision_tree,0.811313,8928.372374
9,mlp,0.783637,9068.342715
2,ridge,0.836783,9646.100687
0,linear_reg,0.836714,9672.261341


In [113]:
from sklearn.model_selection import GridSearchCV

In [114]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [116]:
columns_to_encode = ['brand_name','has_5g']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed', 'Battery', 'RAM', 'Storage', 'num_rear_cameras', 'primary_camera_rear', 'primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(handle_unknown='ignore'), ['os', 'processor_brand'])
    ],
    remainder='passthrough'
)

In [117]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [118]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [119]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [120]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
314 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382,

In [121]:
final_pipe = search.best_estimator_

In [122]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [123]:
search.best_score_

0.9149869494816644

In [124]:
final_pipe.fit(X,y_transformed)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [125]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [126]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)