# Modeling: KNN, Decision Tree, Random Forest, Neural Network

In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

sns.set(style='whitegrid')

In [143]:
# Load data
FILE_PATH = "/media/hoang/HDD_Code/Tài liệu học tập/Kỳ 1 năm 4/Khai phá dữ liệu/mobiles_dataset_2025_clustered_labeled.csv"
df = pd.read_csv(FILE_PATH)
print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Target and feature selection
TARGET = 'Launched Price (USA)'

# If your file contains columns with encoded companies/processors like in prior notebook, include them.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if TARGET not in numeric_cols:
    raise KeyError(f"Target column '{TARGET}' not found as numeric column in dataframe")
feature_cols = [c for c in numeric_cols if c != TARGET]

# Minimal cleaning: drop rows with missing target, impute numeric features later in pipeline
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
X = df[feature_cols]
y = df[TARGET]

print(f"Using {len(feature_cols)} features for modeling")


Loaded 908 rows and 17 columns
Using 16 features for modeling


In [144]:
df.head()

Unnamed: 0,RAM,Front Camera,Back Camera,Battery Capacity,Screen Size,Launched Price (USA),ROM,Company_Apple,Company_Honor,Company_Oppo,Company_Other,Company_Samsung,Company_Vivo,Processor_vec1,Processor_vec2,Processor_vec3,Cluster
0,6,1.2,4.8,3.6,6.1,7.99,2.0,1,0,0,0,0,0,-0.105389,-0.710295,-0.298391,0
1,6,1.2,4.8,3.6,6.1,8.49,4.0,1,0,0,0,0,0,-0.105389,-0.710295,-0.298391,0
2,6,1.2,4.8,3.6,6.1,8.99,8.0,1,0,0,0,0,0,-0.105389,-0.710295,-0.298391,0
3,6,1.2,4.8,4.2,6.7,8.99,2.0,1,0,0,0,0,0,-0.105389,-0.710295,-0.298391,0
4,6,1.2,4.8,4.2,6.7,9.49,4.0,1,0,0,0,0,0,-0.105389,-0.710295,-0.298391,0


In [145]:
# Train/test split
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Preprocessing pipeline: impute then scale (fit on train)
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_train_prep = numeric_pipeline.fit_transform(X_train)
X_test_prep = numeric_pipeline.transform(X_test)

print('Shapes:', X_train_prep.shape, X_test_prep.shape)


Shapes: (726, 16) (182, 16)


In [146]:
print("Current X columns:", X.columns.tolist())
print("Number of features:", X_train_prep.shape[1])

Current X columns: ['RAM', 'Front Camera', 'Back Camera', 'Battery Capacity', 'Screen Size', 'ROM', 'Company_Apple', 'Company_Honor', 'Company_Oppo', 'Company_Other', 'Company_Samsung', 'Company_Vivo', 'Processor_vec1', 'Processor_vec2', 'Processor_vec3', 'Cluster']
Number of features: 16


In [147]:
def evaluate_regression(true, pred):
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def print_eval(name, true, pred):
    res = evaluate_regression(true, pred)
    print(f"{name}: MAE={res['MAE']:.3f}, RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}")
    return res

def _is_fitted(est):
    try:
        check_is_fitted(est)
        return True
    except (NotFittedError, AttributeError):
        return False

In [148]:

# Ensure estimator objects exist
# train with knn
if 'best_knn' in globals() and isinstance(best_knn, int):
    best_knn = KNeighborsRegressor(n_neighbors=best_knn)
elif 'best_knn' not in globals():
    best_knn = KNeighborsRegressor(n_neighbors=5)

# train with decision tree
if 'best_dt' not in globals():
    best_dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
# train with random forest
if 'rf' not in globals():
    try:
        # Load the best model from pickle file
        with open('best_random_forest.pkl', 'rb') as f:
            rf = pickle.load(f)
        print("Loaded best Random Forest model from pickle file")
    except FileNotFoundError:
        # Fall back to default parameters if pickle file not found
        rf = RandomForestRegressor(
            n_estimators=200,          
            max_depth=None,            
            min_samples_split=5,       
            min_samples_leaf=2,        
            max_features='sqrt',       
            bootstrap=True,            
            random_state=RANDOM_STATE,
            n_jobs=-1,                 
            max_samples=0.8,          
            criterion='squared_error', 
            oob_score=True            
        )
        print("Created new Random Forest model with default parameters")
# Fit if needed (use train data already prepared)
for name, est in [('best_knn', best_knn), ('best_dt', best_dt), ('rf', rf)]:
    if not _is_fitted(est):
        print(f"{name} not fitted -> fitting now")
        est.fit(X_train, y_train)
    else:
        print(f"{name} already fitted")
# Build pipelines that include the fitted numeric_pipeline (so preprocessing is saved together)
from sklearn.pipeline import Pipeline as SKPipeline
knn_pipe = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', rf)])


best_knn already fitted
best_dt already fitted
rf already fitted


In [149]:
# First, let's check and debug the data shapes
print("Debug information:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_train_prep shape: {X_train_prep.shape}")
print(f"X_test_prep shape: {X_test_prep.shape}")
print("\nFeature columns:", X.columns.tolist())
# Evaluate models
results = {}

# KNN evaluation
try:
    knn_pred = best_knn.predict(X_test_prep)
    results['KNN'] = print_eval('KNN', y_test, knn_pred)
except Exception as e:
    print(f"KNN prediction failed: {e}")

# Decision Tree evaluation    
try:
    dt_pred = best_dt.predict(X_test_prep)
    results['DecisionTree'] = print_eval('DecisionTree', y_test, dt_pred)
except Exception as e:
    print(f"Decision Tree prediction failed: {e}")

# Random Forest evaluation
try:
    rf_pred = rf.predict(X_test_prep)
    results['RandomForest'] = print_eval('RandomForest', y_test, rf_pred)
except Exception as e:
    print(f"Random Forest prediction failed: {e}")

# Display results
summary = pd.DataFrame(results).T
display(summary)

Debug information:
X_train shape: (726, 16)
X_test shape: (182, 16)
X_train_prep shape: (726, 16)
X_test_prep shape: (182, 16)

Feature columns: ['RAM', 'Front Camera', 'Back Camera', 'Battery Capacity', 'Screen Size', 'ROM', 'Company_Apple', 'Company_Honor', 'Company_Oppo', 'Company_Other', 'Company_Samsung', 'Company_Vivo', 'Processor_vec1', 'Processor_vec2', 'Processor_vec3', 'Cluster']
KNN: MAE=2.879, RMSE=4.159, R2=-0.041
DecisionTree: MAE=3.428, RMSE=4.646, R2=-0.299
RandomForest: MAE=2.980, RMSE=3.729, R2=0.163




Unnamed: 0,MAE,RMSE,R2
KNN,2.879335,4.159002,-0.041228
DecisionTree,3.427698,4.646051,-0.299379
RandomForest,2.979988,3.729003,0.162946


In [150]:
# Save sklearn models with pkl
import pickle
from sklearn.pipeline import Pipeline

# Tạo pipeline chứa cả preprocessing đã fit + model (tiện cho inference)
knn_pipe = Pipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', rf)])

# Lưu bằng pickle (.pkl) 
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn_pipe, f)
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt_pipe, f)
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

print('Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,')
print('       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,')


Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,
       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pickle

class MobilePreprocessor:
    def __init__(self):
        # Define features in exact order from model
        self.required_columns = [
            'RAM', 'ROM', 'Front Camera', 'Back Camera',
            'Battery Capacity', 'Screen Size',
            'Processor_vec1', 'Processor_vec2', 'Processor_vec3',
            'Company_Apple', 'Company_Honor', 'Company_Oppo',
            'Company_Other', 'Company_Samsung', 'Company_Vivo', 'Cluster'
        ]
        self.vectorizer = None
        self.pca = None
        self.top_companies = ['Apple', 'Samsung', 'Oppo', 'Honor', 'Vivo']

    def clean_numeric(self, value, remove_str="", round_to_int=False):
        try:
            value = str(value).replace(remove_str, "").replace(",", "")
            import re
            match = re.search(r'\d+', value)
            return float(match.group()) if match else np.nan
        except:
            return np.nan

    def preprocess_input(self, phone_data):
        # Initialize output DataFrame with correct column order
        processed = pd.DataFrame(0, index=[0], columns=self.required_columns)
        
        # Process numeric features
        numeric_mappings = {
            'RAM': ('GB', True, 1),
            'ROM': ('GB', True, 1),
            'Front Camera': ('MP', False, 0.1),
            'Back Camera': ('MP', False, 0.1),
            'Battery Capacity': ('mAh', False, 0.001),
            'Screen Size': ('inches', False, 1)
        }

        for col, (unit, round_int, scale) in numeric_mappings.items():
            if col in phone_data.columns:
                val = self.clean_numeric(phone_data[col].iloc[0], unit, round_int)
                processed[col] = val * scale if not pd.isna(val) else 0

        # Company encoding
        if "Company Name" in phone_data.columns:
            company = phone_data["Company Name"].iloc[0]
            if company in self.top_companies:
                processed[f"Company_{company}"] = 1
            else:
                processed["Company_Other"] = 1

        # Processor features
        if all([self.vectorizer, self.pca, "Processor" in phone_data.columns]):
            try:
                text = phone_data["Processor"].iloc[0]
                tfidf = self.vectorizer.transform([str(text)])
                pca_result = self.pca.transform(tfidf.toarray())
                
                for i in range(3):
                    col = f"Processor_vec{i+1}"
                    processed[col] = pca_result[0,i] if i < pca_result.shape[1] else 0
            except Exception as e:
                print(f"Warning: Processor encoding failed - {e}")

        # Ensure exact column order
        return processed[self.required_columns]

    def load_preprocessor(self):
        """Load saved vectorizer and PCA"""
        try:
            with open('processor_vectorizer.pkl', 'rb') as f:
                self.vectorizer = pickle.load(f)
            with open('processor_pca.pkl', 'rb') as f:
                self.pca = pickle.load(f)
            print("✓ Loaded preprocessor components")
            return True
        except Exception as e:
            print(f"❌ Error loading preprocessor: {e}")
            return False


def predict_price(phone_data):
    """Predict phone price using preprocessed data and saved model"""
    
    # Initialize preprocessor
    preprocessor = MobilePreprocessor()
    if not preprocessor.load_preprocessor():
        return None
    
    # Preprocess input data
    processed_data = preprocessor.preprocess_input(phone_data)
    
    # Load and predict with model
    try:
        with open('best_random_forest.pkl', 'rb') as f:
            model = pickle.load(f)
        print("Model features:", model.feature_names_in_)
        prediction = model.predict(processed_data)
        return float(prediction[0]) * 100  # Convert to USD
    except Exception as e:
        print(f"❌ Prediction failed: {e}")
        return None

if __name__ == "__main__":
    preprocessor = MobilePreprocessor()
    if preprocessor.load_preprocessor():
        # Example data for iPhone 17 (predicted specs)
        iphone17_data = pd.DataFrame({
            'RAM': ['16GB'],
            'ROM': ['512GB'],
            'Front Camera': ['24MP'],
            'Back Camera': ['48MP'],
            'Battery Capacity': ['5000mAh'],
            'Screen Size': ['6.7 inches'],
            'Company Name': ['Apple'],
            'Processor': ['A16 Bionic'],
            'Cluster': [2]  
        })
        
        # Load model first to get correct feature order
        try:
            with open('best_random_forest.pkl', 'rb') as f:
                model = pickle.load(f)
            print("Model's expected features:", model.feature_names_in_)
            
            # Process and predict
            processed_data = preprocessor.preprocess_input(iphone17_data)
            
            # Ensure feature order matches model's expectations
            if hasattr(model, 'feature_names_in_'):
                processed_data = processed_data[model.feature_names_in_]
            
            print("\nProcessed features for iPhone 17:")
            print(processed_data)
            print("\nChecking for missing values:")
            print(processed_data.isna().sum())
            
            # Get price prediction
            prediction = model.predict(processed_data)
            print(f"\nPredicted Price for iPhone 17: ${float(prediction[0]) * 100:.2f}")
            
        except Exception as e:
            print(f"❌ Error: {e}")

✓ Loaded preprocessor components
Model's expected features: ['RAM' 'Front Camera' 'Back Camera' 'Battery Capacity' 'Screen Size' 'ROM'
 'Company_Apple' 'Company_Honor' 'Company_Oppo' 'Company_Other'
 'Company_Samsung' 'Company_Vivo' 'Processor_vec1' 'Processor_vec2'
 'Processor_vec3' 'Cluster']

Processed features for iPhone 17:
    RAM  Front Camera  Back Camera  Battery Capacity  Screen Size    ROM  \
0  16.0           2.4          4.8               5.0          6.0  512.0   

   Company_Apple  Company_Honor  Company_Oppo  Company_Other  Company_Samsung  \
0              1              0             0              0                0   

   Company_Vivo  Processor_vec1  Processor_vec2  Processor_vec3  Cluster  
0             0       -0.079111       -0.547869       -0.258983        0  

Checking for missing values:
RAM                 0
Front Camera        0
Back Camera         0
Battery Capacity    0
Screen Size         0
ROM                 0
Company_Apple       0
Company_Honor      

Notes:
- Inspect feature importance from RandomForest via rf.feature_importances_ if desired.
- Further tuning (e.g., RandomizedSearchCV), target transformation (log), or categorical encoding may improve performance.
- Adjust dataset path and feature selection according to your processed CSV structure.