# Mobile Price Classification - Master Notebook

## All 6 Models Training and Evaluation

This notebook trains and evaluates all 6 machine learning models:

1. **Logistic Regression**

2. **Decision Tree**

3. **K-Nearest Neighbors**

4. **Naive Bayes**

5. **Random Forest**

6. **XGBoost**

### Features:

- Automated training of all models

- Comprehensive evaluation metrics

- Visual comparisons

- Confusion matrices for each model

- Performance benchmarking

---

## Import Libraries

In [2]:
#IMPORT LIBRARIES
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import pickle
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Startup banner: title framed by two rules, the wall-clock start time,
# and a confirmation that the import cell ran to completion.
print(
    "=" * 80,
    "MOBILE PRICE CLASSIFICATION - ALL 6 MODELS",
    "=" * 80,
    f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "\n‚úì All libraries imported successfully\n",
    sep="\n",
)

MOBILE PRICE CLASSIFICATION - ALL 6 MODELS
Started at: 2026-01-12 23:03:53

‚úì All libraries imported successfully



## Load Dataset

In [6]:
#LOAD DATASET
# =============================================================================
# Read the training CSV into `df` and print a short sanity report: shape,
# feature count (every column except the target), row count, total number
# of missing cells, and the per-class row count of `price_range`.
print("=" * 80, "LOADING DATASET", "=" * 80, sep="\n")
df = pd.read_csv('../data/train.csv')
n_samples, n_columns = df.shape
print(f"‚úì Dataset loaded: {df.shape}")
print(f" Features: {n_columns - 1}")
print(f" Samples: {n_samples}")
missing_cells = df.isnull().sum().sum()
print(f" Missing values: {missing_cells}")
print("\nTarget distribution:")
class_counts = df['price_range'].value_counts().sort_index()
print(class_counts)

LOADING DATASET
‚úì Dataset loaded: (2000, 21)
 Features: 20
 Samples: 2000
 Missing values: 0

Target distribution:
price_range
0    500
1    500
2    500
3    500
Name: count, dtype: int64


## Data Preparation

In [7]:
#DATA PREPARATION
# =============================================================================
print("\n" + "=" * 80, "DATA PREPARATION", "=" * 80, sep="\n")
# `price_range` is the target; every remaining column is a feature.
y = df['price_range']
X = df.drop(columns='price_range')
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
n_total = len(X)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training samples: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"Test samples: {n_test} ({n_test/n_total*100:.1f}%)")
# Standardize features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("‚úì Features scaled")
# Persist the fitted scaler next to the notebook.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print("‚úì Scaler saved\n")


DATA PREPARATION
Training samples: 1600 (80.0%)
Test samples: 400 (20.0%)
‚úì Features scaled
‚úì Scaler saved



## Helper Function

In [8]:
#HELPER FUNCTION
# =============================================================================
def evaluate_model(model, X_test, y_test, model_name, use_scaled=False, X_scaled=None):
    """Score a fitted classifier on the hold-out set and return its metrics.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba``.
    X_test
        Unscaled test features. Used for prediction when ``use_scaled`` is
        False.
    y_test
        True labels for the test rows.
    model_name : str
        Display name stored in the returned dict.
    use_scaled : bool, default False
        When True, predictions are made on scaled features instead of
        ``X_test``.
    X_scaled : optional
        Scaled test features to use when ``use_scaled`` is True. If omitted,
        the notebook-level ``X_test_scaled`` is used so existing calls keep
        working; pass it explicitly to avoid depending on kernel state.

    Returns
    -------
    dict
        ``model_name``, the metrics ``accuracy``, ``auc``, ``precision``,
        ``recall``, ``f1`` and ``mcc`` (each rounded to 4 decimals), and the
        raw ``confusion_matrix``. Precision, recall, F1 and AUC are
        weighted averages over classes. ``auc`` is the placeholder ``0.0``
        when the model has no ``predict_proba``.

    Raises
    ------
    NameError
        If ``use_scaled`` is True, ``X_scaled`` is not given, and no global
        ``X_test_scaled`` exists.
    """
    if use_scaled:
        if X_scaled is None:
            # Backward-compatible fallback to the global created by the data
            # preparation cell. Previously this lookup was unconditional and
            # invisible to the caller.
            X_scaled = X_test_scaled
        X_test_eval = X_scaled
    else:
        X_test_eval = X_test

    # Predictions
    y_pred = model.predict(X_test_eval)
    y_pred_proba = model.predict_proba(X_test_eval) if hasattr(model, 'predict_proba') else None

    # Label-based metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    if y_pred_proba is not None:
        # Binarize against the classes the model was fitted on rather than a
        # hard-coded list; falls back to the original four price ranges.
        # Assumes predict_proba columns are ordered like `model.classes_`
        # (the scikit-learn convention) -- confirm for custom estimators.
        classes = getattr(model, 'classes_', [0, 1, 2, 3])
        y_test_bin = label_binarize(y_test, classes=classes)
        auc = roc_auc_score(y_test_bin, y_pred_proba, multi_class='ovr', average='weighted')
    else:
        # Placeholder only: no probabilities means no AUC could be computed.
        auc = 0.0

    cm = confusion_matrix(y_test, y_pred)
    return {
        'model_name': model_name,
        'accuracy': round(accuracy, 4),
        'auc': round(auc, 4),
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1': round(f1, 4),
        'mcc': round(mcc, 4),
        'confusion_matrix': cm
    }

## Train All Models

In [9]:
#TRAIN ALL MODELS
# =============================================================================
# Every model goes through the same steps: fit, time the fit, evaluate on the
# hold-out set, record results, pickle the fitted estimator, print a one-line
# summary. The steps are written once and driven by `model_specs` instead of
# being copy-pasted per model.
print("="*80)
print("TRAINING ALL 6 MODELS")
print("="*80)
all_results = []
training_times = []

# Estimators keep their notebook-level names so they stay individually
# addressable after this cell.
lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
nb_model = GaussianNB()
rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss', n_jobs=-1, verbosity=0)

# (display name, estimator, fit/evaluate on scaled features?, pickle path)
# Logistic Regression, KNN and Naive Bayes use the standardized features;
# the tree-based models use the raw features.
model_specs = [
    ("Logistic Regression", lr_model, True, 'logistic_regression.pkl'),
    ("Decision Tree", dt_model, False, 'decision_tree.pkl'),
    ("K-Nearest Neighbors", knn_model, True, 'knn.pkl'),
    ("Naive Bayes", nb_model, True, 'naive_bayes.pkl'),
    ("Random Forest", rf_model, False, 'random_forest.pkl'),
    ("XGBoost", xgb_model, False, 'xgboost.pkl'),
]

for step, (name, model, use_scaled, pickle_path) in enumerate(model_specs, start=1):
    print(f"\n[{step}/{len(model_specs)}] {name}...")
    train_features = X_train_scaled if use_scaled else X_train
    # perf_counter is a monotonic clock meant for measuring intervals.
    start = time.perf_counter()
    model.fit(train_features, y_train)
    elapsed = time.perf_counter() - start
    results = evaluate_model(model, X_test, y_test, name, use_scaled=use_scaled)
    all_results.append(results)
    training_times.append((name, elapsed))
    with open(pickle_path, 'wb') as f:
        pickle.dump(model, f)
    print(f" ‚úì Accuracy: {results['accuracy']:.4f} | Time: {elapsed:.2f}s")

# Keep the per-model result/time names that the original cell defined.
lr_results, dt_results, knn_results, nb_results, rf_results, xgb_results = all_results
lr_time, dt_time, knn_time, nb_time, rf_time, xgb_time = (t for _, t in training_times)

TRAINING ALL 6 MODELS

[1/6] Logistic Regression...
 ‚úì Accuracy: 0.9650 | Time: 7.29s

[2/6] Decision Tree...
 ‚úì Accuracy: 0.8200 | Time: 0.05s

[3/6] K-Nearest Neighbors...
 ‚úì Accuracy: 0.5000 | Time: 0.00s

[4/6] Naive Bayes...
 ‚úì Accuracy: 0.8100 | Time: 0.00s

[5/6] Random Forest...
 ‚úì Accuracy: 0.8950 | Time: 0.47s

[6/6] XGBoost...
 ‚úì Accuracy: 0.9350 | Time: 0.69s


## Comparison Table

In [11]:
#COMPARISON TABLE
# =============================================================================
print("\n" + "=" * 80, "MODEL COMPARISON TABLE", "=" * 80, sep="\n")
# Result-dict key -> report column header, in display order. The confusion
# matrix is left out of the table on purpose.
column_headers = {
    'model_name': 'ML Model Name',
    'accuracy': 'Accuracy',
    'auc': 'AUC',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1': 'F1',
    'mcc': 'MCC',
}
table_rows = [
    {header: res[key] for key, header in column_headers.items()}
    for res in all_results
]
comparison_df = pd.DataFrame(table_rows)
print(comparison_df.to_string(index=False))
# Persist the table alongside the notebook.
comparison_df.to_csv('model_comparison.csv', index=False)
print("\n‚úì Comparison table saved to: model_comparison.csv")


MODEL COMPARISON TABLE
      ML Model Name  Accuracy    AUC  Precision  Recall     F1    MCC
Logistic Regression     0.965 0.9987     0.9650   0.965 0.9650 0.9534
      Decision Tree     0.820 0.8802     0.8241   0.820 0.8208 0.7607
K-Nearest Neighbors     0.500 0.7697     0.5211   0.500 0.5054 0.3350
        Naive Bayes     0.810 0.9506     0.8113   0.810 0.8105 0.7468
      Random Forest     0.895 0.9811     0.8942   0.895 0.8942 0.8602
            XGBoost     0.935 0.9945     0.9355   0.935 0.9350 0.9135

‚úì Comparison table saved to: model_comparison.csv


## Best Model

In [12]:
#BEST MODEL
# =============================================================================
print("\n" + "=" * 80, "BEST PERFORMING MODEL", "=" * 80, sep="\n")
# Pick the comparison-table row with the highest accuracy. Note that
# `best_model` is that table row (a Series of metrics), not an estimator.
best_idx = comparison_df['Accuracy'].idxmax()
best_model = comparison_df.loc[best_idx]
print(f"üèÜ Model: {best_model['ML Model Name']}")
# (printed label, table column) pairs, in report order.
for label, column in (
    ("Accuracy", 'Accuracy'),
    ("AUC", 'AUC'),
    ("F1 Score", 'F1'),
    ("MCC", 'MCC'),
):
    print(f" {label}: {best_model[column]:.4f}")


BEST PERFORMING MODEL
üèÜ Model: Logistic Regression
 Accuracy: 0.9650
 AUC: 0.9987
 F1 Score: 0.9650
 MCC: 0.9534


## Training Time Summary

In [13]:
#TRAINING TIME SUMMARY
# =============================================================================
print("\n" + "=" * 80, "TRAINING TIME SUMMARY", "=" * 80, sep="\n")
# One row per (model name, fit duration in seconds) pair collected earlier.
time_df = pd.DataFrame(training_times, columns=['Model', 'Time (seconds)'])
print(time_df.to_string(index=False))
# Total is accumulated from the raw tuples, in recording order.
total_time = sum(elapsed for _, elapsed in training_times)
total_minutes = total_time / 60
print(f"\nTotal training time: {total_time:.2f} seconds ({total_minutes:.2f} minutes)")


TRAINING TIME SUMMARY
              Model  Time (seconds)
Logistic Regression        7.291422
      Decision Tree        0.046158
K-Nearest Neighbors        0.002849
        Naive Bayes        0.004167
      Random Forest        0.472220
            XGBoost        0.690800

Total training time: 8.51 seconds (0.14 minutes)


## Confusion Matrices

In [14]:
#CONFUSION MATRICES
# =============================================================================
print("\n" + "=" * 80, "CONFUSION MATRICES", "=" * 80, sep="\n")
# For each evaluated model: a blank line, the model name followed by a colon,
# then the raw confusion matrix array.
for result in all_results:
    print(f"\n{result['model_name']}:", result['confusion_matrix'], sep="\n")


CONFUSION MATRICES

Logistic Regression:
[[98  2  0  0]
 [ 1 96  3  0]
 [ 0  2 94  4]
 [ 0  0  2 98]]

Decision Tree:
[[92  8  0  0]
 [13 74 13  0]
 [ 0 15 79  6]
 [ 0  0 17 83]]

K-Nearest Neighbors:
[[70 26  4  0]
 [33 38 24  5]
 [ 5 39 41 15]
 [ 0 13 36 51]]

Naive Bayes:
[[90 10  0  0]
 [ 7 69 24  0]
 [ 0 18 73  9]
 [ 0  0  8 92]]

Random Forest:
[[98  2  0  0]
 [ 7 81 12  0]
 [ 0 11 84  5]
 [ 0  0  5 95]]

XGBoost:
[[100   0   0   0]
 [  3  93   4   0]
 [  0   6  90   4]
 [  0   0   9  91]]


## Visualizations

In [None]:
#VISUALIZATIONS
# =============================================================================
# Six-panel summary figure: grouped metric bars, accuracy ranking, AUC, F1,
# MCC and training time. The figure is saved as a 300-dpi PNG and shown.
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)


def _bar_panel(ax, labels, values, ylabel, title, color, ylim=None):
    """Draw one vertical bar chart on `ax` with rotated category labels.

    Bars are placed at integer positions and the ticks are pinned to those
    positions before the labels are set, so each label is tied to its bar
    (calling `set_xticklabels` without fixed ticks is what the original
    panels did). `ylim`, when given, is applied as the y-axis limits.
    """
    positions = np.arange(len(labels))
    ax.bar(positions, values, color=color)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=9)
    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    if ylim is not None:
        ax.set_ylim(ylim)


model_names = comparison_df['ML Model Name']
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
ax1, ax2, ax3, ax4, ax5, ax6 = axes.ravel()

# Plot 1: Performance Metrics Comparison (one bar group per model)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']
group_positions = np.arange(len(comparison_df))
bar_width = 0.2
for offset, (metric, color) in enumerate(zip(metrics, colors)):
    ax1.bar(group_positions + offset * bar_width, comparison_df[metric],
            bar_width, label=metric, color=color)
ax1.set_xlabel('Models', fontweight='bold')
ax1.set_ylabel('Score', fontweight='bold')
ax1.set_title('Performance Metrics Comparison', fontsize=12, fontweight='bold')
# Centre each tick under its group of four bars.
ax1.set_xticks(group_positions + bar_width * 1.5)
ax1.set_xticklabels(model_names, rotation=45, ha='right', fontsize=9)
ax1.legend(loc='lower right', fontsize=9)
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim([0, 1.1])

# Plot 2: Accuracy Ranking (horizontal, lowest accuracy at the bottom)
sorted_df = comparison_df.sort_values('Accuracy', ascending=True)
bars = ax2.barh(sorted_df['ML Model Name'], sorted_df['Accuracy'], color='skyblue')
ax2.set_xlabel('Accuracy', fontweight='bold')
ax2.set_title('Model Accuracy Ranking', fontsize=12, fontweight='bold')
# Value labels are drawn just past the end of each bar, so the axis needs
# headroom beyond 1.0 to keep them inside the panel.
ax2.set_xlim([0, 1.1])
ax2.grid(axis='x', alpha=0.3)
for bar in bars:
    width_val = bar.get_width()
    ax2.text(width_val + 0.01, bar.get_y() + bar.get_height()/2,
             f'{width_val:.4f}', ha='left', va='center', fontsize=9, fontweight='bold')

# Plots 3-5: one single-metric bar chart each
_bar_panel(ax3, model_names, comparison_df['AUC'],
           'AUC Score', 'AUC Score Comparison', 'coral', ylim=[0, 1.1])
_bar_panel(ax4, model_names, comparison_df['F1'],
           'F1 Score', 'F1 Score Comparison', 'lightgreen', ylim=[0, 1.1])
_bar_panel(ax5, model_names, comparison_df['MCC'],
           'MCC Score', 'MCC Score Comparison', 'plum')

# Plot 6: Training Time
_bar_panel(ax6, time_df['Model'], time_df['Time (seconds)'],
           'Time (seconds)', 'Training Time Comparison', 'gold')

fig.tight_layout()
fig.savefig('model_comparison_visualization.png', dpi=300, bbox_inches='tight')
plt.show()
print("‚úì Visualization saved as: model_comparison_visualization.png")


CREATING VISUALIZATIONS


## Individual Confusion Matrices

In [None]:
#INDIVIDUAL CONFUSION MATRICES
# =============================================================================
# One annotated heatmap per evaluated model, laid out three per row, saved as
# a single 300-dpi PNG. (The header line above was missing its leading '#',
# which made this cell a SyntaxError.)
print("\nCreating individual confusion matrix plots...")
class_labels = ['Low', 'Med', 'High', 'V.High']
n_cols = 3
# Ceiling division without importing math; at least one row so subplots()
# is valid even when there are no results.
n_rows = max(1, -(-len(all_results) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()
for idx, result in enumerate(all_results):
    sns.heatmap(result['confusion_matrix'], annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=axes[idx], cbar_kws={'label': 'Count'})
    axes[idx].set_xlabel('Predicted', fontweight='bold')
    axes[idx].set_ylabel('Actual', fontweight='bold')
    axes[idx].set_title(f'{result["model_name"]}', fontsize=11, fontweight='bold')
# Hide grid slots that did not receive a model so no empty frames are drawn.
for unused_ax in axes[len(all_results):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.savefig('all_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()
print("‚úì Confusion matrices saved as: all_confusion_matrices.png")

## Final Summary

In [None]:
#FINAL SUMMARY
# =============================================================================
# Closing report: success banner, the list of artifact file names written by
# the earlier cells, a completion timestamp and suggested next steps.
# (The header line above was missing its leading '#', which made this cell a
# SyntaxError.)
print("\n" + "="*80)
print("‚úì ALL MODELS TRAINED AND EVALUATED SUCCESSFULLY!")
print("="*80)
print("\nGenerated Files:")
# Static list of expected outputs; this cell only prints the names and does
# not check that the files exist on disk.
files = [
    'logistic_regression.pkl',
    'decision_tree.pkl',
    'knn.pkl',
    'naive_bayes.pkl',
    'random_forest.pkl',
    'xgboost.pkl',
    'scaler.pkl',
    'model_comparison.csv',
    'model_comparison_visualization.png',
    'all_confusion_matrices.png'
]
for file in files:
    print(f" ‚úì {file}")
print("\n" + "="*80)
print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)
print("\nüìä NEXT STEPS:")
print("1. Review model_comparison.csv for detailed metrics")
print("2. Check visualizations (PNG files)")
print("3. Run: streamlit run app.py")
print("4. Upload test data and get predictions")
print("\n" + "="*80)