In [4]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt # It provides an implicit, MATLAB-like, way of plotting.

from sklearn.model_selection import train_test_split #split data

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# build the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # Confusion matrix with clear numbers inside boxes

import sklearn
import sys

In [44]:
import os

# Root directory of the project. Set the MLOPS_PROJECT_PATH environment variable to run
# the notebook on another machine; otherwise the original local path is used.
# os.path.join(..., '') guarantees a trailing separator, because later cells build file
# paths by plain string concatenation (PROJECT_PATH + 'data/...').
PROJECT_PATH = os.path.join(
    os.environ.get(
        'MLOPS_PROJECT_PATH',
        '/home/alo-vebrisatriadi/Documents/data-engineering/learning/mlops/',
    ),
    '',
)

## 2. Exploratory Data Analysis (EDA)

**Exploratory Data Analysis (EDA)** is the process of examining and visualizing a dataset to gain insights, understand its structure, and uncover patterns before any modeling. It helps in preparing the data for machine learning by identifying key trends, relationships, and potential issues like missing values or outliers.

*Key Points*:
- **Summarize Data**: Understand basic statistics (mean, median, range) and data types.
- **Visualize**: Use charts like histograms, scatter plots, and heatmaps to explore distributions and relationships.
- **Detect Issues**: Spot missing data, outliers, and anomalies that might affect modeling.
- **Guide Feature Engineering**: Helps decide which features are important and need further transformation.
EDA ensures that your data is well-understood and clean before applying any models.

In [None]:
# Load the training set and preview its first 10 rows
train_csv_path = PROJECT_PATH + 'data/mobile_phone_classification/train.csv'
df = pd.read_csv(train_csv_path)
df.head(10)

In [None]:
# Show the (rows, columns) shape of the loaded frame
df.shape

In [None]:
# Check duplication: count rows that are exact copies of an earlier row.
# (nunique() alone does not detect duplicated rows, it only counts distinct values.)
print("Duplicated rows:", df.duplicated().sum())

# Number of distinct values per column, useful to spot binary/categorical features
df.nunique()

In [None]:
# Display summary information about our data (columns, non-null counts, dtypes)

df.info()

Looking at the column meanings, note that we have `px_width` and `px_height`, which refer to the width and height of the phone's screen in pixels, and `sc_w` and `sc_h`, which refer to the width and height of the screen in inches.

We need to check whether these values are directly related.

I mean that if px_width, px_height are resolutions in pixels and sc_w, sc_h are screen dimensions in inches, then they should have a mathematical relationship through the PPI (pixels per inch) ratio.

We can check whether a constant factor links these pairs of columns. If the ratios are consistent, it means the values are just different representations of the same information.

In [None]:
# Pixels-per-screen-unit ratio for each dimension; a constant ratio would mean the
# pixel and screen-size columns carry the same information.
for ratio_name, pixel_col, screen_col in (
    ('px_ratio_width', 'px_width', 'sc_w'),
    ('px_ratio_height', 'px_height', 'sc_h'),
):
    df[ratio_name] = df[pixel_col] / df[screen_col]

# Inspect a random sample of the two ratios side by side
print(df[['px_ratio_width', 'px_ratio_height']].sample(10))

The presence of `inf` and `NaN` values in `px_ratio_width` indicates a problem, most likely caused by division by zero or by very small values in the `sc_w` column. These values would not appear if the ratios were consistent.

These results imply that px_width, px_height, sc_w, and sc_h are not simply scaled versions of each other. They likely represent different aspects of the phone's characteristics (e.g., pixel density might vary significantly across different phones).

In [None]:
# Re-inspect the frame now that the px_ratio_* helper columns have been added
df.info()

In [12]:
# Remove the helper columns (px_ratio_height, px_ratio_width) from the data.
# Rebinding `df` with errors='ignore' keeps this cell idempotent: re-running it after the
# columns are already gone no longer raises a KeyError.
df = df.drop(columns=['px_ratio_height', 'px_ratio_width'], errors='ignore')

In [None]:
# Lift pandas' column and width limits so the printed summary is not truncated
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Summary statistics for every numeric column
print(df.describe())

This previous function provides a set of summary statistics that help you better understand the distribution of data, including:

- Count: the number of non-empty values in each column.
- Arithmetic mean (mean): the average of the values in the numerical column.
- Standard deviation (std): a measure of the spread of data about the arithmetic mean.
- Minimum (min): the smallest value in the column.
- First quartile (25%): the value below which the smallest 25% of the data fall.
- Median (50%): the value that separates the lower half of the data from the upper half (equal to the arithmetic mean in the case of a symmetric distribution).
- Third quartile (75%): the value above which the largest 25% of the data fall.
- Upper limit (max): the largest value in the column.

In [None]:
# Correlation heatmap: shows the pairwise relationships between the numerical variables.
correlation_matrix = df.corr()

plt.figure(figsize=(18, 16))
sns.heatmap(data=correlation_matrix, cmap="coolwarm", annot=True)
plt.show()

In [None]:
# Feature importance visualization:
# bar chart of each feature's correlation with the target column "price_range".
feature_frame = df.drop(columns='price_range')
target_correlation = feature_frame.corrwith(df['price_range'])
target_correlation.plot(
    kind='bar',
    grid=True,
    figsize=(15, 5),
    title="Correlation with price_range",
    color="Blue",
)

In [None]:
# Distribution of features: draw one histogram per column of the frame
# to check how the numerical values are distributed.
df.hist(figsize=(25, 23))
plt.show()

In [None]:
# Pairplot for feature relationships: a grid of pairwise scatter plots (with each
# variable's own distribution on the diagonal) that gives insight into how features relate.
# sns.pairplot creates and sizes its own figure, so no plt.figure() call is needed;
# calling it beforehand only produced an extra empty figure.
sns.pairplot(data=df)
plt.show()

In [18]:
def generate_distribution_plot(df_train, continuous_features):
    """Plot the distribution of each requested feature on a two-column grid.

    Every subplot shows a density histogram with a KDE curve, plus vertical
    lines marking the feature's mean, median and (first) mode.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the columns listed in ``continuous_features``.
    continuous_features : list of str
        Names of the columns to plot, one subplot per column.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    data = df_train[continuous_features].copy()
    n_features = len(data.columns)

    # create subplots; round the row count up so that an odd number of features
    # still gets an axis for the last one (floor division silently dropped it)
    n_cols = 2
    n_rows = max(1, (n_features + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 40), squeeze=False)
    fig.subplots_adjust(hspace=0.7)

    # set fontdict
    font = {'family': 'serif',
            'color': 'darkred',
            'weight': 'normal',
            'size': 16,
            }

    flat_axes = axes.flatten()
    for ax, feature in zip(flat_axes, data.columns):
        feature_mean = data[feature].mean()
        feature_median = data[feature].median()
        # mode() can return several values; the first one is used as the marker
        feature_mode = data[feature].mode().values[0]
        # histplot(kde=True, stat="density") replaces the deprecated sns.distplot
        sns.histplot(data[feature], kde=True, stat="density", ax=ax)
        ax.set_title(f'Analysis of {feature}', fontdict=font)
        ax.axvline(feature_mean, color='r', linestyle='--', label="Mean")
        ax.axvline(feature_median, color='g', linestyle='-', label="Median")
        ax.axvline(feature_mode, color='b', linestyle='-', label="Mode")
        ax.legend()

    # hide grid cells that did not receive a feature
    for unused_ax in flat_axes[n_features:]:
        unused_ax.set_visible(False)

    plt.show()

In [None]:
# Columns whose distributions are plotted (one subplot each)
continuous_features = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory',
    'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram',
    'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range',
]

generate_distribution_plot(df_train=df, continuous_features=continuous_features)

In [None]:
# Draw a separate box plot for every column of the frame
for column_name in df.columns:
    sns.boxplot(data=df, x=column_name)
    plt.show()

## 3. Data pre-processing

There are no null values to handle and no values that need scaling, so we move on to splitting the data.

In [21]:
# Separate the features from the target, then hold out 20% of the rows for testing
target_column = 'price_range'
X = df.drop(columns=target_column)
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

## 4. Build the Model

**Random Forest** is an ensemble learning method that improves predictive accuracy by combining multiple decision trees. Each tree is trained on a random subset of data and features, and the final prediction is made by aggregating the results from all trees. This method reduces overfitting, handles large datasets well, and provides insights into feature importance.

***Key Points***:
- **Ensemble Method**: Combines predictions from multiple trees.
- **Decision Trees**: Individual trees are trained on different data subsets.
- **Bagging**: Reduces overfitting by using different samples of data.
- **Feature Randomness**: Uses a subset of features for each tree.

***Benefits***:
- **Enhanced Accuracy**: Reduces errors compared to single decision trees.
- **Versatile**: Suitable for both classification and regression tasks.
- **Feature Importance**: Highlights which features are most influential.

In [22]:
# Keyword arguments shared by every RandomForestClassifier built in this notebook
params = dict(random_state=42)

In [None]:
# Instantiate the Random Forest classifier with the shared `params`
# (this cell only builds the estimator; it is not fitted here) and display it
rf_model = RandomForestClassifier(**params)
rf_model

In [None]:
# Fit the Random Forest on the training split
rf_model.fit(X_train, y_train)

In [25]:
# Make predictions for the held-out test features with the fitted rf_model
y_pred = rf_model.predict(X_test)

In [None]:
# Model evaluation metrics: compare the test-set predictions against the true labels
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

In [27]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter values explored by the exhaustive grid search
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[None, 8, 16, 32],
    min_samples_split=[2, 5, 10],
    criterion=['entropy', 'gini'],
)

# Base estimator built from the shared `params`
rf = RandomForestClassifier(**params)

# 5-fold cross-validated search over the grid, using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate the best estimator found by the search on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set score:", test_score)

In [None]:
# Visualization of the confusion matrix for `y_pred` (raw counts, no normalization)
class_labels = ['0', '1', '2', '3']
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=class_labels,
    normalize=None,
    cmap=plt.cm.Blues,
)
plt.title('Confusion Matrix')
plt.show()

### Learning Curve

In [30]:
from sklearn.model_selection import learning_curve

# Learning curve calculation: 5-fold cross-validated accuracy of a Random Forest at
# 10 training-set sizes, from 10% to 100% of each training fold.
# shuffle=True is needed for random_state to have any effect; without it the seed is
# ignored and each subset is simply the first rows of the fold, in file order.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(random_state=42),
    X, y, cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    shuffle=True, random_state=42,
)

In [31]:
# Average accuracy and standard deviation across the CV folds (axis=1)
# for each training-set size, for both the training and validation scores.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [None]:
# Learning curve drawing: mean accuracy per training-set size, with a shaded band of
# one standard deviation around each curve.
plt.figure(figsize=(10, 6))

curves = (
    (train_mean, train_std, "Train Accuracy", "blue"),
    (test_mean, test_std, "Test Accuracy", "green"),
)
for mean_curve, std_curve, curve_label, curve_color in curves:
    plt.plot(train_sizes, mean_curve, label=curve_label, color=curve_color, marker='o')
    plt.fill_between(
        train_sizes,
        mean_curve - std_curve,
        mean_curve + std_curve,
        color=curve_color,
        alpha=0.2,
    )

plt.title('Learning Curve for Random Forest')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

## 5. Save Model

### A. Joblib

In [None]:
import joblib

# Save rf_model (the forest fitted with `params`, not the grid-search best_model) with joblib.
# The file name is relative, so the file is written to the current working directory.
joblib.dump(rf_model, 'random_forest_model_joblib.pkl')

In [34]:
# Load the model back from the joblib file written above.
# NOTE: only load model files you created yourself or otherwise trust.
loaded_model = joblib.load('random_forest_model_joblib.pkl')

### B. Pickle

In [50]:
import pickle

# Save rf_model with pickle under the project's models/ directory
pickle_model_path = PROJECT_PATH + 'models/mobile_phone_classification.pkl'
with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [51]:
# Load the pickled model back from the project's models/ directory.
# NOTE: unpickling executes code from the file, so only load files you trust.
pickle_model_path = PROJECT_PATH + 'models/mobile_phone_classification.pkl'
with open(pickle_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [49]:
import pickle

# Save rf_model with pickle again, this time using a ".model" file extension
# (relative path, written to the current working directory)
alt_model_path = 'random_forest_model_pickle.model'
with open(alt_model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [38]:
# Load the model back from the ".model" pickle file.
# NOTE: unpickling executes code from the file, so only load files you trust.
alt_model_path = 'random_forest_model_pickle.model'
with open(alt_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)