In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kaggle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from statsmodels.stats.outliers_influence import variance_inflation_factor


import warnings
warnings.filterwarnings('ignore')


import joblib

In [None]:
# Authenticate against the Kaggle API, then download and unzip the
# mobile-price dataset into ./datasets.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'mohannapd/mobile-price-prediction',
    path="./datasets",
    unzip=True,
)

In [None]:
# Load the downloaded CSV into the raw frame `data`.
data = pd.read_csv('datasets/Cellphone.csv')
data.head(3) # Display first 3 rows of the dataset

In [None]:
# Work on a copy so the raw frame `data` is not modified by later cells.
df = data.copy() # Create a copy of the dataset

In [None]:
# List the column labels to see which fields are available.
df.columns # Display the columns of the dataset

In [None]:
# Drop the 'Product_id' column as it is not needed for analysis.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Product_id'], errors='ignore')

In [None]:
# Summary statistics, transposed so that each dataset column becomes a row.
df.describe().T # Display the summary statistics of the dataset

In [None]:
# Print the frame's info summary (columns, non-null counts, dtypes).
df.info() # Display information about the dataset

In [None]:
# Count rows flagged by DataFrame.duplicated(), i.e. repeats of an earlier row.
df.duplicated().sum() # Check for duplicate rows in the dataset

In [None]:
# One horizontal boxplot per column, laid out on a 3-wide grid.
num_col = len(df.columns)
cols_per_row = 3
rows = int(np.ceil(num_col / cols_per_row))

fig, axes = plt.subplots(nrows=rows, ncols=cols_per_row, figsize=(15, 2 * rows))
axes = axes.flatten()

# Each plot holds a single box, so there is no hue to map a palette onto.
# Passing `palette` without `hue` is deprecated in recent seaborn; use the
# first colour of the same palette instead.
box_color = sns.color_palette('Set1')[0]

for ax, col in zip(axes, df.columns):
    # Pass the data by keyword: the meaning of the first positional argument
    # of sns.boxplot has changed between seaborn releases.
    sns.boxplot(x=df[col], ax=ax, orient='h', color=box_color)
    ax.set_title(col, fontsize=20)

# Hide unused subplots (slice by count rather than relying on the loop index).
for ax in axes[num_col:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Horizontal boxplot of every column on a 3-wide grid (Set3 colouring).
num_col = len(df.columns)
cols_per_row = 3
rows = int(np.ceil(num_col / cols_per_row))

fig, axes = plt.subplots(nrows=rows, ncols=cols_per_row, figsize=(15, 2 * rows))
axes = axes.flatten()

# `palette` without `hue` is deprecated in recent seaborn; a single box only
# needs one colour, so take the first colour of the palette.
box_color = sns.color_palette('Set3')[0]

for ax, col in zip(axes, df.columns):
    # Keyword `x=` avoids depending on the version-specific meaning of the
    # first positional argument of sns.boxplot.
    sns.boxplot(x=df[col], ax=ax, orient='h', color=box_color)
    ax.set_title(col, fontsize=20)

# Hide unused subplots.
for ax in axes[num_col:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Collect every column whose skewness magnitude exceeds 1.
# The previous condition, abs(df[col].skew() > 1), took abs() of a boolean,
# so columns with skew below -1 were never reported.
# NOTE(review): left-skewed columns can now appear in this list; a log
# transform only corrects right skew, so check how the list is used downstream.
skewed_cols = [col for col in df.columns if abs(df[col].skew()) > 1]
print("Skewed Columns:", skewed_cols)

In [None]:
# Histogram with KDE overlay for each skewed column, three plots per row.
num_col = len(skewed_cols)
cols_per_row = 3
rows = int(np.ceil(num_col / cols_per_row))
fig, axes = plt.subplots(
    nrows=rows,
    ncols=cols_per_row,
    figsize=(20, 5 * rows),
)
axes = axes.flatten()

for ax, col in zip(axes, skewed_cols):
    sns.histplot(df[col], ax=ax, color='red', kde=True)
    ax.set_title(col, fontsize=20)

# Switch off the grid slots that did not receive a plot.
for ax in axes[num_col:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Apply a log transformation to the skewed columns.
# The values are read from the raw frame `data` rather than from `df`, so
# re-running this cell does not apply the log a second time (this relies on
# `data` staying unmodified, which holds for the cells in this notebook).
# NOTE(review): if 'Price' is in skewed_cols the target is transformed as well,
# and downstream MAE values are then in log units - confirm this is intended.
for col in skewed_cols:
    # A log transform only reduces right (positive) skew; leave other columns as-is.
    if data[col].skew() > 0:
        df[col] = np.log1p(data[col])  # log1p is used to handle zero values safely

In [None]:
# Re-plot the skewed columns to inspect their distributions after the
# transformation step, three plots per row.
num_col = len(skewed_cols)
cols_per_row = 3
rows = int(np.ceil(num_col / cols_per_row))
fig, axes = plt.subplots(
    nrows=rows,
    ncols=cols_per_row,
    figsize=(20, 5 * rows),
)
axes = axes.flatten()

for ax, col in zip(axes, skewed_cols):
    sns.histplot(df[col], ax=ax, color='lightgreen', kde=True)
    ax.set_title(col, fontsize=20)

# Switch off the grid slots that did not receive a plot.
for ax in axes[num_col:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Separate the target from the predictors.
target = df['Price']
features = df.drop(columns=['Price'])

# Scatter plot with a fitted regression line (red) of each predictor against
# Price, three plots per row.
num_col = len(features.columns)
cols_per_row = 3
rows = int(np.ceil(num_col / cols_per_row))
fig, axes = plt.subplots(
    nrows=rows,
    ncols=cols_per_row,
    figsize=(20, 3 * rows),
)
axes = axes.flatten()

for ax, col in zip(axes, features.columns):
    sns.regplot(data=df, x=col, y='Price', ax=ax, line_kws={'color': 'red'})
    ax.set_title(col, fontsize=20)

# Switch off the grid slots that did not receive a plot.
for ax in axes[num_col:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
def check_outliner(col):
    """Return the rows of the global ``df`` that are IQR outliers in ``col``.

    A value counts as an outlier when it lies more than 1.5 interquartile
    ranges below the first quartile or above the third quartile.

    Parameters
    ----------
    col : label of the column in ``df`` to inspect.

    Returns
    -------
    DataFrame with the outlying rows (empty when there are none).
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    too_low = values < q1 - 1.5 * spread
    too_high = values > q3 + 1.5 * spread
    return df[too_low | too_high]

In [None]:
# Boxplots restricted to the columns that contain at least one IQR outlier.
col_with_outliner = [col for col in df.columns if not check_outliner(col).empty]
num_col = len(col_with_outliner)
cols_per_row = 3

if num_col == 0:
    # plt.subplots cannot build a grid with zero rows, so skip plotting.
    print("No columns with outliers found.")
else:
    rows = int(np.ceil(num_col / cols_per_row))

    fig, axes = plt.subplots(nrows=rows, ncols=cols_per_row, figsize=(15, 2 * rows))
    axes = axes.flatten()

    # `palette` without `hue` is deprecated in recent seaborn; a single box
    # only needs one colour, so take the first colour of the palette.
    box_color = sns.color_palette('Set3')[0]

    # The list is already filtered, so the outlier check is not repeated here.
    for ax, col in zip(axes, col_with_outliner):
        sns.boxplot(x=df[col], ax=ax, orient='h', color=box_color)
        ax.set_title(col, fontsize=20)

    # Hide unused subplots.
    for ax in axes[num_col:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# For now, let's evaluate the models' performance including outliers

In [None]:
# Variance inflation factors (VIF) to gauge multicollinearity among predictors.

# 1. Select numeric predictors only; the target 'Price' is not a predictor
#    and is therefore left out of the VIF design matrix.
X = df.select_dtypes(include=['number']).drop(columns=['Price'], errors='ignore')

# 2. statsmodels' variance_inflation_factor uses the design matrix exactly as
#    given and does not add an intercept, so append a constant column.
X_const = X.assign(const=1.0)

# 3. Calculate VIF for each predictor (the constant column is skipped because
#    it sits after the X.shape[1] predictor columns).
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X_const.values, i) for i in range(X.shape[1])],
})

# 4. Sort and display
vif_data = vif_data.sort_values(by="VIF", ascending=False)
print(vif_data)


In [None]:
# PCA on the predictors only: the target 'Price' is dropped so that it does
# not leak into the components.
X = df.select_dtypes(include=[float, int]).drop(columns=['Price'], errors='ignore')

# Standardize features before PCA. A dedicated name is used so this scaler is
# not confused with the `scaler` fitted on the training split for modelling.
pca_scaler = StandardScaler()
X_scaled = pca_scaler.fit_transform(X)

# Apply PCA, keeping as many components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

print("Original number of features:", X.shape[1])
print("Reduced number of components:", X_pca.shape[1])

# Explained variance ratio of each component
print("Explained variance ratio:", pca.explained_variance_ratio_)

# PCA components as a DataFrame, aligned with the rows of X
pca_df = pd.DataFrame(
    X_pca,
    columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
    index=X.index,
)


# Model Training and Evaluation

In [None]:
# Target is the 'Price' column; every other column of df is a predictor.
y = df['Price']
x = df.drop(columns=['Price'])

In [None]:
# Hold out 10% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.9,
    random_state=7,
)

print("Training set size:", len(x_train))
print("Test set size:", len(x_test))

In [None]:
# Fit the scaler on the training predictors only, then apply the same
# transformation to both the training and the test predictors.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


# Models 

In [None]:
# Linear model candidates, each created with its default hyperparameters.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet()
}

# One record per model with MAE and R2 on the train and test splits.
results = []
for name, model in models.items():
    model.fit(x_train_scaled, y_train)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    results.append({
        'Model': name,
        'Train MAE': mean_absolute_error(y_train, y_pred_train),
        'Train R2': r2_score(y_train, y_pred_train),
        'Test MAE': mean_absolute_error(y_test, y_pred_test),
        'Test R2': r2_score(y_test, y_pred_test)
    })

# Summary table, best test R2 first.
results_df = pd.DataFrame(results).sort_values(by='Test R2', ascending=False)

print(results_df)


# Based on the results, we can see that:
# - The Linear Regression model has the best performance on the test set, with the lowest MAE and the highest R2 score.
# - The Ridge Regression model performs well, but its MAE is slightly higher than that of the Linear Regression model.
# - The Lasso Regression model has a higher MAE than both the Linear and Ridge Regression models, indicating that it may not be the best choice for this dataset.
# - The ElasticNet model also shows a higher MAE than Linear and Ridge Regression, indicating that it may not be the best choice for this dataset either.



In [None]:
# Baseline Random Forest with default hyperparameters, trained on the
# unscaled predictors. random_state is fixed so the metrics are reproducible
# across runs (same seed as the grid-search estimator).
random_model = RandomForestRegressor(random_state=42)
random_model.fit(x_train, y_train)

# Predictions
rf_y_pred_train = random_model.predict(x_train)
rf_y_pred_test = random_model.predict(x_test)

# Train Metrics
rf_train_mae = mean_absolute_error(y_train, rf_y_pred_train)
rf_train_r2 = r2_score(y_train, rf_y_pred_train)

print("Random Forest Train MAE:", rf_train_mae)
print("Random Forest Train R2:", rf_train_r2)

# Test Metrics
rf_test_mae = mean_absolute_error(y_test, rf_y_pred_test)
rf_test_r2 = r2_score(y_test, rf_y_pred_test)

print("Random Forest Test MAE:", rf_test_mae)
print("Random Forest Test R2:", rf_test_r2)


In [None]:
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 combinations, each evaluated with
# 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by R2, using all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score (CV R2):", grid_search.best_score_)

# Estimator with the best parameter combination found by the search.
best_rf = grid_search.best_estimator_


In [None]:
# Predictions of the tuned forest on both splits.
y_pred_train = best_rf.predict(x_train)
y_pred_test = best_rf.predict(x_test)

# MAE and R2 on the training split.
train_mae = mean_absolute_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)

# MAE and R2 on the held-out test split.
test_mae = mean_absolute_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

# Report train metrics first, then test metrics.
for label, value in (
    ("Best RF Train MAE:", train_mae),
    ("Best RF Train R2:", train_r2),
    ("Best RF Test MAE:", test_mae),
    ("Best RF Test R2:", test_r2),
):
    print(label, value)


These metrics suggest that the tuned Random Forest model performs well without obvious overfitting: the test performance is very close to the training performance, which is a good sign.

In [None]:
# Residuals of the tuned forest on the test split (actual minus predicted).
y_pred_test = best_rf.predict(x_test)
residuals = y_test - y_pred_test

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_test, residuals, alpha=0.6)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals (Actual - Predicted)')
ax.set_title('Residual Plot for Random Forest Regressor')
plt.show()


In [None]:
# Kernel density estimate of the test residuals. Title and axis label are set
# so the figure can be read on its own.
fig, ax = plt.subplots()
sns.kdeplot(residuals, ax=ax)
ax.set_title('Residual Distribution for Random Forest Regressor')
ax.set_xlabel('Residuals (Actual - Predicted)')
plt.show()

In [None]:
# Importance of each predictor in the tuned forest, paired with the column
# names of x (the frame the forest was trained on).
feat_names = x.columns
importances = best_rf.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feat_names, ax=ax)
ax.set_title("Feature Importances - Random Forest")
fig.tight_layout()
plt.show()

In [None]:
# Exporting best Model
joblib.dump(best_rf, 'best_rf.pkl')

# NOTE: `scaler` was fitted for the linear models. best_rf was trained on the
# unscaled x_train, so this scaler must NOT be applied to inputs of best_rf.
# It is still exported under the same file name for existing consumers.
joblib.dump(scaler, 'scaler.pkl')

# Record the preprocessing the model depends on, so inference code can
# reproduce it: the predictor order, and which columns differ from the raw
# frame `data` (in this notebook only the log1p step changes values in df).
joblib.dump(
    {
        'feature_names': list(x.columns),
        'log1p_columns': [c for c in x.columns if not df[c].equals(data[c])],
        'target_log1p': not df['Price'].equals(data['Price']),
    },
    'preprocessing_meta.pkl',
)