In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Print the full path of every file under the Kaggle input folder so the
# dataset location can be checked before loading it.
for folder, _subdirs, files in os.walk('/kaggle/input/mobile-uncleaned-data-set-scrapped-real-website'):
    for file_name in files:
        print(os.path.join(folder, file_name))

## Data Cleaning

In [None]:
# Load the scraped mobile-phone table; `df` is the working frame for every later cell.
df = pd.read_csv('/kaggle/input/mobile-uncleaned-data-set-scrapped-real-website/mobile.csv')

In [None]:
# Show the first rows to get a feel for the columns and their raw text format.
df.head()

In [None]:
# Missing values per column in the raw data.
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Tukey fences for 'Spec Score': anything further than 1.5 * IQR outside the
# quartiles is treated as an outlier.
q1, q3 = np.percentile(df['Spec Score'], [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Collect the offending values with a boolean mask.
spec_scores = df['Spec Score']
outside_fences = (spec_scores < lower_bound) | (spec_scores > upper_bound)
outliers_list = spec_scores[outside_fences].tolist()
print(f"\nOutliers (IQR method - List):\n{outliers_list}")

In [None]:
# Names of all numeric columns; used below to draw one box plot per column.
numeric_cols = list(df.select_dtypes(include=np.number).columns)
print(numeric_cols)

In [None]:
# Draw one box plot per numeric column on a grid that is `num_cols` wide.
num_plots = len(numeric_cols)
num_cols = 3
# Ceiling division yields just enough rows to hold every plot; max() keeps the
# grid valid even when there are no numeric columns at all.
num_rows = max(1, (num_plots + num_cols - 1) // num_cols)

# figsize is (width, height) in inches: width follows the number of grid
# columns, height follows the number of grid rows.
plt.figure(figsize=(num_cols * 5, num_rows * 4))
for i, col in enumerate(numeric_cols):
    plt.subplot(num_rows, num_cols, i+1)
    sns.boxplot(y=df[col])
    plt.title(f'Outliers {col}')
    plt.ylabel(col)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Rows whose 'Spec Score' is one of the values flagged by the IQR rule.
df.loc[df['Spec Score'].isin(outliers_list)]

In [None]:
# Report the fences and quartiles computed for 'Spec Score', one per line.
print(
    f'Upper Bound: {upper_bound}',
    f'Lower Bound: {lower_bound}',
    f'IQR: {iqr}',
    f'Quartile 1: {q1}',
    f'Quartile 3: {q3}',
    sep='\n',
)

In [None]:
# Cap 'Spec Score' at the IQR fences. The detection step flags values on both
# sides of the fences, so both tails are capped here (previously only values
# below the lower fence were replaced). Missing values are left untouched.
df['Spec Score'] = df['Spec Score'].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Re-draw the 'Spec Score' box plot to check the effect of the capping step.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=df['Spec Score'], ax=ax)
ax.set_title('Spec Score Without Outliers')
ax.set_ylabel('Spec Score')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Missing values per column before the text columns are filled.
df.isna().sum()

In [None]:
# Drop the 'fm' column. Re-assigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=['fm'], errors='ignore')

In [None]:
# Free-text spec columns: mark missing entries with the 'Unknown' placeholder
# so the string parsers further down always receive a string.
text_spec_columns = [
    'storage',
    'processor',
    'memoryExternal',
    'battery',
    'display',
    'camera',
    'version',
]
for column in text_spec_columns:
    df[column] = df[column].fillna('Unknown')

In [None]:
# Missing values per column after filling the text columns.
df.isna().sum()

## Exploratory Data Analysis

Since we are going to build a prediction model, let's first look at the correlation between the variables.

In [None]:
# Scatter of Spec Score against log10(price + 1); the +1 keeps a zero price
# finite under the logarithm.
fig, ax = plt.subplots(figsize=(12,7))
log_price = np.log10(df['price']+1)
sns.scatterplot(data=df, x=log_price, y='Spec Score', ax=ax)
ax.set_title('Correlation Between Price and Spec Score')
ax.set_xlabel('Log10 Price')
ax.set_ylabel('Spec Score')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

We can observe from the plot above that there is a positive and generally linear relationship between Spec Score and Price. I applied a logarithmic transformation to the price in order to spread out the dense cluster of low-priced phones and make the pattern clearer and more informative. The "ribbon" patterns in the top-left indicate that a higher spec score is generally associated with a higher price. Additionally, some price clusters with a narrower range of spec scores suggest distinct market segments (e.g., entry-level, mid-range, and high-end). The dots at the bottom left confirm the outliers detected earlier: these products are offered with fewer specifications, which potentially results in a lower price.

In [None]:
# Horizontal bar chart of the ten most frequent processor strings.
top_n_processors = df['processor'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12,7))
sns.barplot(
    x=top_n_processors.values,
    y=top_n_processors.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Top 10 Processor Types')
ax.set_xlabel('Number of Phone')
ax.set_ylabel('Processor Type')
plt.show()

In [None]:
# Count how many phones carry each tag and plot the counts as bars.
tag_counts = df['tag'].value_counts()
plt.figure(figsize=(12,7))
# x and y are already aggregated vectors (one entry per tag), so `data=df` must
# not be passed: its length differs from the vectors, which seaborn rejects.
sns.barplot(x=tag_counts.index, y=tag_counts.values)
plt.title('Tag Count')
plt.xlabel('Tag')
plt.ylabel('Number of Tag')
plt.show()

## Feature Engineering

In [None]:
import re

In [None]:
def extract_value_from_text(text, pattern):
    """Return the first number captured by ``pattern`` in ``text`` as a float.

    Parameters
    ----------
    text : str
        Free-text specification. Any non-string value (e.g. NaN or None for a
        missing cell) is treated as "no match".
    pattern : str
        Regular expression whose first capture group is the number to extract.
        Matching is case-insensitive.

    Returns
    -------
    float
        The captured number, or ``np.nan`` when there is no match or ``text``
        is not a string.
    """
    # Missing cells arrive as float NaN / None; re.search would raise TypeError.
    if not isinstance(text, str):
        return np.nan
    match = re.search(pattern, text, re.IGNORECASE)
    return float(match.group(1)) if match else np.nan

In [None]:
# New numeric column -> (source text column, regex whose first group is the number).
spec_patterns = {
    'Battery_mAh': ('battery', r'(\d+)\s*mAh'),
    'Display_Inches': ('display', r'(\d+\.?\d*)\s*(?:inch|inches|")'),
    'Camera_MP': ('camera', r'(\d+)\s*MP'),
}
for new_column, (source_column, pattern) in spec_patterns.items():
    # Bind the pattern as a default argument so each lambda keeps its own regex.
    df[new_column] = df[source_column].apply(
        lambda text, pat=pattern: extract_value_from_text(text, pat)
    )

In [None]:
def extract_ram_rom(storage_str):
    """Parse RAM and internal storage sizes (in GB) from a storage description.

    Parameters
    ----------
    storage_str : str
        Text such as ``"8 GB RAM, 128 GB inbuilt"``. Non-string values (NaN,
        None) are treated as "nothing to parse".

    Returns
    -------
    pandas.Series
        Index ``['RAM_GB', 'Internal_Storage_GB']``; a value is ``np.nan`` when
        it cannot be found in the text.

    Notes
    -----
    * Decimal sizes are read in full ("1.5 GB RAM" -> 1.5, not 5).
    * MB and TB sizes are converted to GB (1 TB = 1024 GB, 1 MB = 1/1024 GB).
    * If no size is followed by "inbuilt"/"storage", a size at the very end of
      the text is used as internal storage, unless it equals the RAM size.
    """
    gb_per_unit = {'mb': 1 / 1024, 'gb': 1.0, 'tb': 1024.0}
    number = r'(\d+(?:\.\d+)?)'
    unit = r'(mb|gb|tb)'

    ram = np.nan
    rom = np.nan

    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(storage_str, str):
        return pd.Series({'RAM_GB': ram, 'Internal_Storage_GB': rom})

    text = storage_str.lower()

    def to_gb(match):
        # group(1) is the number, group(2) the unit.
        return float(match.group(1)) * gb_per_unit[match.group(2)]

    ram_match = re.search(rf'{number}\s*{unit}\s*ram', text)
    if ram_match:
        ram = to_gb(ram_match)

    rom_match = re.search(rf'{number}\s*{unit}\s*(?:inbuilt|storage)', text)
    if rom_match:
        rom = to_gb(rom_match)
    else:
        # Fallback: a bare size at the end of the string.
        trailing_match = re.search(rf'{number}\s*{unit}$', text)
        if trailing_match and (pd.isna(ram) or to_gb(trailing_match) != ram):
            rom = to_gb(trailing_match)

    return pd.Series({'RAM_GB': ram, 'Internal_Storage_GB': rom})

# Parse every storage string into RAM_GB / Internal_Storage_GB columns.
df_storage_parsed = df['storage'].apply(extract_ram_rom)
# Drop any copies left by an earlier run of this cell before attaching the
# fresh ones; a plain concat would add duplicate column names on each re-run.
df = pd.concat(
    [
        df.drop(columns=['RAM_GB', 'Internal_Storage_GB'], errors='ignore'),
        df_storage_parsed,
    ],
    axis=1,
)


In [None]:
# Fill gaps in the parsed numeric specs with the column median, falling back
# to 0 when a column is entirely missing (its median is NaN).
for col in ['Battery_mAh', 'Display_Inches', 'Camera_MP', 'RAM_GB', 'Internal_Storage_GB']:
    median_val = df[col].median()
    fill_value = 0 if pd.isna(median_val) else median_val
    # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
    # temporary selection, which is deprecated and does not update `df` under
    # pandas Copy-on-Write.
    df[col] = df[col].fillna(fill_value)

In [None]:
def extract_processor_brand(processor_name):
    """Map a processor description to a coarse brand/family label.

    The lower-cased text is checked against the keyword groups below in order
    and the label of the first group with a keyword present is returned. The
    order matters: specific MediaTek families ('dimensity', 'helio') come
    before the generic 'mediatek' keyword.

    Returns 'Other/Unknown Processor Brand' when nothing matches.
    """
    brand_keywords = (
        (('snapdragon',), 'Snapdragon'),
        (('dimensity',), 'Dimensity'),
        (('helio',), 'Helio'),
        (('exynos',), 'Exynos'),
        (('a series', 'apple', 'bionic'), 'Apple A Series'),
        (('kirin',), 'Kirin'),
        (('mediatek',), 'MediaTek (General)'),
        (('unisoc',), 'Unisoc'),
        (('intel',), 'Intel'),
    )
    lowered = processor_name.lower()
    for keywords, brand in brand_keywords:
        if any(keyword in lowered for keyword in keywords):
            return brand
    return 'Other/Unknown Processor Brand'

In [None]:
# Derive the coarse processor brand for every phone.
df['Processor_Brand'] = df['processor'].map(extract_processor_brand)

In [None]:
def extract_main_version(version_str):
    """Reduce an OS description to ``'<OS> <major version>'``.

    Parameters
    ----------
    version_str : str
        Text such as ``"Android 13"``, ``"Android v13"`` or ``"iOS 16.4"``.
        Non-string values (NaN, None) are treated as unknown.

    Returns
    -------
    str
        ``'Android <n>'`` or ``'iOS <n>'`` with the major version number, or
        ``'Other/Unknown Version'`` when neither pattern is found. Android is
        checked first.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(version_str, str):
        return 'Other/Unknown Version'

    version_str = version_str.lower()
    # An optional "v" before the number covers the "Android v13" spelling.
    match_android = re.search(r'android\s*v?\s*(\d+)', version_str)
    match_ios = re.search(r'ios\s*v?\s*(\d+)', version_str)

    if match_android:
        return 'Android ' + match_android.group(1)
    elif match_ios:
        return 'iOS ' + match_ios.group(1)
    else:
        return 'Other/Unknown Version'

# Derive the OS family and major version for every phone.
df['Version_Main'] = df['version'].map(extract_main_version)

In [None]:
# Replace missing categorical values with the 'Unknown' placeholder.
for col in ['sim', 'memoryExternal', 'Processor_Brand', 'Version_Main']:
    # fillna returns a new Series; without assigning it back the call has no
    # effect on `df`.
    df[col] = df[col].fillna('Unknown')

## Feature Selection

In [None]:
# Numeric model inputs: the original numeric columns plus the values parsed
# out of the free-text spec columns.
numerical_features = ['Spec Score', 'rating', 'Battery_mAh', 'Display_Inches', 'Camera_MP', 'RAM_GB', 'Internal_Storage_GB']
# Categorical model inputs (one-hot encoded by the preprocessing pipeline).
cat_features = ['tag', 'sim', 'memoryExternal', 'Processor_Brand', 'Version_Main']
# Column the models are trained to predict.
target = 'price'

In [None]:
# Force these columns to numeric dtype; entries that cannot be parsed become NaN.
for column in ('price', 'Spec Score', 'rating'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
# Rows whose price is missing (e.g. coerced to NaN above) have no target to
# learn from and would make model fitting fail, so leave them out of X and y.
# Missing feature values are handled later by the pipeline imputers.
df_model = df.dropna(subset=[target])
X = df_model[numerical_features+cat_features]
y = df_model[target]

## Modelling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the shapes of the train/test partitions, one per line.
print(
    "--- Ukuran Data Setelah Pembagian ---",
    f"Ukuran X_train: {X_train.shape}",
    f"Ukuran X_test: {X_test.shape}",
    f"Ukuran y_train: {y_train.shape}",
    f"Ukuran y_test: {y_test.shape}",
    sep="\n",
)

## Preprocessing Pipelines

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at predict time.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each feature group through its own preprocessing branch.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, cat_features),
])

## Pipelines Model

In [None]:
# Preprocessing followed by an ordinary least-squares regressor.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [None]:
# Preprocessing followed by an XGBoost regressor with a fixed seed.
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(random_state=42)),
])

## Training Model

In [None]:
def evaluate_model(pipeline, name, position):
    """Fit ``pipeline``, print test-set metrics and plot predicted vs. actual.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and draws into panel ``position`` of a 1x2 subplot grid on the current
    figure; the caller creates the figure and shows it.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``predict``
    name : str
        Label used in the printed header and as the panel title.
    position : int
        Panel index (1 or 2) within the 1x2 grid.

    Returns
    -------
    tuple
        ``(pipeline, y_pred)`` - the fitted pipeline and its test predictions.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # MSE is computed once and reused for RMSE.
    mse = mean_squared_error(y_test, y_pred)
    metrics = (
        ("MAE :", mean_absolute_error(y_test, y_pred)),
        ("MSE :", mse),
        ("RMSE:", np.sqrt(mse)),
        ("R2  :", r2_score(y_test, y_pred)),
    )
    print(f"\n=== {name} ===")
    for label, value in metrics:
        print(label, value)

    ax = plt.subplot(1, 2, position)
    sns.scatterplot(x=y_test, y=y_pred, ax=ax)
    ax.set_xlabel("Actual Price")
    ax.set_ylabel("Predicted Price")
    ax.set_title(f"{name}")
    # Dashed diagonal marks perfect predictions.
    low, high = y_test.min(), y_test.max()
    ax.plot([low, high], [low, high], 'r--')
    ax.grid(True)

    return pipeline, y_pred

In [None]:
# Evaluate both models side by side on one figure (left: linear, right: XGBoost).
comparison_fig = plt.figure(figsize=(12, 6))
model_lr, pred_lr = evaluate_model(pipeline_lr, "Linear Regression", 1)
model_xgb, pred_xgb = evaluate_model(pipeline_xgb, "XGBoost Regressor", 2)
comparison_fig.tight_layout()
plt.show()