In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Lift pandas' display limits so every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load Cleaned Dataset
# A forward slash is used as the separator: it is accepted on Windows, macOS
# and Linux, and it avoids the invalid '\m' escape sequence that a backslash
# separator would place inside a normal (non-raw) string literal.
df = pd.read_csv('data/mobiledata.csv')

In [None]:
# Drop Unused Column
# Rebind `df` instead of mutating in place, and tolerate the column already
# being absent so that re-running this cell does not raise a KeyError.
df = df.drop(columns='sim', errors='ignore')

In [None]:
# Rating Handling
# ('reting' is the column name exactly as it is spelled in the dataset.)
# Step 1: fill a missing rating with the median rating of the same brand.
# Step 2: a brand whose ratings are all missing has no median, so whatever is
#         still missing falls back to the constant 7.
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `df['reting']` acts on a temporary object and
# is not guaranteed to update `df` (it does not under pandas Copy-on-Write).
brand_median_rating = df.groupby('brand')['reting'].transform('median')
df['reting'] = df['reting'].fillna(brand_median_rating).fillna(7)

In [None]:
# OS Cleaning
# Keep only the first space-separated token of each OS string.
os_tokens = df['os'].str.split(' ')
df['os'] = os_tokens.str[0]

In [None]:
# Memory Card Binary Encoding
# 0 -> the phone reports 'Memory Card Not Supported'
# 1 -> any other value (including a missing one)
# Only the 'card' column is written, in a single vectorised assignment, rather
# than overwriting whole rows through a temporary slice. An existing 0 is also
# treated as "not supported" so that re-running the cell leaves an already
# encoded column unchanged.
not_supported_values = ['Memory Card Not Supported', 0]
df['card'] = (~df['card'].isin(not_supported_values)).astype('int64')

In [None]:
# Display Feature Cleaning
# The 'display' text is treated as a comma-separated list of fields.
display_fields = df['display'].str.split(',')

# Screen size: the first space-separated token of the first field, as a float.
df['display_size'] = display_fields.str[0].str.split(' ').str[0].astype(float)

# Refresh rate: whatever precedes 'Hz' in the last field. When the last field
# still contains the word 'Display' no refresh rate was extracted, and the
# value defaults to 60.
refresh_text = display_fields.str[-1].str.split('Hz').str[0]
# na=False keeps the mask strictly boolean even if 'display' has missing values.
lacks_refresh_rate = refresh_text.str.contains('Display', na=False)
# Only this one column is written (no whole-row overwrite via a slice copy),
# and it is converted to a numeric dtype instead of a mix of str and int.
df['refresh_rate'] = refresh_text.mask(lacks_refresh_rate, 60).astype(float)

In [None]:
# Processor Brand Fixes
# Exact-match corrections applied to the processor brand labels. No replacement
# value is itself a key, so one mapping is equivalent to applying the
# corrections one after another.
processor_brand_fixes = {
    'Dimensit': 'Dimensity',
    'Octa': 'Other',
    'Apple': 'Bionic',
    'Google': 'Tensor',
    'UNISOC': 'Unisoc',
}
# Assign the result back to the column: `replace(..., inplace=True)` on
# `df['processor_brand']` acts on a temporary object and is not guaranteed to
# update `df` (it does not under pandas Copy-on-Write).
df['processor_brand'] = df['processor_brand'].replace(processor_brand_fixes)

In [None]:
# Boolean Columns to Integer
# Cast each flag column to int32 so it can be used as a numeric feature.
flag_columns = ('is_5g', 'is_nfc', 'is_ir_blaster', 'fast_charge')
for flag_column in flag_columns:
    df[flag_column] = df[flag_column].astype('int32')

In [None]:
# Camera Cleaning
df['rear_mp'] = df['rear_mp'].astype('int64')

# Front camera values are handled as strings at this point:
#   - the fractional readings '10.5', '10.8' and '11.1' are all mapped to '11'
#   - the stray label 'Main' is mapped to '0'
# All corrections are exact matches on the whole value. They are applied to
# the 'front_mp' column only and assigned back explicitly, instead of chained
# `inplace=True` calls and a whole-row overwrite through a temporary slice.
front_mp_fixes = {
    '10.5': '11',
    '10.8': '11',
    '11.1': '11',
    'Main': '0',
}
df['front_mp'] = df['front_mp'].replace(front_mp_fixes).astype('int')

In [None]:
# Select Final Features
# 'price' is the prediction target; every other column is a model input.
# Column names are spelled exactly as they appear in the dataset.
# `.copy()` makes `data` an independent frame, so the cells that modify it
# afterwards neither touch `df` nor trigger SettingWithCopyWarning.
data = df[['brand','price','reting','is_5g','is_nfc','is_ir_blaster',
           'processor_brand','core','proccessor_speed','ram',
           'internal_memory','battery_size','fast_charge',
           'charging_speed','rear_mp','front_mp',
           'os','display_size','refresh_rate']].copy()

In [None]:
# Core Cleaning
# Work on an independent frame so the assignments below never write through a
# slice of another DataFrame.
data = data.copy()

# Map the textual core description to its core count (kept as a string until
# the final cast). Matching is by case-sensitive substring, applied in this
# order; once a row is rewritten to a digit it can no longer match a later key.
core_counts = {'Octa': '8', 'Hexa': '6', 'Nine': '9', 'Deca': '10'}
for core_word, core_count in core_counts.items():
    # na=False keeps the mask boolean if 'core' contains missing values.
    matches = data['core'].str.contains(core_word, na=False)
    # Write only the 'core' column of the matching rows.
    data.loc[matches, 'core'] = core_count

# The value '2.60' is normalised to '2', and every row whose core value is '2'
# is then removed from the modelling data.
data['core'] = data['core'].replace('2.60', '2')
data = data[data['core'] != '2'].copy()

# Any value that is not an integer string at this point makes the cast raise.
data['core'] = data['core'].astype('int32')

In [None]:
# One Hot Encoding - Processor Brand
# Fit the encoder on the processor brand column and get a dense 0/1 matrix.
ohe = OneHotEncoder(sparse_output=False)
encoded = ohe.fit_transform(data[['processor_brand']])

# Wrap the matrix in a frame that shares `data`'s index so rows line up.
indicator_names = ohe.get_feature_names_out(['processor_brand'])
encoded_df = pd.DataFrame(encoded, index=data.index, columns=indicator_names)

# Swap the original text column for its indicator columns.
data = pd.concat([data.drop(columns='processor_brand'), encoded_df], axis=1)


In [None]:
# One Hot Encoding - Brand
# Fit the encoder on the brand column and get a dense 0/1 matrix.
ohe1 = OneHotEncoder(sparse_output=False)
encoded = ohe1.fit_transform(data[['brand']])

# Wrap the matrix in a frame that shares `data`'s index so rows line up.
indicator_names = ohe1.get_feature_names_out(['brand'])
encoded_df = pd.DataFrame(encoded, index=data.index, columns=indicator_names)

# Swap the original text column for its indicator columns.
data = pd.concat([data.drop(columns='brand'), encoded_df], axis=1)


In [None]:
# One Hot Encoding - OS
# Fit the encoder on the OS column and get a dense 0/1 matrix.
ohe2 = OneHotEncoder(sparse_output=False)
encoded = ohe2.fit_transform(data[['os']])

# Wrap the matrix in a frame that shares `data`'s index so rows line up.
indicator_names = ohe2.get_feature_names_out(['os'])
encoded_df = pd.DataFrame(encoded, index=data.index, columns=indicator_names)

# Swap the original text column for its indicator columns.
data = pd.concat([data.drop(columns='os'), encoded_df], axis=1)


In [None]:
# Train Test Split
# Select the target by name rather than by position: 'price' only ends up as
# the first column because earlier cells dropped 'brand', so positional
# slicing would silently pick a different column if the feature list changed.
X = data.drop(columns='price')
y = data['price']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [None]:
# Model Training
# Fit a linear regression baseline on the training split. `fit` returns the
# estimator itself, so `lr` is the fitted model.
lr = LinearRegression().fit(X_train, y_train)
# Last expression of the cell, so the fitted estimator is displayed.
lr

In [None]:
# Prediction
# Predict the target for the held-out test features using the fitted model.
y_pred = lr.predict(X_test)

In [None]:
# Model Evaluation
# Compare the predictions with the held-out targets.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # root of the MSE, i.e. the error in the target's own units
r2 = r2_score(y_test, y_pred)

# Print one metric per line as "<label> <value>".
for metric_label, metric_value in (("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

## 📘 Notebook: Mobile Price Prediction Baseline
This notebook is used to train a baseline machine learning model for mobile price prediction.

It works on the already cleaned dataset `mobiledata.csv`.

### What is done in this notebook

- Missing rating values are filled using brand-wise median  
- Some columns are cleaned and standardized (OS, processor brand, cores, camera values)  
- Boolean features like 5G, NFC, fast charging are converted to numeric form  
- Only the final required features are selected for modeling  

### Encoding

One-Hot Encoding is applied to:
- Brand  
- Processor brand  
- Operating system  

### Model

- Linear Regression is used as a baseline model  
- Data is split into 80% training and 20% testing  

### Evaluation

Model performance is measured using:
- Mean Squared Error (MSE)  
- Root Mean Squared Error (RMSE)  
- R² score  

### Purpose

- This notebook is kept simple on purpose  
- It helps understand how features affect price  
- It acts as a baseline before adding optimizations later  
