In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Notebook display settings: lift pandas' truncation limits so rendered frames
# show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load Dataset
# A forward slash is used as the separator: the previous 'data\mobiledata.csv'
# contained the invalid escape sequence '\m' (SyntaxWarning on recent Python
# versions) and only resolved on Windows. Forward slashes work on every platform.
df = pd.read_csv('data/mobiledata.csv')

In [None]:
# Basic Cleaning
# Remove the 'sim' column. The result is reassigned (no inplace=True) and
# errors='ignore' is passed so that re-running this cell after the column is
# already gone does not raise KeyError.
df = df.drop(columns='sim', errors='ignore')

In [None]:
# Handle Missing Ratings
# NOTE(review): 'reting' is the column name as it appears in the data
# (presumably a misspelling of 'rating'); it is kept so the frame's schema is unchanged.

# Step 1: fill each missing rating with the median rating of the same brand.
brand_median_rating = df.groupby('brand')['reting'].transform('median')
df['reting'] = df['reting'].fillna(brand_median_rating)

# Step 2: rows whose brand has no rating at all are still NaN, so fall back to
# a constant. The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['reting']: that chained in-place form acts on
# an intermediate Series, warns on pandas 2.x and does not modify df at all
# once Copy-on-Write is enabled.
DEFAULT_RATING = 7
df['reting'] = df['reting'].fillna(DEFAULT_RATING)


In [None]:
# OS & Memory Card Cleaning
# Keep only the first space-separated token of the OS string.
df['os'] = df['os'].str.split(' ').str.get(0)

# Encode memory-card support as 0/1 integers:
#   'Memory Card Not Supported' (or an already-encoded 0) -> 0, anything else -> 1.
# Missing values end up as 1, exactly as before. The previous implementation
# used df['card'].replace(..., inplace=True), a chained in-place call that stops
# modifying df under Copy-on-Write; the following line would then have set every
# row to 1. Including 0 in the "unsupported" set keeps the cell safe to re-run.
card_unsupported = df['card'].isin(['Memory Card Not Supported', 0])
df['card'] = (~card_unsupported).astype(int)

In [None]:
# Display Feature Cleaning
# The 'display' column is a comma-separated description. The screen size is the
# first space-separated token of the first comma segment.
first_display_segment = df['display'].str.split(',').str.get(0)
df['display_size'] = first_display_segment.str.split(' ').str.get(0).astype(float)

# The refresh rate, when present, is the number in front of 'Hz' in the last
# comma segment. Rows without such a number (including rows with a missing
# display string) get the default below.
# Extracting the digits with a regex replaces the earlier split-on-'Hz' logic,
# which wrote an int into a string column and raised at astype(int) for any
# last segment containing neither 'Hz' nor the word 'Display'.
DEFAULT_REFRESH_RATE = 60
last_display_segment = df['display'].str.split(',').str.get(-1)
refresh_rate_text = last_display_segment.str.extract(r'(\d+)\s*Hz', expand=False)
df['refresh_rate'] = (
    pd.to_numeric(refresh_rate_text)
    .fillna(DEFAULT_REFRESH_RATE)
    .astype(int)
)


In [None]:
# Processor Brand Cleaning
# Raw label -> label to store. Values not listed here are left untouched.
# NOTE(review): 'Apple' -> 'Bionic' and 'Google' -> 'Tensor' presumably rename the
# vendor to its chip family, and 'Octa' looks like a mis-parsed value — confirm.
processor_brand_aliases = {
    'Dimensit': 'Dimensity',
    'Octa': 'Other',
    'Apple': 'Bionic',
    'Google': 'Tensor',
    'UNISOC': 'Unisoc',
}
df['processor_brand'] = df['processor_brand'].replace(processor_brand_aliases)

In [None]:
# Convert Boolean Flags to Numeric
# Cast every flag column to int32, one column at a time.
flag_columns = ('is_5g', 'is_nfc', 'is_ir_blaster', 'fast_charge')
for flag_column in flag_columns:
    df[flag_column] = df[flag_column].astype('int32')


In [None]:
# Camera Cleaning
df['rear_mp'] = df['rear_mp'].astype('int64')

# Normalise the front-camera values, which are stored as strings, before casting:
#   - the fractional values '10.5', '10.8' and '11.1' are all mapped to '11'
#   - the literal 'Main' is mapped to '0'
#     (NOTE(review): presumably a parsing artefact for rows without front-camera data — confirm)
# The replacements are chained and assigned back to the column. The previous
# df['front_mp'].replace(..., inplace=True) acted on an intermediate Series and
# has no effect under Copy-on-Write, which would make the int cast fail on '10.5'.
df['front_mp'] = (
    df['front_mp']
    .replace(['10.5', '10.8', '11.1'], '11')
    .replace('Main', '0')
    .astype(int)
)


In [None]:
# Processor Core Cleaning
# Replace core-count words with their numeric value. Values stay strings until
# the final cast; the dict order matches the order of the original statements.
core_word_to_count = {'Octa': '8', 'Hexa': '6', 'Nine': '9', 'Deca': '10'}
for core_word, core_count in core_word_to_count.items():
    df.loc[df['core'].str.contains(core_word, na=False), 'core'] = core_count

# NOTE(review): '2.60' looks like a clock speed that ended up in the core
# column — confirm. It is folded into '2', and all '2' rows are then discarded.
df['core'] = df['core'].replace('2.60', '2')

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger SettingWithCopyWarning.
df = df[df['core'] != '2'].copy()
df['core'] = df['core'].astype('int32')

In [None]:
# EDA: Price Distribution
# Histogram of the price column. A title and axis labels are set so the
# figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
df['price'].hist(bins=30, ax=ax)
ax.set(title='Price Distribution', xlabel='Price', ylabel='Count')
plt.show()

In [None]:
# EDA: RAM vs Price
# Scatter of price against RAM; alpha=0.5 keeps overlapping points visible.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['ram'], df['price'], alpha=0.5)
ax.set_xlabel('RAM (GB)')
ax.set_ylabel('Price')
plt.show()

In [None]:
# EDA: Battery vs Average Price
# Mean price for each distinct battery size, drawn as a marked line.
avg_price_by_battery = df.groupby('battery_size')['price'].mean()
ax = avg_price_by_battery.plot(kind='line', marker='o')
ax.set_xlabel('Battery (mAh)')
ax.set_ylabel('Average Price')
ax.grid(True)
plt.show()

In [None]:
# EDA: Price Distribution by RAM
# One box of prices per RAM value.
df.boxplot(column='price', by='ram')
# Blank out the figure-level title that pandas adds for grouped boxplots.
current_figure = plt.gcf()
current_figure.suptitle('')
plt.show()

In [None]:
# Correlation Check
# Pearson correlation (the pandas default, spelled out here) between RAM and price.
corr = df['ram'].corr(df['price'], method='pearson')
print("Correlation between RAM and Price:", corr)

In [None]:
# Normality Check (QQ Plot)
# Probability plot of price against a normal distribution, drawn on an
# explicitly created axes.
fig, ax = plt.subplots()
stats.probplot(df['price'], dist='norm', plot=ax)
plt.show()


In [None]:
# Processor Brand vs Average Price
# Mean price per processor brand, ordered from cheapest to most expensive.
processor_brand_price = (
    df.groupby('processor_brand')['price']
    .mean()
    .sort_values()
)
ax = processor_brand_price.plot(kind='line', marker='o')
ax.set_xlabel('Processor Brand')
ax.set_ylabel('Average Price')
ax.grid(True)
plt.show()

## 📘 Notebook: Final Data Cleaning and EDA
This notebook focuses on **final data cleaning and exploratory data analysis (EDA)** before building the machine learning model.

It works on the dataset `mobiledata.csv`, which is generated from the data preparation step.

### What is done in this notebook

- Drops unnecessary columns not required for analysis  
- Handles missing values, especially in the rating column  
- Cleans and standardizes:
  - Operating system values  
  - Processor brand names  
  - Core count representations  
  - Camera megapixel values  
- Converts boolean features like 5G, NFC, IR Blaster, and fast charging into numeric form  
- Ensures data types are consistent and usable  

### Exploratory Data Analysis (EDA)

The notebook performs basic EDA to understand data behavior, including:

- Price distribution analysis  
- Relationship between RAM and price  
- Average price comparison across battery sizes  
- Price distribution across different RAM variants  
- Correlation check between RAM and price  
- Normality check using QQ plot  
- Price comparison across processor brands  

### Purpose

- To understand the dataset before modeling  
- To identify patterns and relationships in mobile pricing  
- To ensure data quality and consistency  
- To keep EDA and modeling clearly separated  

### Note

This notebook does **not** include any machine learning models.  
It is used only for **cleaning, validation, and data understanding** before training.
Some of the code in this notebook is also repeated in `model_training.ipynb`.

