In [3]:
import pandas as pd

df = pd.read_csv("flipkart_mobile_phones.csv")

In [4]:
df.info()
df.isnull().sum()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           600 non-null    int64  
 1   Product Name         600 non-null    object 
 2   Sale & Actual Price  600 non-null    object 
 3   Raview & Rattings    599 non-null    object 
 4   Offer                600 non-null    object 
 5   Camera               600 non-null    object 
 6   Memory               600 non-null    object 
 7   Battery              600 non-null    object 
 8   Display              600 non-null    object 
 9   Ratting Number       599 non-null    float64
dtypes: float64(1), int64(1), object(8)
memory usage: 47.0+ KB


Unnamed: 0.1,Unnamed: 0,Ratting Number
count,600.0,599.0
mean,299.5,4.289316
std,173.349358,0.2441
min,0.0,3.0
25%,149.75,4.2
50%,299.5,4.3
75%,449.25,4.4
max,599.0,4.8


In [5]:
df['Sale & Actual Price'] = df['Sale & Actual Price'].astype(str)

In [6]:
# Selling Price (first ₹ value)
df['selling_price'] = df['Sale & Actual Price'].str.extract(r'₹([\d,]+)')

# Actual Price (second ₹ value)
df['actual_price'] = df['Sale & Actual Price'].str.extract(r'₹[\d,]+₹([\d,]+)')

# Discount Percentage
df['discount_price'] = df['Sale & Actual Price'].str.extract(r'(\d+)%')

In [7]:
df[['selling_price', 'actual_price', 'discount_price']].head()


Unnamed: 0,selling_price,actual_price,discount_price
0,18999,3399944,99944
1,2472,299917,99917
2,18999,3399944,99944
3,6499,799918,99918
4,8499,1099922,99922


In [8]:
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace('&', 'and')
      .str.replace(' ', '_')
)


In [9]:
df['raview_and_rattings'] = df['raview_and_rattings'].astype(str)


In [10]:
df['rattings'] = df['raview_and_rattings'].str.extract(r'([\d,]+)\s*Ratings')
df['reviews'] = df['raview_and_rattings'].str.extract(r'([\d,]+)\s*Reviews')

In [11]:
df.drop(columns=['raview_and_rattings'], inplace=True)

In [12]:
df.drop(columns=['sale_and_actual_price'], inplace=True)

In [13]:
df.rename(columns={'offer': 'discount_percentage'}, inplace=True)


In [14]:
df['discount_percentage'] = (
    df['discount_percentage']
    .astype(str)
    .str.lower()
    .str.replace('off', '', regex=False)
    .str.replace('%', '', regex=False)
    .str.strip()
)

df['discount_percentage'] = pd.to_numeric(df['discount_percentage'], errors='coerce')


In [15]:
df['discount_percentage'].head()

0    44.0
1    17.0
2    44.0
3    18.0
4    22.0
Name: discount_percentage, dtype: float64

In [16]:
df['battery'] = (
    df['battery']
    .astype(str)
    .str.lower()
    .str.replace('battery', '', regex=False)
    .str.strip()
)

In [17]:
df['battery'].head()


0    5000 mah
1    2000 mah
2    5000 mah
3    5000 mah
4    5000 mah
Name: battery, dtype: object

In [18]:
df['selling_price'] = (
    df['selling_price']
    .astype(str)
    .str.replace(',', '', regex=True)
)

df['selling_price'] = pd.to_numeric(df['selling_price'], errors='coerce')


In [19]:
df['selling_price'].isna().sum()


np.int64(0)

In [20]:
def price_band(price):
    if price < 10000:
        return 'Budget'
    elif price < 20000:
        return 'Mid-Range'
    else:
        return 'Premium'

df['Price_Band'] = df['selling_price'].apply(price_band)

In [21]:
df[['selling_price','Price_Band']].head()

Unnamed: 0,selling_price,Price_Band
0,18999,Mid-Range
1,2472,Budget
2,18999,Mid-Range
3,6499,Budget
4,8499,Budget


In [22]:
df.drop(columns=['discount_price'], inplace=True)

In [23]:
df['display'] = (
    df['display']
    .astype(str)
    .str.lower()
    .str.replace('display', '', regex=False)
    .str.strip()
)

In [24]:
df['display'].head()

0    16.76 cm (6.6 inch) full hd+
1               6.1 cm (2.4 inch)
2    16.76 cm (6.6 inch) full hd+
3       17.13 cm (6.745 inch) hd+
4       17.13 cm (6.745 inch) hd+
Name: display, dtype: object

In [25]:
df['camera'] = (
    df['camera']
    .astype(str)
    .str.lower()
    .str.replace('camera', '', regex=False)
    .str.strip()
)

In [26]:
df['camera'].head()

0    50mp + 8mp + 5mp | 13mp front
1                       0.5mp rear
2    50mp + 8mp + 5mp | 13mp front
3           50mp rear  | 5mp front
4           50mp rear  | 5mp front
Name: camera, dtype: object

In [27]:
df['memory'] = df['memory'].astype(str).str.lower()


In [28]:
df[['expand_value', 'expand_unit']] = df['memory'].str.extract(
    r'expandable\s*upto\s*(\d+)\s*(tb|gb)'
)


In [29]:
df['expand_value'] = pd.to_numeric(df['expand_value'], errors='coerce')

df['Expandable'] = df.apply(
    lambda row: row['expand_value'] * 1024 if row['expand_unit'] == 'tb'
    else row['expand_value'],
    axis=1
)


In [30]:
df['Expandable'] = df['Expandable'].fillna(0)


In [31]:
df['RAM'] = pd.to_numeric(
    df['memory'].str.extract(r'(\d+)\s*gb\s*ram')[0],
    errors='coerce'
).fillna(0)

df['ROM'] = pd.to_numeric(
    df['memory'].str.extract(r'(\d+)\s*gb\s*rom')[0],
    errors='coerce'
).fillna(0)


In [32]:
df.drop(columns=['expand_value', 'expand_unit', 'memory'], inplace=True)


In [33]:
df[['RAM', 'ROM', 'Expandable']].head()


Unnamed: 0,RAM,ROM,Expandable
0,8.0,128.0,1024.0
1,0.0,4.0,0.0
2,8.0,128.0,1024.0
3,4.0,64.0,1024.0
4,6.0,128.0,1024.0


In [34]:
df['RAM'] = df['RAM'].astype(int).astype(str) + 'gb'
df['ROM'] = df['ROM'].astype(int).astype(str) + 'gb'
df['Expandable'] = df['Expandable'].astype(int).astype(str) + 'gb'


In [35]:
for col in ['RAM', 'ROM', 'Expandable']:
    df[col] = df[col].replace('nangb', '0gb')


In [36]:
df[['RAM', 'ROM', 'Expandable']].head()


Unnamed: 0,RAM,ROM,Expandable
0,8gb,128gb,1024gb
1,0gb,4gb,0gb
2,8gb,128gb,1024gb
3,4gb,64gb,1024gb
4,6gb,128gb,1024gb


In [37]:
def gb_to_tb(value):
    value = str(value).lower().strip()
    
    if value.endswith('gb'):
        num = int(value.replace('gb', ''))
        if num >= 1024:
            return f"{num // 1024}tb"
        else:
            return f"{num}gb"
    return value

df['Expandable'] = df['Expandable'].apply(gb_to_tb)


In [38]:
df['Expandable'].head()


0    1tb
1    0gb
2    1tb
3    1tb
4    1tb
Name: Expandable, dtype: object

In [59]:
df['reviews'] = (
    df['reviews']
    .astype(str)
    .str.replace(',', '', regex=True)
)

df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce')

#-- str format----


In [60]:
df['ratting_number'] = (
    df['ratting_number']
    .astype(str)
    .str.extract(r'(\d+\.?\d*)')[0]
)

df['ratting_number'] = pd.to_numeric(df['ratting_number'], errors='coerce')

#----str format---


In [61]:
df['selling_price'] = (
    df['selling_price']
    .astype(str)
    .str.replace(',', '', regex=True)
)

df['selling_price'] = pd.to_numeric(df['selling_price'], errors='coerce')
#----make as str format---


In [57]:
df[['selling_price', 'ratting_number', 'reviews']] = (
    df[['selling_price', 'ratting_number', 'reviews']]
    .fillna(0)
)
#----clean missing values---

In [62]:
df.to_csv("cl_flipkart_data.csv", index=False)