In [7]:
import pandas as pd
import numpy as np
import re

In [8]:
# Load the laptop listings and preview the first five rows.
laptop = pd.read_csv(filepath_or_buffer="../data/laptop.csv")
laptop.head(n=5)

Unnamed: 0,price,brand,model,processor,ram memory,display size,storage capacity,cpu cores,graphics card,graphics memory,rating,reviews,name
0,129900,apple,MacBook Air,Not Specified,8,13.3,256,,,,4.8,73,Apple MacBook Air 13-inch M1 256GB - Oliz Store
1,27999,lenovo,Ideapad 1,Celeron Dual core,4,14.0,128,Dual Core,,,3.3,3,Lenovo Ideapad 1 Intel Celeron N4020 – 4GB DDR...
2,64999,dell,Vostro 3520,Intel Core i5,16,15.6,512,Not Specified,,,4.5,13,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...
3,101999,acer,Nitro 5,Ryzen 7,16,15.6,512,,,,5.0,1,Acer Nitro 5 Ryzen 7 5800H 16GB Ram 512GB SSD ...
4,107999,hp,Victus 15,Intel Core i5,16,15.6,512,,,,4.6,10,HP Victus 15 i5 13th Gen 13420H / 16GB DDR4 RA...


In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.
print(laptop.shape)

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the frame.
laptop.info()

### Cleaning the 'processor' column

In [None]:
def clean_processor(processor):
    """Collapse a free-text processor description into a coarse tier code.

    The tier is returned as a string: the first of "7", "5", "3", "9" that
    occurs anywhere in the text (checked in that order), otherwise "2" for
    Celeron, "6.5" for M1 and "7.5" for M2.  Text that matches none of these
    (for example "Not Specified") is returned unchanged.

    Missing or non-string values are passed through untouched instead of
    raising, and a value that is already a tier code is left alone so that
    re-running the cell does not turn "6.5" into "5" or "7.5" into "7".

    Parameters
    ----------
    processor : object
        Raw cell value from the 'processor' column.

    Returns
    -------
    object
        A tier code string, or the input itself when nothing matched or the
        input was not a string.
    """
    # NaN / numeric cells cannot be searched with ``in``; keep them as they are.
    if not isinstance(processor, str):
        return processor

    # Already cleaned: without this guard the digit checks below would
    # re-classify "6.5" as "5" and "7.5" as "7" on a second pass.
    if processor in {"2", "3", "5", "7", "9", "6.5", "7.5"}:
        return processor

    # Order matters: the first digit found wins.
    for digit in ("7", "5", "3", "9"):
        if digit in processor:
            return digit

    lowered = processor.lower()
    for keyword, tier in (("celeron", "2"), ("m1", "6.5"), ("m2", "7.5")):
        if keyword in lowered:
            return tier
    return processor

# Swap every raw processor description for its tier code, then count the
# entries that are still null.
processor_tiers = laptop["processor"].apply(clean_processor)
laptop["processor"] = processor_tiers
laptop['processor'].isnull().sum()
# laptop['processor'].value_counts()

### Cleaning the 'ram memory' column

In [None]:
# Hand-picked corrections for the 'ram memory' column, applied in this order.
ram_corrections = (
    ("1", "16"),
    ("3", "4"),
    ("Not Specified", "8"),
)
for wrong_value, corrected_value in ram_corrections:
    laptop.loc[laptop["ram memory"] == wrong_value, "ram memory"] = corrected_value

# Rows with no RAM value at all get the same default as "Not Specified".
laptop.loc[laptop["ram memory"].isna(), "ram memory"] = "8"

### Cleaning the 'storage capacity' column

In [None]:
# Count the rows that have no storage capacity recorded.
laptop['storage capacity'].isnull().sum()
# laptop['storage capacity'].value_counts()

In [None]:
# Cast names to str because update_storage calls .split() on each row's name.
laptop['name'] = laptop['name'].astype(str)
# Storage tokens that may appear in a product name, mapped to the capacity
# string written into the 'storage capacity' column.
numbers_to_check = {
    '256GB': '256',
    '64GB': '64',
    '512GB': '512',
    '1TB': '1024',
    '2TB': '2048',
}

# Upper-cased snapshot of the table above so lookups ignore letter case.
# Keeping a private copy also shields update_storage from the later cell that
# rebinds the name ``numbers_to_check`` to a graphics-card table.
_STORAGE_BY_TOKEN = {token.upper(): size for token, size in numbers_to_check.items()}


def update_storage(row):
    """Return the storage capacity for ``row``, inferring it from the name if missing.

    When ``row['storage capacity']`` is not null it is returned unchanged.
    Otherwise ``row['name']`` is split on single spaces and the first token
    that matches a known storage size (ignoring letter case, so "512gb"
    matches as well as "512GB") decides the result.  If no token matches,
    the original missing value is returned.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'storage capacity' and 'name'; 'name' must be a str.

    Returns
    -------
    object
        The existing value, the inferred capacity string, or the original
        missing value.
    """
    current = row['storage capacity']
    if not pd.isnull(current):
        return current

    for word in row['name'].split(' '):
        inferred = _STORAGE_BY_TOKEN.get(word.upper())
        if inferred is not None:
            return inferred
    return current

# Remember which rows start without a storage value so the fills can be reported.
initial_nan_indices = laptop.loc[laptop['storage capacity'].isnull()].index

# Run update_storage once per row.
laptop['storage capacity'] = laptop.apply(update_storage, axis=1)

# Report every row that was missing before and holds a value now.
for index in initial_nan_indices:
    recovered = laptop.at[index, 'storage capacity']
    if pd.isnull(recovered):
        continue
    print(f"Index: {index}, Name: {laptop.at[index, 'name']}, Storage: {laptop.at[index, 'storage capacity']}")

### Cleaning the 'cpu cores' column


In [None]:
# Textual core counts and the value each one is replaced with, in the order
# the replacements are applied.  'Not Specified' becomes a missing value.
core_words = {
    'Quad Core': '4',
    'Octa Core': '8',
    'Hexa Core': '6',
    'Single Core': '1',
    'Dual Core': '2',
    'Deca Core': '10',
    'Dodeca Core': '12',
    'Not Specified': np.nan,
}
for label, replacement in core_words.items():
    laptop.loc[laptop['cpu cores'] == label, 'cpu cores'] = replacement


laptop['cpu cores'].value_counts()
# laptop['cpu cores'].isnull().sum()

In [None]:
# Convert the core counts to numbers.  Missing entries are kept as NaN rather
# than passed to int(), which raises "cannot convert float NaN to integer";
# the null count below reports how many rows still need a value.
laptop['cpu cores'] = laptop['cpu cores'].apply(
    lambda x: int(float(x)) if pd.notna(x) else np.nan
)
laptop['cpu cores'].isnull().sum()
# laptop.to_csv("../laptop.csv", index=False)

In [None]:
def get_cpu_cores(processor):
    """Map a processor tier code to a core count.

    Tiers 2, 3, 5, 7 and 9 map to 2, 4, 6, 8 and 10 cores respectively.
    The tier may be given as a number (``5``, ``5.0``) or as the string form
    that ``clean_processor`` produces (``"5"``); comparing such a string
    directly against a number is always False, so the value is converted to
    ``float`` before the lookup.

    Parameters
    ----------
    processor : object
        Tier code, numeric or numeric string.

    Returns
    -------
    int or None
        The core count, or ``None`` when the value is not one of the known
        tiers (including missing values and free text such as "Not Specified").
    """
    cores_by_tier = {2.0: 2, 3.0: 4, 5.0: 6, 7.0: 8, 9.0: 10}
    try:
        tier = float(processor)
    except (TypeError, ValueError):
        # None, free text, or anything else that is not number-like.
        return None
    # NaN and unknown tiers (e.g. 6.5) are not keys, so .get() yields None.
    return cores_by_tier.get(tier)

# For rows with no core count, store whatever get_cpu_cores returns for the
# row's 'processor' value.
missing_cores = laptop['cpu cores'].isnull()
laptop.loc[missing_cores, 'cpu cores'] = laptop.loc[missing_cores, 'processor'].apply(get_cpu_cores)



In [None]:
def clean_text(text):
    """Remove the characters ``/ , . ( ) : ; -`` from a string.

    Values that are not ``str`` instances are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Character class listing the punctuation marks to drop.
    unwanted = re.compile(r'[\/,.\(\):;\-]')
    return unwanted.sub('', text)
# NOTE(review): df2 is not created in any cell visible here — confirm where it is defined.
# Stringify the 'details' column, then strip punctuation from every entry.
df2['details'] = df2['details'].astype(str).apply(clean_text)


In [None]:
# Join a number to a following "cores"/"Cores" ("8 cores" -> "8cores") so the
# search below sees them as one token.
for spaced, joined in (
    (r'(\d+)\s+cores', r'\1cores'),
    (r'(\d+)\s+Cores', r'\1Cores'),
):
    df2['details'] = df2['details'].str.replace(spaced, joined, regex=True)

# Digits, an optional hyphen, then "cores" in any capitalisation of c and s.
core_count_re = r'(\d+)[-]?[cC]ore[sS]'

# Walk the rows and copy any core count found in 'details' into 'cpu cores'.
for index, row in df2.iterrows():
    detail = row['details']
    if not pd.isnull(detail):
        match = re.search(core_count_re, detail)
        if match:
            cores = int(match.group(1))
            df2.loc[index, 'cpu cores'] = cores
            print("Row:", index)
            print("Cores:", df2.loc[index, 'cpu cores'])
            print("Detail:", detail)

# Show the frame after the update.
print(df2)


### Cleaning the 'graphics card' column


In [None]:
# laptop.loc[laptop['graphics card'] == 1500.0, 'graphics card'] = 1000.0

In [None]:
# laptop.loc[laptop['graphics card'] == 0.0, 'graphics card'] = np.nan
# Truncate every present graphics-card value to an int; missing ones stay NaN.
laptop['graphics card'] = laptop['graphics card'].apply(
    lambda value: np.nan if pd.isna(value) else int(value)
)
# laptop['graphics card'].value_counts()
laptop['graphics card'].isnull().sum()


In [None]:
# NOTE(review): this rebinds ``laptop`` to a different file, so the cleaning done on the
# frame in the cells above does not carry over unless final_dataset.csv already contains it — confirm.
laptop = pd.read_csv('final_dataset.csv')
# Cast names to str because update_graphics calls .split() on each row's name.
laptop['name'] = laptop['name'].astype(str)
# Tokens that may appear in a product name, mapped to the value written into
# the 'graphics card' column.
# NOTE(review): the values look like a numeric score or model number; what they
# mean is not documented in this notebook — confirm.
numbers_to_check = {
    '1050': '1050',
    'UHD': '1000',
    'MX550': '1500',
    'MX350': '1300',
    'MX330': '1300',
    'MX230': '1200',
    'MX450': '1400',
    '2050': '2050',
    '2070': '2070',
    '1650ti': '1650',
    'Iris': '2050',
    '3050TI': '3050',
    '3060': '3060',
    '4050': '4050',
    '4060': '4060',
    '4070': '4070',
    'Radeon': '2000',
    '1660': '1660',
    '3070': '3070',
    'M1': '2500',
    'M2': '2800',
    'M3': '2900',
    'Vostro': '1000',
    '2060': '2060',
    '2040': '2040',
    '3070TI': '3070',
}

# Upper-cased snapshot of the table so lookups ignore letter case; the keys
# above mix cases ('1650ti' vs '3050TI'), which an exact match cannot bridge.
_GRAPHICS_BY_TOKEN = {token.upper(): value for token, value in numbers_to_check.items()}


def update_graphics(row):
    """Return the graphics value for ``row``, inferring it from the name if missing.

    When ``row['graphics card']`` is not null it is returned unchanged.
    Otherwise ``row['name']`` is split on single spaces and the first token
    that matches a known graphics token (ignoring letter case, so "1650Ti"
    and "3050ti" match) decides the result.  If no token matches, the
    original missing value is returned.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'graphics card' and 'name'; 'name' must be a str.

    Returns
    -------
    object
        The existing value, the inferred value string, or the original
        missing value.
    """
    current = row['graphics card']
    if not pd.isnull(current):
        return current

    for word in row['name'].split(' '):
        inferred = _GRAPHICS_BY_TOKEN.get(word.upper())
        if inferred is not None:
            return inferred
    return current

# Remember which rows start without a graphics value so the fills can be reported.
initial_nan_indices = laptop.loc[laptop['graphics card'].isnull()].index

# Run update_graphics once per row.
laptop['graphics card'] = laptop.apply(update_graphics, axis=1)

# Report every row that was missing before and holds a value now.
for index in initial_nan_indices:
    filled = laptop.at[index, 'graphics card']
    if pd.isnull(filled):
        continue
    print(f"Index: {index}, Name: {laptop.at[index, 'name']}, Graphics Card: {laptop.at[index, 'graphics card']}")


# Three-column view of the frame; the export below is currently disabled.
columns_to_review = ['graphics card', 'name', 'url']
temp = laptop[columns_to_review]
# temp.to_csv('temp.csv', index=False)


In [None]:
# Give rows that are still missing the default "1000.0".
graphics_filled = laptop['graphics card'].fillna("1000.0")
laptop['graphics card'] = graphics_filled
laptop['graphics card'].isnull().sum()

# Cast in two passes: to float first (values may be strings like "1000.0"),
# then to int.
graphics_as_float = laptop['graphics card'].apply(lambda raw: float(raw))
laptop['graphics card'] = graphics_as_float
laptop['graphics card'] = laptop['graphics card'].apply(lambda number: int(number))

In [None]:
# NOTE(review): the only visible code that writes temp.csv is a commented-out to_csv call
# in an earlier cell, so this presumably reads a separately saved copy — confirm the file exists.
temp = pd.read_csv('temp.csv')
# Replace the 'graphics card' column with the one from temp.csv, overwriting the
# values computed in the preceding cells.
laptop['graphics card'] = temp['graphics card']
# laptop.to_csv("laptop.csv", index=False)
# df2.to_csv('corecolumns.csv', index=False)