## Importing Libraries / Load Data

In [89]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Read the raw laptop listings from the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='laptops (2).csv')

## Initial Data Exploration

In [90]:
# First look at the raw data: leading rows, summary statistics, and null counts.
# Each print emits the heading and the table on separate lines.
print("Initial Data:", dataset.head(), sep="\n")
print("\nDescriptive Statistics:", dataset.describe(), sep="\n")
print("\nNull Values Count in Each Column:", dataset.isnull().sum(), sep="\n")

Initial Data:
     Price  Stars Number of Reviews  \
0  $149.99    4.2              1926   
1  $749.99    4.6               384   
2  $429.99    4.7               693   
3  $649.99    4.5              1810   
4  $149.00    4.4               266   

                                        Product Name   Brand  Year of Release  \
0  14" Laptop - Intel Celeron - 4GB Memory - 64GB...      HP           2022.0   
1  ENVY 2-in-1 14" Full HD Touch-Screen Laptop - ...      HP           2023.0   
2  Ideapad 3i 15.6" FHD Touch Laptop - Core i5-11...  Lenovo           2022.0   
3  15.6" Touch-Screen Laptop - Intel Core i7 - 16...      HP           2021.0   
4  14" Chromebook - Intel Celeron - 4GB Memory - ...      HP           2022.0   

  Color Category Display Type  Screen Size      Screen Resolution  ...  \
0         Silver          LCD    14 inches        1366 x 768 (HD)  ...   
1         Silver          NaN    14 inches  1920 x 1080 (Full HD)  ...   
2           Gray          LCD  15.6 inches

## Data Cleaning - Dropping Irrelevant Columns

In [91]:
# Data Cleaning and Preprocessing

# Columns that are irrelevant to the analysis and are removed up front
columns_to_drop = ['Battery Life (up to)', 'Voice Assistant Built-in', 'Display Type',
                   'Number of CPU Cores', 'CPU Base Clock Frequency',
                   'Front Facing Camera Video Resolution', 'Number of HDMI Outputs (Total)']

# Rebind instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError once the columns were removed.
# NOTE: errors='ignore' also hides misspelled labels; check columns_to_drop when editing it.
dataset = dataset.drop(columns=columns_to_drop, errors='ignore')

## Handling Missing Values in Stars

In [92]:
# 'Stars' is a key variable, so every row without a rating is removed (in place)
dataset.dropna(axis='index', how='any', subset=['Stars'], inplace=True)

## Data Transformation - Cleaning Specific Columns

In [93]:
# Price cleaning: strip the literal '$' and ',' characters, then convert to float.
# regex=False makes the literal match explicit: read as a regular expression, '$' is
# the end-of-string anchor, and older pandas releases (where regex defaulted to True)
# emit a FutureWarning when a single-character pattern relies on that default.
# astype(str) keeps the cell re-runnable after the column has already become float.
dataset['Price'] = (
    dataset['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Screen Size cleaning: remove the 'inches' unit, trim whitespace, convert to float
dataset['Screen Size'] = (
    dataset['Screen Size']
    .astype(str)
    .str.replace('inches', '', regex=False)
    .str.strip()
    .astype(float)
)

  dataset['Price'] = dataset['Price'].str.replace('$', '').str.replace(',', '').astype(float)


## Operating System Encoding

In [94]:
# Operating System encoding: collapse each value to 'mac', 'windows' or 'chrome'
# when its lower-cased text contains that keyword (checked in this order);
# values matching none of the keywords are left unchanged.
dataset['Operating System'] = dataset['Operating System'].apply(
    lambda value: next(
        (family for family in ('mac', 'windows', 'chrome') if family in str(value).lower()),
        value,
    )
)

# Drop rows with 'iOS' in Operating System as it is not relevant for laptops.
# na=False keeps rows whose Operating System is missing. The explicit .copy()
# makes the filtered result an independent frame, so later column assignments
# on `dataset` do not trigger SettingWithCopyWarning.
dataset = dataset[~dataset['Operating System'].str.contains('iOS', case=False, na=False)].copy()

## RAM Cleaning and Brand Grouping

In [95]:
# System Memory (RAM) cleaning: extract the first number and convert to float.
# A regex extraction replaces float(x.split()[0]) so that values with no space
# before the unit (e.g. '16GB') or with leading text still parse, and missing or
# non-numeric entries become NaN instead of raising ValueError or leaving
# strings in the column.
dataset['System Memory (RAM)'] = (
    dataset['System Memory (RAM)']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

In [96]:

# Brand grouping function
def label_brand_group(brand):
    """Return the market-segment label for a brand.

    The result is 'Mainstream', 'Gaming' or 'Apple' when `brand` exactly matches
    (case-sensitively) one of the names listed for that segment, and 'Other'
    for any other value.
    """
    segments = (
        ('Mainstream', ('HP', 'Lenovo', 'Dell', 'Microsoft', 'Acer', 'Samsung', 'LG')),
        ('Gaming', ('GIGABYTE', 'CORSAIR', 'MSI', 'Alienware', 'Razer', 'HP OMEN')),
        ('Apple', ('Apple',)),
    )
    # Segments are checked in order; the first one containing the brand wins
    for label, members in segments:
        if brand in members:
            return label
    return 'Other'
    
# Label every row with the segment of its brand
dataset['Brand_Group'] = dataset['Brand'].apply(label_brand_group)

# Keep only rows whose brand falls in a named segment (discard 'Other')
dataset = dataset.loc[dataset['Brand_Group'].ne('Other')]

## Final Dataset Preparation

In [97]:
# Narrow the frame to the rating and the features used in the analysis
reviewdataset = dataset[[
    'Stars',
    'Brand_Group',
    'Year of Release',
    'Price',
    'Screen Size',
    'Operating System',
    'System Memory (RAM)',
    'Backlit Keyboard',
]]

In [98]:
# Rows released from 2012 through 2017 are not relevant for the current analysis:
# keep only rows released before 2012 or after 2017. Rows with a missing year
# satisfy neither comparison and are therefore removed as well.
reviewdataset = reviewdataset[
    reviewdataset['Year of Release'].lt(2012)
    | reviewdataset['Year of Release'].gt(2017)
]

In [99]:
# Drop rows with any null values.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating a
# frame that was produced by filtering another one, which pandas may flag with
# SettingWithCopyWarning.
reviewdataset = reviewdataset.dropna()

In [100]:
# One-hot encode the categorical features with get_dummies
categorical_columns = ['Brand_Group', 'Operating System', 'Backlit Keyboard']

# drop_first is False so every category keeps its own indicator column
# (set after deciding to stick with visualizations)
reviewdataset = pd.get_dummies(
    data=reviewdataset,
    columns=categorical_columns,
    drop_first=False,
)

In [101]:
# Write the prepared dataset to 'reviewdataset.csv', omitting the index column
reviewdataset.to_csv(path_or_buf='reviewdataset.csv', index=False)