In [452]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [462]:
df = pd.read_csv("laptop_price.csv", encoding="latin-1")
df = df[["Company", "TypeName", "Inches", "ScreenResolution", "Cpu", "Ram", "Memory", "Gpu", "OpSys", "Weight", "Price_euros"]]

In [463]:
df.isnull().sum()

Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [367]:
df["Ram"].value_counts()

8GB     619
4GB     375
16GB    200
6GB      41
12GB     25
2GB      22
32GB     17
24GB      3
64GB      1
Name: Ram, dtype: int64

In [368]:
df["OpSys"].value_counts()

Windows 10      1072
No OS             66
Linux             62
Windows 7         45
Chrome OS         27
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: OpSys, dtype: int64

# Preprocess

In [369]:
def RamParser(text):
    text = text.replace("GB", "")
    return int(text)

In [370]:
df["Ram"] = df["Ram"].apply(RamParser)

In [371]:
df["Ram"].value_counts()

8     619
4     375
16    200
6      41
12     25
2      22
32     17
24      3
64      1
Name: Ram, dtype: int64

In [372]:
def WeightParser(text):
    text = text.replace("kg", "")
    return float(text)

In [373]:
df["Weight"] = df["Weight"].apply(WeightParser)

In [374]:
df["Weight"].value_counts()

2.20    126
2.10     58
2.00     45
2.40     44
2.30     41
       ... 
4.50      1
1.14      1
3.80      1
3.25      1
2.34      1
Name: Weight, Length: 171, dtype: int64

In [375]:
def TouchscreenParser(text):
    if "Touchscreen" in text:
        return 1
    else:
        return 0

In [376]:
df["Touchscreen"] = df["ScreenResolution"].apply(TouchscreenParser)

In [377]:
df["Touchscreen"].value_counts()

0    1111
1     192
Name: Touchscreen, dtype: int64

In [378]:
def IPSParser(text):
    if "IPS" in text:
        return 1
    else:
        return 0

In [379]:
df["IPS"] = df["ScreenResolution"].apply(IPSParser)

In [380]:
df["IPS"].value_counts()

0    938
1    365
Name: IPS, dtype: int64

In [381]:
def ScreenParser(text):
    text = text[-9:]
    if text[0] == " ":
        return text[-8:]
    return text

In [None]:
df["ScreenResolution"] = df["ScreenResolution"].apply(ScreenParser)
resdf = df["ScreenResolution"].str.split("x", n=1, expand=True)
df["Wide"] = resdf[0]
df["Tall"] = resdf[1]
df["Wide"] = df["Wide"].astype("int64")
df["Tall"] = df["Tall"].astype("int64")

In [383]:
df["Wide"].value_counts()

1920    846
1366    308
3840     43
2560     29
3200     27
1600     23
2304      6
2256      6
1440      4
2880      4
2400      4
2160      2
2736      1
Name: Wide, dtype: int64

In [384]:
df["Tall"].value_counts()

1080    841
768     308
2160     43
1800     31
1440     31
900      27
1600     10
1504      6
1200      5
1824      1
Name: Tall, dtype: int64

In [386]:
df["PPI"] = pow(((pow(df["Wide"], 2)) + (pow(df["Tall"], 2))), 0.5) / (df["Inches"])

In [387]:
df.dtypes

Company              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                   int64
Memory               object
Gpu                  object
OpSys                object
Weight              float64
Price_euros         float64
Touchscreen           int64
IPS                   int64
Wide                  int64
Tall                  int64
PPI                 float64
dtype: object

In [388]:
def CPUParser(text):
    text = " ".join(text.split()[:3])
    if text.split()[0] == "Intel":
        if text == "Intel Core i3" or text == "Intel Core i5" or text == "Intel Core i7":
            return text
        else:
            return "Other Intel Processor"
    else:
        return "AMD Processor"

In [389]:
df["Cpu"] = df["Cpu"].apply(CPUParser)

In [391]:
def GPUParser(text):
    text = text.split()
    return text[0]

In [392]:
df["Gpu"] = df["Gpu"].apply(GPUParser)

In [394]:
df["OpSys"].value_counts()

Windows 10      1072
No OS             66
Linux             62
Windows 7         45
Chrome OS         27
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: OpSys, dtype: int64

In [395]:
def OSParser(text):
    if text == "Windows 7" or text == "Windows 10" or text == " Windows 10 S":
        return "Windows"
    
    elif text == "macOS" or text == "Mac OS X":
        return "macOS"
    
    else:
        return "Freedos"

In [396]:
df["OpSys"] = df["OpSys"].apply(OSParser)

In [399]:
def MemoryParse1(text):
    text = text.replace("GB", "")
    text = text.replace("TB", "000")
    text = re.sub("\.0", "", text)
    return text

In [400]:
df["Memory"] = df["Memory"].apply(MemoryParse1)

In [403]:
def split_memory(row):
    pattern = r'\+'
    parts = re.split(pattern, row)
    return parts

In [None]:
texts = df["Memory"].tolist()
memory_ssd = np.zeros(1303).astype(int)
memory_hdd = np.zeros(1303).astype(int)
memory_hybrid = np.zeros(1303).astype(int)
memory_flash_storage = np.zeros(1303).astype(int)

for i in range(len(texts)):
    text_arr = split_memory(texts[i])
    print("Text Array: ", text_arr)
    for j in range(len(text_arr)):
        text = text_arr[j].strip(" ")
        print("Text:", text)
        if "SSD" in text:
            print(i, "SSD: ", text)
            text = text.replace("SSD", "")
            memory_ssd[i] += int(text)
            #memory_hdd.append(0)
            #memory_hybrid.append(0)
            #memory_flash_storage.append(0)
        elif "HDD" in text:
            print(i, "HDD: ", text)
            text = text.replace("HDD", "")
            #df["HDD"] = text
            #memory_ssd.append(0)
            memory_hdd[i] += int(text)
            #memory_hybrid.append(0)
            #memory_flash_storage.append(0)
        elif "Hybrid" in text:
            print(i, "Hybrid: ", text)
            #df["Hybrid"] = text
            text = text.replace("Hybrid", "")
            #memory_ssd.append(0)
            #memory_hdd.append(0)
            memory_hybrid[i] += int(text)
            #memory_flash_storage.append(0)
        elif "Flash Storage" in text:
            print(i, "Flash Storage: ", text)
            #df["Flash_Storage"] = text
            text = text.replace("Flash Storage", "")
            #memory_ssd.append(0)
            #memory_hdd.append(0)
            #memory_hybrid.append(0)
            memory_flash_storage[i] += int(text)

In [405]:
print(len(memory_ssd))
print(len(memory_hdd))
print(len(memory_hybrid))
print(len(memory_flash_storage))

1303
1303
1303
1303


In [406]:
memory_ssd[615]

128

In [407]:
len(df["Company"])

1303

In [408]:
df["SSD"] = memory_ssd
df["HDD"] = memory_hdd
df["Flash_Storage"] = memory_flash_storage
df["Hybrid"] = memory_hybrid

In [None]:
ndf = df.drop(["ScreenResolution", "Memory", "Hybrid", "Flash_Storage", "Inches", "Wide", "Tall"], axis=1)

In [411]:
ndf.to_csv("cleaned_data.csv")

# Model Training

In [424]:
ndf.dtypes

Company         object
TypeName        object
Cpu             object
Ram              int64
Gpu             object
OpSys           object
Weight         float64
Price_euros    float64
Touchscreen      int64
IPS              int64
PPI            float64
SSD              int64
HDD              int64
dtype: object

In [425]:
ndf["Company"] = ndf["Company"].astype("str")
ndf["TypeName"] = ndf["TypeName"].astype("str")
ndf["Cpu"] = ndf["Cpu"].astype("str")
#ndf["Ram"] = ndf["Ram"].astype("int")
ndf["Gpu"] = ndf["Gpu"].astype("str")
ndf["OpSys"] = ndf["OpSys"].astype("str")

In [426]:
X = ndf.drop(["Price_euros"], axis=1)
y = ndf["Price_euros"]

In [435]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=4242)

In [436]:
X_train.shape, X_test.shape

((1107, 12), (196, 12))

In [437]:
ndf.isnull().sum()

Company        0
TypeName       0
Cpu            0
Ram            0
Gpu            0
OpSys          0
Weight         0
Price_euros    0
Touchscreen    0
IPS            0
PPI            0
SSD            0
HDD            0
dtype: int64

In [438]:
def find_scores(model, y_test, y_pred):
    print("R2 Score:", r2_score(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))

In [442]:
# 0 -> Company
# 1 -> TypeName
# 2 -> Cpu
# 4 -> Gpu
# 5 -> OpSys

col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0, 1, 2, 4, 5])
], remainder="passthrough")

lin_reg = LinearRegression()

pipe1 = Pipeline([
    ("1", col_transformer),
    ("2", lin_reg)
])

pipe1.fit(X_train, y_train)
y_pred = pipe1.predict(X_test)

print("Linear Regression:")
find_scores(pipe1, y_test, y_pred)

Linear Regression:
R2 Score: 0.7968869033616763
MAE: 227.59817588259853


In [444]:
# 0 -> Company
# 1 -> TypeName
# 2 -> Cpu
# 4 -> Gpu
# 5 -> OpSys

col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0, 1, 2, 4, 5])
], remainder="passthrough")

lasso = Lasso(alpha=0.001)

pipe2 = Pipeline([
    ("1", col_transformer),
    ("2", lasso)
])

pipe2.fit(X_train, y_train)
y_pred = pipe2.predict(X_test)

print("Lasso:")
find_scores(pipe2, y_test, y_pred)

Lasso:
R2 Score: 0.7968949767386566
MAE: 227.59185469227245


In [447]:
# 0 -> Company
# 1 -> TypeName
# 2 -> Cpu
# 4 -> Gpu
# 5 -> OpSys

col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0, 1, 2, 4, 5])
], remainder="passthrough")

ridge = Ridge(alpha=10)

pipe3 = Pipeline([
    ("1", col_transformer),
    ("2", ridge)
])

pipe3.fit(X_train, y_train)
y_pred = pipe3.predict(X_test)

print("Ridge:")
find_scores(pipe3, y_test, y_pred)

Ridge:
R2 Score: 0.8034054672168616
MAE: 224.5453126168333


In [450]:
# 0 -> Company
# 1 -> TypeName
# 2 -> Cpu
# 4 -> Gpu
# 5 -> OpSys

col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0, 1, 2, 4, 5])
], remainder="passthrough")

dtr = DecisionTreeRegressor(max_depth=5)

pipe4 = Pipeline([
    ("1", col_transformer),
    ("2", dtr)
])

pipe4.fit(X_train, y_train)
y_pred = pipe4.predict(X_test)

print("DecisionTreeRegressor:")
find_scores(pipe4, y_test, y_pred)

DecisionTreeRegressor:
R2 Score: 0.7469597532318639
MAE: 235.72535061091375


In [451]:
# 0 -> Company
# 1 -> TypeName
# 2 -> Cpu
# 4 -> Gpu
# 5 -> OpSys

col_transformer = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0, 1, 2, 4, 5])
], remainder="passthrough")

rfr = RandomForestRegressor(n_estimators=100, max_samples=0.5, max_features=0.75, max_depth=15, random_state=3)

pipe5 = Pipeline([
    ("1", col_transformer),
    ("2", rfr)
])

pipe5.fit(X_train, y_train)
y_pred = pipe5.predict(X_test)

print("RandomForestRegressor:")
find_scores(pipe5, y_test, y_pred)

RandomForestRegressor:
R2 Score: 0.8504089769515462
MAE: 183.39291769613385


In [453]:
pickle.dump(pipe5, open("RandomForestRegressor.pkl", "wb"))

In [461]:
X_train["Gpu"].value_counts()

Intel     616
Nvidia    338
AMD       152
ARM         1
Name: Gpu, dtype: int64