In [53]:
import numpy as np
import pandas as pd
import sklearn.compose as Compose

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb

Descargamos los datasets

In [54]:
# Both splits are hosted in the same GitHub repository; each CSV is read into its own DataFrame.
train_url = "https://raw.githubusercontent.com/w-dan/ML-practica1/main/train.csv"
test_url = "https://raw.githubusercontent.com/w-dan/ML-practica1/main/test.csv"

df_train, df_test = pd.read_csv(train_url), pd.read_csv(test_url)

# Tip: df_test.Memory.unique() lists every distinct value found in a column

## Pre-procesamiento de datos de df_train

Vamos a guardar Price en la variable "y" y la vamos a separar del resto de columnas. 
También vamos a eliminar la columna "LaptopId" porque no nos aporta nada. 

In [55]:
# Keep the identifier column aside before it is removed from the features.
index = df_train['LaptopId']
# Target variable: the laptop price.
y = df_train['Price']
# Features: every column except the target and the identifier (the id carries no signal).
df_train = df_train.drop(columns=['Price', 'LaptopId'])

df_train

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 7,1.2kg
1,HP,Spectre 13-V111dx,Ultrabook,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.11kg
2,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,128GB SSD + 1TB HDD,Intel UHD Graphics 620,Windows 10,2.02kg
3,HP,Envy 13-ad009n,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,256GB SSD,Nvidia GeForce MX150,Windows 10,1.38kg
4,Dell,Latitude 7280,Ultrabook,12.5,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics,Windows 10,1.36kg
...,...,...,...,...,...,...,...,...,...,...,...
907,Dell,Inspiron 5378,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.68kg
908,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows 10,3kg
909,Dell,Inspiron 5567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,16GB,2TB HDD,AMD Radeon R7 M445,Windows 10,2.32kg
910,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,1TB HDD,AMD Radeon R5 M430,Linux,2.2kg


Como existen varios portátiles que disponen de dos tipos de memoria vamos a separar la columna "Memory" para distinguir entre estos tipos y su capacidad.
Lo distinguiremos en Memory1 y Memory2, cada una de ellas estará separada en su capacidad ("MemoryX_storage") y tipo ("MemoryX_type")

La tercera columna no nos aporta nada así que la borramos

###Tratamiento "Memory"

In [56]:
# Remove the word "Storage" (as in "Flash Storage") so each disk is described by
# exactly two tokens: capacity and type. The result has to be assigned back:
# string methods return new values and never modify the column in place.
df_train['Memory'] = df_train['Memory'].str.replace('Storage', '', regex=False)

# A value such as "128GB SSD + 1TB HDD" splits into five whitespace-separated tokens.
memory = df_train["Memory"].str.split(expand = True)
memory.columns = ['Memory1_storage', 'Memory1_type','borrar' ,'Memory2_storage', 'Memory2_type']

memory =  memory.drop('borrar', axis=1)       # the third token is only the "+" separator
memory

Unnamed: 0,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type
0,128GB,SSD,,
1,256GB,SSD,,
2,128GB,SSD,1TB,HDD
3,256GB,SSD,,
4,256GB,SSD,,
...,...,...,...,...
907,256GB,SSD,,
908,128GB,SSD,1TB,HDD
909,2TB,HDD,,
910,1TB,HDD,,


Ahora vamos a eliminar la columna de "Memory" original que ya teníamos y vamos
a concatenar la nueva tabla con la anterior

Ahora vamos a convertir los TB a GB; para ello tenemos que recorrer las filas y sustituir los valores. Como el intérprete de Python no nos deja operar sobre los datos si hay valores "None", vamos a convertirlos temporalmente a String para poder hacerlo.

Después de convertir los distintos datos a GB, vamos a convertir los valores "None", ya sean String o None, a un 0.

Ahora ya tendremos todos los datos de capacidad en Gigabytes

In [57]:
# Replace the original Memory column with the four columns derived from it

df_train = df_train.drop('Memory', axis=1)
df_train = pd.concat([df_train, memory], axis = 1)

# Express every capacity in GB and mark absent disks with 0.
# Whole columns are reassigned instead of calling replace(..., inplace=True) on a
# column selection, which is not guaranteed to write through to the DataFrame.

for storage_col in ['Memory1_storage', 'Memory2_storage']:
    storage = df_train[storage_col].astype(object)
    is_tb = storage.str.contains('TB', na=False)
    # Parse the whole number in front of "TB" (not only its first character),
    # so "1TB" and "1.0TB" both become "1024GB"
    terabytes = pd.to_numeric(storage[is_tb].str.replace('TB', '', regex=False))
    storage.loc[is_tb] = (terabytes * 1024).astype(int).astype(str) + 'GB'
    # Laptops without this disk get a capacity of 0
    df_train[storage_col] = storage.where(storage.notna(), 0)

# Absent disk types become the string "0" so the type columns contain only strings
for type_col in ['Memory1_type', 'Memory2_type']:
    df_train[type_col] = df_train[type_col].fillna("0")


df_train

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type
0,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,Intel HD Graphics 520,Windows 7,1.2kg,128GB,SSD,0,0
1,HP,Spectre 13-V111dx,Ultrabook,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,Intel HD Graphics 620,Windows 10,1.11kg,256GB,SSD,0,0
2,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,Intel UHD Graphics 620,Windows 10,2.02kg,128GB,SSD,1024GB,HDD
3,HP,Envy 13-ad009n,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,Nvidia GeForce MX150,Windows 10,1.38kg,256GB,SSD,0,0
4,Dell,Latitude 7280,Ultrabook,12.5,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,Intel HD Graphics,Windows 10,1.36kg,256GB,SSD,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,Dell,Inspiron 5378,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,Intel HD Graphics 620,Windows 10,1.68kg,256GB,SSD,0,0
908,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,Nvidia GeForce GTX 1050,Windows 10,3kg,128GB,SSD,1024GB,HDD
909,Dell,Inspiron 5567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,16GB,AMD Radeon R7 M445,Windows 10,2.32kg,2048GB,HDD,0,0
910,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,AMD Radeon R5 M430,Linux,2.2kg,1024GB,HDD,0,0


Ahora vamos a hacer como con la RAM, eliminar la subcadena "GB" de los discos. Para ello usamos el mismo método de antes, pero surge un problema, en los portátiles que no tienen alguno de sus discos se nos borra el valor 0.

Esto tiene fácil solución, usamos una función para que nos sustituya los valores vacíos por 0.

###Normalizando

Vamos a convertir los valores de "Typename", "Company", "OpSys", etc... en números para que se pueda trabajar con ellos.

También vamos a eliminar "GB" de los elementos de la columna RAM y las memorias al igual que vamos a eliminar "kg" de la columna "Weight" para dejar sólo los números    

Con las memorias surge un problema, en los portátiles que no tienen alguno de sus discos se nos borra el valor 0.

Esto tiene fácil solución, usamos una función para que nos sustituya los valores vacíos por 0.

Normalizamos las columnas que tenemos

In [58]:
# Strip the two-character unit suffix ("GB" / "kg") from every measurement column.
# A missing disk is stored as 0, so trimming it leaves an empty string.
for unit_col in ['Ram', 'Weight', 'Memory1_storage', 'Memory2_storage']:
    df_train[unit_col] = df_train[unit_col].map(lambda raw: str(raw)[:-2])

# Cells that are empty or whitespace-only (pattern ^\s*$) are set back to 0
df_train = df_train.replace(r'^\s*$', value = 0, regex=True)

# Turn every categorical column into ordinal codes so it can be used numerically
train_categorical_cols = ['Memory1_type', 'Memory2_type', 'TypeName', 'Company', 'OpSys', 'Product', 'ScreenResolution', 'Cpu', 'Gpu']
enc = OrdinalEncoder()
df_train[train_categorical_cols] = enc.fit_transform(df_train[train_categorical_cols])

# Min-max scale all the numeric columns available so far
# NOTE(review): from here on 'Cpu', 'Gpu' and 'ScreenResolution' hold numbers, so any
# later cell that calls .str on them needs the text version - confirm the intended cell order
train_scaled_cols = ['Company', 'TypeName', 'Inches', 'Ram', 'Memory1_storage', 'Memory1_type', 'Memory2_storage', 'Memory2_type', 'OpSys', 'Weight', 'Cpu', 'Gpu', 'ScreenResolution', 'Product']
df_train[train_scaled_cols] = MinMaxScaler().fit_transform(df_train[train_scaled_cols])

df_train

Tratamiento de la columna CPU

Primero separamos las cadenas de caracteres

In [59]:
# Tokenise each CPU description on whitespace (one list of tokens per row).
# NOTE(review): .str needs 'Cpu' to still contain text at this point - confirm this
# runs before any numeric encoding of the column
prueba = df_train['Cpu'].str.split()

A continuación vamos a guardar en un array la parte que queremos separar

In [60]:
# Walk the token lists once and collect three parallel lists:
#   first token              -> manufacturer
#   tokens[1:-2] joined      -> model family (the last two tokens are left out)
#   last token               -> clock frequency, still carrying its unit
Cpu_Company, Cpu_Model, Cpu_Frecuency = [], [], []
for tokens in prueba:
    Cpu_Company.append(tokens[0])
    Cpu_Model.append(" ".join(tokens[1:-2]))
    Cpu_Frecuency.append(tokens[-1])

Después añadimos al Dataset las nuevas columnas que hemos creado

In [61]:
# Assign the lists directly (row by row, positionally). Wrapping each list in a
# DataFrame would align on index labels instead, producing NaN for any row whose
# label is not in 0..n-1.
df_train['Cpu_Company'] = Cpu_Company
df_train['Cpu_Model'] = Cpu_Model
df_train['Cpu_Frecuency(GHz)'] = Cpu_Frecuency

Por último eliminamos las unidades de la columna de Cpu_Frecuency

In [62]:
# Cut the trailing three characters (the "GHz" unit) from every frequency value
df_train['Cpu_Frecuency(GHz)'] = [str(freq)[:-3] for freq in df_train['Cpu_Frecuency(GHz)']]
# The original combined column is no longer needed once it has been split
df_train = df_train.drop(columns='Cpu')
df_train

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Ram,Gpu,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type,Cpu_Company,Cpu_Model,Cpu_Frecuency(GHz)
0,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,4,Intel HD Graphics 520,Windows 7,1.2,128,SSD,0,0,Intel,Core i5,2.3
1,HP,Spectre 13-V111dx,Ultrabook,13.3,IPS Panel Full HD / Touchscreen 1920x1080,8,Intel HD Graphics 620,Windows 10,1.11,256,SSD,0,0,Intel,Core i7,2.7
2,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,8,Intel UHD Graphics 620,Windows 10,2.02,128,SSD,1024,HDD,Intel,Core i7,1.8
3,HP,Envy 13-ad009n,Ultrabook,13.3,IPS Panel Full HD 1920x1080,8,Nvidia GeForce MX150,Windows 10,1.38,256,SSD,0,0,Intel,Core i7,2.7
4,Dell,Latitude 7280,Ultrabook,12.5,Full HD / Touchscreen 1920x1080,8,Intel HD Graphics,Windows 10,1.36,256,SSD,0,0,Intel,Core i5,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,Dell,Inspiron 5378,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,8,Intel HD Graphics 620,Windows 10,1.68,256,SSD,0,0,Intel,Core i5,2.5
908,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,Full HD 1920x1080,8,Nvidia GeForce GTX 1050,Windows 10,3,128,SSD,1024,HDD,Intel,Core i7,2.8
909,Dell,Inspiron 5567,Notebook,15.6,Full HD 1920x1080,16,AMD Radeon R7 M445,Windows 10,2.32,2048,HDD,0,0,Intel,Core i7,2.7
910,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,8,AMD Radeon R5 M430,Linux,2.2,1024,HDD,0,0,Intel,Core i7,2.7


Ahora vamos a tratar la columna de ScreenResolution separando la resolución de la tecnología que usan. Para ello vamos a seguir el mismo procedimiento que seguimos con la columna CPU

In [63]:
# Split "IPS Panel Full HD 1920x1080" into the resolution (last token) and the
# panel technology (everything before it).
prueba = df_train['ScreenResolution'].str.split()
# Take the last token itself (tokens[-1]); a slice (tokens[-1:]) would yield a
# one-element list per row instead of the resolution string
screen_resolution = [tokens[-1] if tokens else None for tokens in prueba]
screen_tecnology = [" ".join(tokens[:-1]) for tokens in prueba]
# Direct list assignment is positional, so it does not depend on the index labels
df_train['Screen_Resolution'] = screen_resolution
df_train['Screen_Technology'] = screen_tecnology
df_train = df_train.drop('ScreenResolution', axis=1)
df_train

Unnamed: 0,Company,Product,TypeName,Inches,Ram,Gpu,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type,Cpu_Company,Cpu_Model,Cpu_Frecuency(GHz),Screen_Resolution,Screen_Technology
0,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4,Intel HD Graphics 520,Windows 7,1.2,128,SSD,0,0,Intel,Core i5,2.3,1920x1080,Full HD
1,HP,Spectre 13-V111dx,Ultrabook,13.3,8,Intel HD Graphics 620,Windows 10,1.11,256,SSD,0,0,Intel,Core i7,2.7,1920x1080,IPS Panel Full HD / Touchscreen
2,Dell,Inspiron 5570,Notebook,15.6,8,Intel UHD Graphics 620,Windows 10,2.02,128,SSD,1024,HDD,Intel,Core i7,1.8,1920x1080,Full HD
3,HP,Envy 13-ad009n,Ultrabook,13.3,8,Nvidia GeForce MX150,Windows 10,1.38,256,SSD,0,0,Intel,Core i7,2.7,1920x1080,IPS Panel Full HD
4,Dell,Latitude 7280,Ultrabook,12.5,8,Intel HD Graphics,Windows 10,1.36,256,SSD,0,0,Intel,Core i5,2.5,1920x1080,Full HD / Touchscreen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,Dell,Inspiron 5378,2 in 1 Convertible,13.3,8,Intel HD Graphics 620,Windows 10,1.68,256,SSD,0,0,Intel,Core i5,2.5,1920x1080,Full HD / Touchscreen
908,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,8,Nvidia GeForce GTX 1050,Windows 10,3,128,SSD,1024,HDD,Intel,Core i7,2.8,1920x1080,Full HD
909,Dell,Inspiron 5567,Notebook,15.6,16,AMD Radeon R7 M445,Windows 10,2.32,2048,HDD,0,0,Intel,Core i7,2.7,1920x1080,Full HD
910,Dell,Inspiron 3567,Notebook,15.6,8,AMD Radeon R5 M430,Linux,2.2,1024,HDD,0,0,Intel,Core i7,2.7,1920x1080,Full HD


In [71]:
# Split each GPU description into manufacturer (first token) and model (the rest).
prueba = df_train['Gpu'].str.split()
Gpu_Company = [tokens[0] for tokens in prueba]
Gpu_Model = [" ".join(tokens[1:]) for tokens in prueba]
# Direct list assignment is positional; wrapping the lists in a DataFrame would
# align on index labels and could introduce NaN
df_train['Gpu_Company'] = Gpu_Company
df_train['Gpu_Model'] = Gpu_Model
df_train = df_train.drop('Gpu', axis=1)
df_train

Unnamed: 0,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type,Cpu_Company,Cpu_Model,Cpu_Frecuency(GHz),Screen_Resolution,Screen_Technology,Gpu_Company,Gpu_Model
0,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4,Windows 7,1.2,128,SSD,0,0,Intel,Core i5,2.3,1920x1080,Full HD,Intel,HD Graphics 520
1,HP,Spectre 13-V111dx,Ultrabook,13.3,8,Windows 10,1.11,256,SSD,0,0,Intel,Core i7,2.7,1920x1080,IPS Panel Full HD / Touchscreen,Intel,HD Graphics 620
2,Dell,Inspiron 5570,Notebook,15.6,8,Windows 10,2.02,128,SSD,1024,HDD,Intel,Core i7,1.8,1920x1080,Full HD,Intel,UHD Graphics 620
3,HP,Envy 13-ad009n,Ultrabook,13.3,8,Windows 10,1.38,256,SSD,0,0,Intel,Core i7,2.7,1920x1080,IPS Panel Full HD,Nvidia,GeForce MX150
4,Dell,Latitude 7280,Ultrabook,12.5,8,Windows 10,1.36,256,SSD,0,0,Intel,Core i5,2.5,1920x1080,Full HD / Touchscreen,Intel,HD Graphics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,Dell,Inspiron 5378,2 in 1 Convertible,13.3,8,Windows 10,1.68,256,SSD,0,0,Intel,Core i5,2.5,1920x1080,Full HD / Touchscreen,Intel,HD Graphics 620
908,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,8,Windows 10,3,128,SSD,1024,HDD,Intel,Core i7,2.8,1920x1080,Full HD,Nvidia,GeForce GTX 1050
909,Dell,Inspiron 5567,Notebook,15.6,16,Windows 10,2.32,2048,HDD,0,0,Intel,Core i7,2.7,1920x1080,Full HD,AMD,Radeon R7 M445
910,Dell,Inspiron 3567,Notebook,15.6,8,Linux,2.2,1024,HDD,0,0,Intel,Core i7,2.7,1920x1080,Full HD,AMD,Radeon R5 M430


Preprocesamiento

In [72]:
# List the distinct values of 'Product' to inspect the column
df_train['Product'].unique()

array(['Portege Z30-C-1CV', 'Spectre 13-V111dx', 'Inspiron 5570',
       'Envy 13-ad009n', 'Latitude 7280',
       'B51-80 (i7-6500U/8GB/1008GB/Radeon', 'IdeaPad 320-15ISK',
       'ZenBook UX410UA-GV183T', 'G752VY-GC162T (i7-6700HQ/16GB/1TB',
       'Surface Laptop', 'Zenbook 3', 'V131 (X5-Z8350/4GB/32GB/FHD/W10)',
       'GS63VR 7RF', '17-bs000nv I3', 'ProBook 450', 'Probook 430',
       'Aspire 3', 'Yoga 910-13IKB', 'ThinkPad T470s', 'Probook 640',
       'XPS 13', 'Omen 15-AX205na', 'Aspire 5', 'MacBook 12"',
       'K146 (N3350/4GB/32GB/W10)', '250 G6', '250 G5', 'Inspiron 3567',
       'IdeaPad 320-15IKBN', 'Thinkpad X270', 'ROG Strix',
       'ROG GL703VD-GC028T', 'Inspiron 5379',
       'A541NA-GO342 (N3350/4GB/500GB/Linux)', 'ProBook 430',
       'SmartBook 140', 'Stream 14-AX040wm',
       'V330-15IKB (i5-8250U/8GB/256GB/FHD/W10)', 'Aspire ES1-523',
       'Thinkpad E470', 'ThinkPad P51s', 'Zbook 17', 'Chromebook Plus',
       'Q524UQ-BHI7T15 (i7-7500U/12GB/2TB/GeForce',
    

In [None]:
# Single transformer step: ordinal-encode the column at position 0, labelled "Company"
company_step = ("Company", OrdinalEncoder(), [0])
column_transformer = Compose.ColumnTransformer(transformers=[company_step])

## Pre-procesamiento de datos de df_test

In [64]:
# The identifier carries no predictive information, so it is removed from the features
df_test = df_test.drop(columns=['LaptopId'])

df_test

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,HP,Envy x360,2 in 1 Convertible,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i7 7500U 2.7GHz,16GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.42kg
1,Lenovo,ThinkPad X1,2 in 1 Convertible,14.0,IPS Panel Touchscreen 2560x1440,Intel Core i7 6500U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.27kg
2,Acer,Aspire F5-573G-510L,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,12GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 950M,Windows 10,2.4kg
3,Asus,FX502VM-DM105T (i7-6700HQ/8GB/1TB/GeForce,Gaming,15.6,Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8GB,1TB HDD,Nvidia GeForce GTX 1060,Windows 10,2.2kg
4,HP,EliteBook 850,Ultrabook,15.6,Full HD 1920x1080,Intel Core i7 6500U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 7,1.88kg
...,...,...,...,...,...,...,...,...,...,...,...
386,HP,Probook 470,Notebook,17.3,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,1TB HDD,Nvidia GeForce 930MX,Windows 10,2.5kg
387,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,500GB HDD,Intel HD Graphics,Windows 10,2.20kg
388,Asus,ZenBook UX530UQ-PRO,Ultrabook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,16GB,512GB SSD,Nvidia GeForce 940MX,Windows 10,1.63kg
389,Dell,Inspiron 3576,Notebook,15.6,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,AMD Radeon 520,Windows 10,2.13kg


###Tratamiento "Memory"

Como existen varios portátiles que disponen de dos tipos de memoria vamos a separar la columna "Memory" para distinguir entre estos tipos y su capacidad. Lo distinguiremos en Memory1 y Memory2, cada una de ellas estará separada en su capacidad ("MemoryX_storage") y tipo ("MemoryX_type")

La tercera columna no nos aporta nada así que la borramos

In [65]:
# Remove the word "Storage" (as in "Flash Storage") from every row in one vectorised
# call instead of a per-row loop; missing values are left untouched rather than raising.
df_test['Memory'] = df_test['Memory'].str.replace("Storage", "", regex=False)

# A value such as "128GB SSD + 1TB HDD" splits into five whitespace-separated tokens.
memoryt = df_test["Memory"].str.split(expand = True)
memoryt.columns = ['Memory1_storage', 'Memory1_type','borrar' ,'Memory2_storage', 'Memory2_type']

memoryt =  memoryt.drop('borrar', axis=1)       # the third token is only the "+" separator
memoryt

Unnamed: 0,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type
0,256GB,SSD,,
1,256GB,SSD,,
2,128GB,SSD,1TB,HDD
3,1TB,HDD,,
4,256GB,SSD,,
...,...,...,...,...
386,1TB,HDD,,
387,500GB,HDD,,
388,512GB,SSD,,
389,256GB,SSD,,


Ahora vamos a convertir los TB a GB; para ello tenemos que recorrer las filas y sustituir los valores. Como el intérprete de Python no nos deja operar sobre los datos si hay valores "None", vamos a convertirlos temporalmente a String para poder hacerlo.

Después de convertir los distintos datos a GB, vamos a convertir los valores "None", ya sean String o None, a un 0.

Ahora ya tendremos todos los datos de capacidad en Gigabytes

In [66]:
# Replace the original Memory column with the four columns derived from it

df_test = df_test.drop('Memory', axis=1)
df_test = pd.concat([df_test, memoryt], axis = 1)

# Express every capacity in GB and mark absent disks with 0.
# Whole columns are reassigned instead of calling replace(..., inplace=True) on a
# column selection, which is not guaranteed to write through to the DataFrame.

for storage_col in ['Memory1_storage', 'Memory2_storage']:
    storage = df_test[storage_col].astype(object)
    is_tb = storage.str.contains('TB', na=False)
    # Parse the whole number in front of "TB" (not only its first character),
    # so "1TB" and "1.0TB" both become "1024GB"
    terabytes = pd.to_numeric(storage[is_tb].str.replace('TB', '', regex=False))
    storage.loc[is_tb] = (terabytes * 1024).astype(int).astype(str) + 'GB'
    # Laptops without this disk get a capacity of 0
    df_test[storage_col] = storage.where(storage.notna(), 0)

# Absent disk types become the string "0" so the type columns contain only strings
for type_col in ['Memory1_type', 'Memory2_type']:
    df_test[type_col] = df_test[type_col].fillna("0")

df_test

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type
0,HP,Envy x360,2 in 1 Convertible,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i7 7500U 2.7GHz,16GB,Intel HD Graphics 620,Windows 10,1.42kg,256GB,SSD,0,0
1,Lenovo,ThinkPad X1,2 in 1 Convertible,14.0,IPS Panel Touchscreen 2560x1440,Intel Core i7 6500U 2.5GHz,8GB,Intel HD Graphics 520,Windows 10,1.27kg,256GB,SSD,0,0
2,Acer,Aspire F5-573G-510L,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,12GB,Nvidia GeForce GTX 950M,Windows 10,2.4kg,128GB,SSD,1024GB,HDD
3,Asus,FX502VM-DM105T (i7-6700HQ/8GB/1TB/GeForce,Gaming,15.6,Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8GB,Nvidia GeForce GTX 1060,Windows 10,2.2kg,1024GB,HDD,0,0
4,HP,EliteBook 850,Ultrabook,15.6,Full HD 1920x1080,Intel Core i7 6500U 2.5GHz,8GB,Intel HD Graphics 520,Windows 7,1.88kg,256GB,SSD,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,HP,Probook 470,Notebook,17.3,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,Nvidia GeForce 930MX,Windows 10,2.5kg,1024GB,HDD,0,0
387,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,Intel HD Graphics,Windows 10,2.20kg,500GB,HDD,0,0
388,Asus,ZenBook UX530UQ-PRO,Ultrabook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,16GB,Nvidia GeForce 940MX,Windows 10,1.63kg,512GB,SSD,0,0
389,Dell,Inspiron 3576,Notebook,15.6,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,AMD Radeon 520,Windows 10,2.13kg,256GB,SSD,0,0


Ahora vamos a hacer como con la RAM, eliminar la subcadena "GB" de los discos. Para ello usamos el mismo método de antes, pero surge un problema: en los portátiles que no tienen alguno de sus discos se nos borra el valor 0.

###Normalizando

Vamos a convertir los valores de "Typename", "Company", "OpSys", etc... en números para que se pueda trabajar con ellos.

También vamos a eliminar "GB" de los elementos de la columna RAM y las memorias al igual que vamos a eliminar "kg" de la columna "Weight" para dejar sólo los números    

Con las memorias surge un problema, en los portátiles que no tienen alguno de sus discos se nos borra el valor 0.

Esto tiene fácil solución, usamos una función para que nos sustituya los valores vacíos por 0.

Normalizamos las columnas que tenemos

In [67]:
# Strip the two-character unit suffix ("GB" / "kg") from every measurement column.
# A missing disk is stored as 0, so trimming it leaves an empty string.
for unit_col in ['Ram', 'Weight', 'Memory1_storage', 'Memory2_storage']:
    df_test[unit_col] = df_test[unit_col].map(lambda raw: str(raw)[:-2])

# Cells that are empty or whitespace-only (pattern ^\s*$) are set back to 0
df_test = df_test.replace(r'^\s*$', value = 0, regex=True)

# Turn every categorical column into ordinal codes
# NOTE(review): fit_transform re-fits `enc` (and, below, a fresh MinMaxScaler) on the
# test rows, so the codes and ranges here are derived from test values only - confirm
# they are meant to match the ones learned from the training data
test_categorical_cols = ['TypeName', 'Company', 'OpSys', 'Product', 'ScreenResolution', 'Cpu', 'Gpu', 'Memory1_type', 'Memory2_type']
df_test[test_categorical_cols] = enc.fit_transform(df_test[test_categorical_cols])

# Min-max scale all the numeric columns available so far
test_scaled_cols = ['Company', 'TypeName', 'Inches', 'Ram', 'Memory1_storage', 'Memory1_type', 'Memory2_storage', 'Memory2_type', 'OpSys', 'Weight', 'Cpu', 'Gpu', 'ScreenResolution', 'Product']
df_test[test_scaled_cols] = MinMaxScaler().fit_transform(df_test[test_scaled_cols])

df_test

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Memory1_storage,Memory1_type,Memory2_storage,Memory2_type
0,0.416667,0.201550,0.0,0.385542,0.896552,0.833333,0.225806,0.515152,0.666667,0.186701,0.121569,1.000000,0.0,0.000000
1,0.500000,0.751938,0.0,0.469880,0.862069,0.733333,0.096774,0.439394,0.666667,0.148338,0.121569,1.000000,0.0,0.000000
2,0.000000,0.116279,0.6,0.662651,0.275862,0.600000,0.161290,0.803030,0.666667,0.437340,0.058824,1.000000,0.5,0.333333
3,0.166667,0.213178,0.2,0.662651,0.275862,0.766667,0.096774,0.757576,0.666667,0.386189,0.498039,0.333333,0.0,0.000000
4,0.416667,0.178295,0.8,0.662651,0.275862,0.733333,0.096774,0.439394,0.833333,0.304348,0.121569,1.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,0.416667,0.627907,0.6,0.867470,0.275862,0.933333,0.096774,0.681818,0.666667,0.462916,0.498039,0.333333,0.0,0.000000
387,0.333333,0.368217,0.6,0.662651,0.000000,0.250000,0.000000,0.333333,0.666667,0.386189,0.241176,0.333333,0.0,0.000000
388,0.166667,0.988372,0.8,0.662651,0.275862,0.833333,0.225806,0.696970,0.666667,0.240409,0.247059,1.000000,0.0,0.000000
389,0.333333,0.375969,0.6,0.662651,0.275862,0.666667,0.096774,0.045455,0.666667,0.368286,0.121569,1.000000,0.0,0.000000


##XGBOOST

In [68]:
X_train = df_train[['Ram', 'Memory1_storage', 'Memory1_type', 'Memory2_storage', 'Memory2_type', 'Cpu', 'Gpu', 'ScreenResolution', 'Company', 'OpSys', 'Weight', 'Product', 'Inches', 'TypeName']]
X_test = df_test[['Ram', 'Memory1_storage', 'Memory1_type', 'Memory2_storage', 'Memory2_type', 'Cpu', 'Gpu', 'ScreenResolution', 'Company', 'OpSys', 'Weight', 'Product', 'Inches', 'TypeName']]

model = XGBRegressor()

# Fitting the model
model.fit(X_train, y)

# Pintamos la importancia de cada una de las columnas

%matplotlib inline
xgb.plot_importance(model, ax=plt.gca())

KeyError: "['Cpu', 'ScreenResolution'] not in index"

In [None]:
# Predictions for the test rows and for the rows the model was fitted on
pred = model.predict(X_test)
predTrain = model.predict(X_train)

# Mean absolute error measured on the training rows themselves
train_mae = mean_absolute_error(y, predTrain)
print("MAE: ", train_mae)

In [None]:
# Display the predictions computed for the test set
pred

##Otras cosas:

### Dudas:

- ¿Se puede usar XGBoost para esta práctica?

### Cosas que hacer (Cynthia):

- Hacer lo de las Seed y eso para XGBoost


### Bibliografía

Sitios de los que he sacado información y otros que pueden venir bien luego



split:

https://www.analyticslane.com/2020/10/19/separar-texto-en-columnas-con-pandas-en-python/


iterar elementos del Dataframe:

https://www.analyticslane.com/2021/09/13/pandas-como-iterar-sobre-las-filas-de-un-dataframe-en-pandas/


pandas:

https://guias.makeitreal.camp/pandas/inspeccion-y-seleccion-de-datos


eliminar duplicados (Aquí no hay):

https://cursosinformatica.ucm.es/trial/analisis/