# Análisis de datos con Python.
## Introducción
* Importar librerías de Python y cargar el dataset

In [37]:
import requests
import pandas as pd
import numpy as np

# Source of the raw automobile dataset.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# The file ships without a header row, so pandas must not treat the first
# record as column names.
df = pd.read_csv(URL, header=None)

# Preview the first 5 rows of the dataset.
df.head(5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


## Preprocesamiento de datos
* Asignación de encabezados.

In [44]:
# Column names for the 26 fields of the dataset, in file order.
headers = [
    # classification / insurance
    "symboling", "normalized-losses",
    # general description of the car
    "make", "fuel-type", "aspiration", "num-of-doors", "body-style",
    "drive-wheels", "engine-location",
    # dimensions and weight
    "wheel-base", "length", "width", "height", "curb-weight",
    # engine
    "engine-type", "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm",
    # consumption and price
    "city-mpg", "highway-mpg", "price",
]

# Replace the default integer column labels with the names above.
df.columns = headers

df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


* Conociendo el dataset

In [45]:
df.dtypes  # show the data type of each column

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [46]:
df.describe()  # summary statistics for the numeric columns

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0
mean,0.840796,122.0,98.797015,174.200995,65.889055,53.766667,2555.666667,126.875622,10.164279,25.179104,30.686567
std,1.254802,31.99625,6.066366,12.322175,2.101471,2.447822,517.296727,41.546834,4.004965,6.42322,6.81515
min,-2.0,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,101.0,94.5,166.8,64.1,52.0,2169.0,98.0,8.6,19.0,25.0
50%,1.0,122.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,137.0,102.4,183.5,66.6,55.5,2926.0,141.0,9.4,30.0,34.0
max,3.0,256.0,120.9,208.1,72.0,59.8,4066.0,326.0,23.0,49.0,54.0


In [47]:
# include="all" extends the summary to every column, including the non-numeric ones
df.describe(include="all")

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,201.0,201.0,201,201,201,199,201,201,201,201.0,...,201.0,201,197.0,197.0,201.0,199.0,199.0,201.0,201.0,201.0
unique,,,22,2,2,2,5,3,2,,...,,8,38.0,36.0,,58.0,22.0,,,186.0
top,,,toyota,gas,std,four,sedan,fwd,front,,...,,mpfi,3.62,3.4,,68.0,5500.0,,,16500.0
freq,,,32,181,165,113,94,118,198,,...,,92,23.0,19.0,,19.0,36.0,,,2.0
mean,0.840796,122.0,,,,,,,,98.797015,...,126.875622,,,,10.164279,,,25.179104,30.686567,
std,1.254802,31.99625,,,,,,,,6.066366,...,41.546834,,,,4.004965,,,6.42322,6.81515,
min,-2.0,65.0,,,,,,,,86.6,...,61.0,,,,7.0,,,13.0,16.0,
25%,0.0,101.0,,,,,,,,94.5,...,98.0,,,,8.6,,,19.0,25.0,
50%,1.0,122.0,,,,,,,,97.0,...,120.0,,,,9.0,,,24.0,30.0,
75%,2.0,137.0,,,,,,,,102.4,...,141.0,,,,9.4,,,30.0,34.0,


* Preprocesamiento de datos (data cleaning, data wrangling)

*Manejo de valores nulos*

In [48]:
# The raw data marks missing values with "?"; turn them into NaN so pandas
# recognises them as nulls. The result is reassigned instead of using
# inplace=True, so the frame displayed by earlier cells is not mutated
# behind the reader's back.
df = df.replace("?", np.nan)

# Number of null values per column.
df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         2
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [52]:
# Columns that hold numeric data but were read as text because of the "?"
# placeholders; they are cast to float and their nulls filled with the mean.
numeric_cols = ["normalized-losses", "stroke", "bore", "horsepower", "peak-rpm"]

# Drop the rows (axis=0) whose "price" is null.
# NOTE(review): "price" itself is not converted here and is listed as object
# in the dtypes output — confirm whether it should also be cast to float.
df = df.dropna(subset=["price"], axis=0)

# Cast the numeric columns to float. astype returns a new frame, so nothing
# is assigned column-by-column onto the result of dropna.
df = df.astype({col: float for col in numeric_cols})

# Fill values: the column mean for the numeric columns (computed after the
# rows without price were dropped) ...
fill_values = df[numeric_cols].mean().to_dict()
# ... and the most frequent value for "num-of-doors", computed from the data
# instead of being hard-coded.
fill_values["num-of-doors"] = df["num-of-doors"].mode()[0]

# A dict passed to fillna applies each value only to its own column.
df = df.fillna(value=fill_values)