# Importar Liberías

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importar archivo csv

In [3]:
datos = pd.read_csv('Data.csv')

In [4]:
datos

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Datos independientes 
Caracteristicas de los individuos observados

In [5]:
X = datos.iloc[:, :3].values

In [6]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Datos dependientes

In [9]:
Y = datos.iloc[:, -1].values

In [10]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Tratamientos de los NaN

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean", verbose=0)


In [13]:
imputer

In [14]:
imputer = imputer.fit(X[:,1:3]) 




In [142]:
X[:, 1:3] = imputer.transform(X[:,1:3])

In [143]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Otra forma de tratar los Nan

In [147]:
datos.fillna({"Age": 10, "Salary": 111111})

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,111111.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,10.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [126]:
datos

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Eliminar las filas con Nan

In [78]:
datos.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Codificar datos categoricos

In [161]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [162]:
from sklearn.compose import ColumnTransformer

In [163]:
labelencoder_X = LabelEncoder()

In [164]:
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [165]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [166]:
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],   
    remainder='passthrough'                        
)

In [167]:
ct

ColumnTransformer(remainder='passthrough',
                  transformers=[('one_hot_encoder', OneHotEncoder(), [0])])

In [174]:
X = np.array(ct.fit_transform(X), dtype='int')

In [175]:
X

array([[    1,     0,     1,     0,     1,     0,     0,    44, 72000],
       [    0,     1,     0,     1,     0,     0,     1,    27, 48000],
       [    0,     1,     0,     1,     0,     1,     0,    30, 54000],
       [    0,     1,     0,     1,     0,     0,     1,    38, 61000],
       [    0,     1,     0,     1,     0,     1,     0,    40, 63777],
       [    1,     0,     1,     0,     1,     0,     0,    35, 58000],
       [    0,     1,     0,     1,     0,     0,     1,    38, 52000],
       [    1,     0,     1,     0,     1,     0,     0,    48, 79000],
       [    0,     1,     0,     1,     0,     1,     0,    50, 83000],
       [    1,     0,     1,     0,     1,     0,     0,    37, 67000]])

In [176]:
labelencoder_y = LabelEncoder()

In [177]:
y = labelencoder_y.fit_transform(y)

In [178]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])