# *PROYECTO ML - TEST*

Todos los cambios que se hagan en el notebook "Proyecto ML - Train" deberán verse reflejados de la misma forma en este
notebook. Por tanto, copiaremos y pegaremos aquí los pasos del "Proyecto ML - Train".

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Path to the pickled model, relative to this notebook's directory.
filename = '../modelos/RandomForestRegressor(prueba 3).sav'

# Open the file in a context manager so the handle is closed as soon as the
# model has been unpickled (the bare open() call left it open).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [3]:
def cat_var(df, cols):
    '''
    Summarise the distinct values taken by the given categorical columns.

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
          An empty list is accepted and yields an empty summary.

    Return: a new Pandas dataframe object (the input dataframe is not modified) with one row per
    entry in "cols", sorted by "number_of_possible_values" in descending order and re-indexed
    from 0, with the following columns:
        - "categorical_variable" => the name of the categorical variable (string).
        - "number_of_possible_values" => the number of unique values found in that column (integer).
        - "values" => the unique values themselves, exactly as returned by Series.unique()
          (an array-like, not a Python list).
    '''
    summary_columns = ["categorical_variable", "number_of_possible_values", "values"]

    rows = []
    for col in cols:
        unique_values = df[col].unique()
        rows.append({"categorical_variable": col,
                     "number_of_possible_values": len(unique_values),
                     "values": unique_values})

    # Naming the columns explicitly keeps the frame well-formed when "cols" is
    # empty; without it the sort below fails with a KeyError on a column-less frame.
    summary = pd.DataFrame(rows, columns=summary_columns)
    summary = summary.sort_values(by="number_of_possible_values", ascending=False)
    return summary.reset_index(drop=True)

In [4]:
# Load the diamonds test CSV into a dataframe; the bare name on the last line
# makes the notebook display it.
test_diamonds = pd.read_csv('../data/diamonds_test.csv')
test_diamonds

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [5]:
# 'id' is not part of the frame passed to the model, so remove it here.
# The result is rebound to the same name and displayed.
test_diamonds = test_diamonds.drop(columns=['id'])
test_diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city
0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [6]:
# Rearrange the columns: numeric measurements first, categorical columns last.
# NOTE(review): presumably this must match the column order used when the model
# was trained — confirm against the "Proyecto ML - Train" notebook.
new_order = ['depth', 'table', 'x', 'y', 'z', 'carat', 'cut', 'color', 'clarity', 'city']

test_diamonds = test_diamonds[new_order]

In [7]:
# Display the frame after the column reordering.
test_diamonds

Unnamed: 0,depth,table,x,y,z,carat,cut,color,clarity,city
0,62.7,60.0,5.82,5.89,3.67,0.79,Very Good,F,SI1,Amsterdam
1,61.0,57.0,6.81,6.89,4.18,1.20,Ideal,J,VS1,Surat
2,62.2,61.0,7.38,7.32,4.57,1.57,Premium,H,SI1,Kimberly
3,63.8,54.0,6.09,6.13,3.90,0.90,Very Good,F,SI1,Kimberly
4,62.9,58.0,5.05,5.09,3.19,0.50,Very Good,F,VS1,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
13480,61.9,56.0,5.35,5.32,3.30,0.57,Ideal,E,SI1,Amsterdam
13481,62.2,55.0,5.71,5.73,3.56,0.71,Ideal,I,VS2,New York City
13482,61.6,55.0,5.75,5.71,3.53,0.70,Ideal,F,VS1,Tel Aviv
13483,58.8,57.0,5.85,5.89,3.45,0.70,Very Good,F,SI2,Surat


In [8]:
# Select only the categorical columns for inspection.
# Despite its name, 'diamantes_encoded' still holds the raw string labels at
# this point; no encoding has been applied yet.
cat_cols = ['cut', 'color', 'clarity', 'city']
diamantes_encoded = test_diamonds[cat_cols]
diamantes_encoded

Unnamed: 0,cut,color,clarity,city
0,Very Good,F,SI1,Amsterdam
1,Ideal,J,VS1,Surat
2,Premium,H,SI1,Kimberly
3,Very Good,F,SI1,Kimberly
4,Very Good,F,VS1,Amsterdam
...,...,...,...,...
13480,Ideal,E,SI1,Amsterdam
13481,Ideal,I,VS2,New York City
13482,Ideal,F,VS1,Tel Aviv
13483,Very Good,F,SI2,Surat


In [9]:
# Plain Python list of the categorical column names, displayed for reference.
col_diamantes = diamantes_encoded.columns.tolist()
col_diamantes

['cut', 'color', 'clarity', 'city']

In [10]:
# Summarise each categorical column (number of distinct values and the values
# themselves) and display the summary.
cat_diamantes = cat_var(diamantes_encoded, col_diamantes)
cat_diamantes

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Amsterdam, Surat, Kimberly, Paris, Tel Aviv, ..."
1,clarity,8,"[SI1, VS1, VS2, VVS1, SI2, VVS2, IF, I1]"
2,color,7,"[F, J, H, D, I, G, E]"
3,cut,5,"[Very Good, Ideal, Premium, Good, Fair]"


In [11]:
# Single encoder instance, reused (and refitted) for every categorical column below.
le = LabelEncoder()

In [12]:
# Replace each categorical column with integer codes. The encoder is refitted
# on every column, so after the loop 'le' only remembers the 'city' mapping.
# NOTE(review): the encoder is fitted on the test data itself. The codes only
# agree with the ones used at training time if both sets contain exactly the
# same labels per column — confirm, or reuse the encoders fitted in the Train
# notebook.
# NOTE(review): 'city' is encoded here but dropped before prediction.
for column in ['cut', 'color', 'clarity', 'city']:
    test_diamonds[column] = le.fit_transform(test_diamonds[column])

In [13]:
# Display the frame with the categorical columns replaced by integer codes.
test_diamonds

Unnamed: 0,depth,table,x,y,z,carat,cut,color,clarity,city
0,62.7,60.0,5.82,5.89,3.67,0.79,4,2,2,0
1,61.0,57.0,6.81,6.89,4.18,1.20,2,6,4,10
2,62.2,61.0,7.38,7.32,4.57,1.57,3,4,2,3
3,63.8,54.0,6.09,6.13,3.90,0.90,4,2,2,3
4,62.9,58.0,5.05,5.09,3.19,0.50,4,2,4,0
...,...,...,...,...,...,...,...,...,...,...
13480,61.9,56.0,5.35,5.32,3.30,0.57,2,1,2,0
13481,62.2,55.0,5.71,5.73,3.56,0.71,2,5,5,8
13482,61.6,55.0,5.75,5.71,3.53,0.70,2,2,4,11
13483,58.8,57.0,5.85,5.89,3.45,0.70,4,2,3,10


In [14]:
# Build the model input: every remaining column except 'city'.
final_test = test_diamonds.drop(columns = ['city'])

In [15]:
# Display the feature frame that will be passed to the model.
final_test

Unnamed: 0,depth,table,x,y,z,carat,cut,color,clarity
0,62.7,60.0,5.82,5.89,3.67,0.79,4,2,2
1,61.0,57.0,6.81,6.89,4.18,1.20,2,6,4
2,62.2,61.0,7.38,7.32,4.57,1.57,3,4,2
3,63.8,54.0,6.09,6.13,3.90,0.90,4,2,2
4,62.9,58.0,5.05,5.09,3.19,0.50,4,2,4
...,...,...,...,...,...,...,...,...,...
13480,61.9,56.0,5.35,5.32,3.30,0.57,2,1,2
13481,62.2,55.0,5.71,5.73,3.56,0.71,2,5,5
13482,61.6,55.0,5.75,5.71,3.53,0.70,2,2,4
13483,58.8,57.0,5.85,5.89,3.45,0.70,4,2,3


In [16]:
# Run the loaded model on the prepared feature frame.
prediccion = loaded_model.predict(final_test)

In [17]:
# Display the raw predictions returned by the model.
prediccion

array([3010.036 , 5333.3188, 9173.5156, ..., 3016.378 , 2178.812 ,
        824.6672])

In [18]:
# Wrap the predictions in a dataframe with a single 'price' column.
df_prediccion = pd.DataFrame({'price': prediccion})

# Expose the positional index as an 'id' column placed before 'price'.
# NOTE(review): this relies on the row position matching the 'id' column that
# was dropped earlier; that holds for the rows displayed in this notebook.
df_prediccion.insert(0, 'id', df_prediccion.index)

In [19]:
# Preview the first rows of the id/price table.
df_prediccion.head()

Unnamed: 0,id,price
0,0,3010.036
1,1,5333.3188
2,2,9173.5156
3,3,4159.8604
4,4,1702.0048


In [20]:
# Write the id/price table to CSV; index=False keeps the dataframe index out of the file.
df_prediccion.to_csv(r'../entregas/predicciónRandomForestRegressor(prueba 3).csv', index = False)