In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [102]:
# importamos los datos a trabajar para el entrenamiento
data = pd.read_parquet('train.parquet')

In [106]:
# verificamos los valores nulos en el archivo
data.isnull().sum()

id                              0
url                             0
region                          0
region_url                      0
price                           0
type                            0
sqfeet                          0
beds                            0
baths                           0
cats_allowed                    0
dogs_allowed                    0
smoking_allowed                 0
wheelchair_access               0
electric_vehicle_charge         0
comes_furnished                 0
laundry_options             71171
parking_options            126682
image_url                       0
description                     2
lat                          1722
long                         1722
state                           0
dtype: int64

In [4]:
# para no eliminar datos que puedan ser importantes lo cambiamos los archivos none por sin dato en las columnas que tienen el dato none
data['laundry_options'] = data['laundry_options'].replace({None: 'Sin dato'})
data['parking_options'] = data['parking_options'].replace({None: 'Sin dato'})

In [5]:
data.isnull().sum()

id                            0
url                           0
region                        0
region_url                    0
price                         0
type                          0
sqfeet                        0
beds                          0
baths                         0
cats_allowed                  0
dogs_allowed                  0
smoking_allowed               0
wheelchair_access             0
electric_vehicle_charge       0
comes_furnished               0
laundry_options               0
parking_options               0
image_url                     0
description                   2
lat                        1722
long                       1722
state                         0
dtype: int64

In [6]:
# usamos la columna price para discretizar datos y colocarlos en una lista
data1 = list(data['price'])
category_price = []

for i in data1:
    if i in range(0,1000):
        category_price.append('low')
    elif i in range(1000, 2000):
        category_price.append('medium')
    elif (i >= 2000):
        category_price.append('high')
    else:
        category_price.append(None)

In [46]:
# asignamos la columna category_price y verificamos la presencia de los valores low en la columna
data = data.assign(category_price = category_price)
data['category_price'] = np.where(data['category_price'] == 'low', 1, 0)

In [47]:
# usamos las columnas a transformar para el entrenamiento
new_data = data[['type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options', 'category_price']]

In [63]:
new_data =pd.get_dummies(new_data, columns=['type', 'laundry_options', 'parking_options'])

In [65]:
# guardamos los datos de caracteristicas a excepcion de la columna a predecir
X = new_data.drop('category_price', axis=1)

In [66]:
# guardamos los datos que queremos predecir despues del testeo
y = new_data['category_price']

In [67]:
# importamos el train_test_split para el entrenamiento de los datos con un 80% de datos como entrenamiento y el 20% como testeo
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8)

In [68]:
# hacemos un shape para ver la cantidad de datos
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((277183, 35), (69296, 35), (277183,), (69296,))

In [69]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [70]:
pipe = Pipeline([('classifier', KNeighborsClassifier())])

In [71]:
search_space = [{'classifier': [LogisticRegression()], 
                 'classifier__penalty':['l1','l2'], 
                 'classifier__max_iter':[500, 1000]}, 
                {'classifier': [MLPClassifier()], 
                 'classifier__max_iter':[300, 400, 500], 
                 'classifier__hidden_layer_sizes': [(16, 16, 16), (16, 32, 24), (16, 24, 8)],
                 'classifier__activation': ['tanh', 'relu'],
                 'classifier__solver': ['sgd', 'adam'],
                 'classifier__alpha': [0.0001, 0.05],
                 'classifier__learning_rate': ['constate', 'adaptive']},
                {'classifier': [KNeighborsClassifier()],
                 'classifier__n_neighbors': [4,5,6,7,8,9,10],
                 'classifier__leaf_size': [1,3,5,7],
                 'classifier__algorithm':['auto', 'kd_tree']}]

In [72]:
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=4)

In [73]:
mejor_modelo = clf.fit(X_train, y_train)
print(mejor_modelo.best_estimator_.get_params()['classifier'])

370 fits failed out of a total of 1020.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\pc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3

KNeighborsClassifier(algorithm='kd_tree', leaf_size=5)


In [75]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(algorithm='kd_tree', leaf_size=5)

In [76]:
model.fit(X_train, y_train)

In [77]:
predicted = model.predict(X_test)

In [81]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_test_prob[:,1])
print('- Precision: ', round(precision_score(y_test, y_test_pred),2))
print('- Recall: ', round(recall_score(y_test, y_test_pred),2))
print('- F-score: ', round(f1_score(y_test, y_test_pred),2))
print('- AUC: ', round(auc,2))

- Precision:  0.82
- Recall:  0.82
- F-score:  0.82
- AUC:  0.91


In [82]:
# Trabajamos el programa con el archivo de testeo
data_test = pd.read_parquet('test.parquet')

In [83]:
# limpiamos los datos del archivo de testeo
data_test['laundry_options'] = data_test['laundry_options'].replace({None: 'Sin dato'})
data_test['parking_options'] = data_test['parking_options'].replace({None: 'Sin dato'})

In [125]:
# usamos las columnas a transformar para el entrenamiento
new_data_test = data_test[['type', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished', 'laundry_options', 'parking_options']]

In [126]:
new_data_test =pd.get_dummies(new_data_test, columns=['type', 'laundry_options', 'parking_options'])

In [128]:
# creamos dos columnas para tener las mismas cantidades de columnas en el data_test
new_data_test['type_assisted living'] = 0
new_data_test['type_land'] = 0 

In [127]:
new_data_test.insert(10,"type_assisted living",True)
new_data_test.insert(17,"type_land",True)

In [129]:
predicted_test = model.predict(new_data_test)

In [131]:
res = list(predicted_test)

In [132]:
predicted_test = pd.DataFrame(res, columns=['pred'])
predicted_test

Unnamed: 0,pred
0,0
1,1
2,0
3,0
4,0
...,...
38493,0
38494,1
38495,0
38496,0


In [133]:
# exportamos el df en un csv sin indice
predicted_test.to_csv('valdez101.csv', index = False)