In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('new_data1.csv', index_col=0, encoding='utf-8')

In [3]:
data.head()

Unnamed: 0,city,zipcode,state,status_ismissing,propertyType_ismissing,street_ismissing,city_ismissing,sqft_ismissing,target_ismissing,num_missing,...,Heating,Cooling,Parking,lotsize,Pricesqft2,rating1,Distance1,Grades1,target2,sqft2
2,LOSANGELES,90049,CA,False,False,False,False,False,False,0,...,ForcedAir,Central,AttachedGarage,8.0,965.0,8/10,1.19,6-8,2895000.0,3000.0
3,DALLAS,75205,TX,False,False,False,False,False,False,0,...,ForcedAir,Central,DetachedGarage,8.0,371.0,9/10,0.1,PK-4,2395000.0,6457.0
12,PEMBROKEPINES,33028,FL,False,False,False,False,False,False,0,...,ForcedAir,Central,Carport,10.0,185.0,7/10,0.92,PK-5,525000.0,2839.0
14,HOUSTON,77084,TX,True,False,False,False,False,False,1,...,Other,Central,2spaces,2.0,69.0,5/10,0.6,6-8,168800.0,2454.0
25,WASHINGTON,20009,DC,False,False,False,False,False,False,0,...,Radiant,Central,OffStreet,2.0,647.0,10/10,0.14,PK-5,3749000.0,5796.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74871 entries, 2 to 377180
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    74871 non-null  object 
 1   zipcode                 74871 non-null  int64  
 2   state                   74871 non-null  object 
 3   status_ismissing        74871 non-null  bool   
 4   propertyType_ismissing  74871 non-null  bool   
 5   street_ismissing        74871 non-null  bool   
 6   city_ismissing          74871 non-null  bool   
 7   sqft_ismissing          74871 non-null  bool   
 8   target_ismissing        74871 non-null  bool   
 9   num_missing             74871 non-null  int64  
 10  status2                 74871 non-null  object 
 11  propertyType2           74871 non-null  object 
 12  Yearbuilt               74871 non-null  int64  
 13  Remodeledyear           74871 non-null  int64  
 14  Heating                 74871 non-nul

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder 

In [None]:
X = data.drop(['target2',   'propertyType2',  'status2',    #'state',          
               'status_ismissing','propertyType_ismissing', 'street_ismissing', 'city_ismissing',
               'sqft_ismissing', 'target_ismissing', 'num_missing'], axis=1,)
y = data.target2.values.astype('int64')

In [25]:
# Колонки в которых находяться категориальные данные преобразуем через get_dummies
# Колонки в которых находяться числовые данные приведем  к типу float
def _prepare(df_input):
    df_output = df_input.copy()
    #labelencoder = LabelEncoder()
    dicts = {}
    _column =['Yearbuilt','rating1',  'Remodeledyear', 'Heating','Cooling', 'Parking',
              'Grades1', 'city','state','zipcode' ] 
    
    for _name in _column:
           print(_name)
           df_output = pd.get_dummies(df_output, columns=[_name], prefix=[_name+"_is",] ) 
    
    _column =['sqft2','lotsize','Pricesqft2','Distance1'] # 'baths2',
    for _name in _column:
            print(_name)
            df_output[_name] = df_output[_name].astype(float)
    
    return df_output

In [26]:
X1 = _prepare(X) # преобразуем для работы с моделью

In [27]:
X1.head()

Unnamed: 0,baths2,sqft2,lotsize,Pricesqft2,Distance1,Yearbuilt_is_1060,Yearbuilt_is_1735,Yearbuilt_is_1740,Yearbuilt_is_1750,Yearbuilt_is_1794,...,zipcode_is_99217,zipcode_is_99218,zipcode_is_99223,zipcode_is_99224,zipcode_is_99336,zipcode_is_99337,PrivatePool_is_No,PrivatePool_is_Yes,private pool_is_No,private pool_is_Yes
2,2.0,3000.0,8.0,965.0,1.19,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,8.0,6457.0,8.0,371.0,0.1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,0.0,1507.0,4.0,120.0,0.8,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
11,0.0,3130.0,5.0,83.0,0.4,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
12,3.0,2839.0,10.0,185.0,0.92,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


Разделим на тренировочные и проверочные данные

In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(X1, y, test_size=0.3, random_state=42, shuffle=True)

In [31]:
num_folds = 10 
seed = 42 
scoring = 'r2' #'neg_mean_squared_error'
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import f1_score, accuracy_score

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best'): 
	train: -382.0902 
	valid: -237.6859


In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

model_dict = {
   'lr': LinearRegression(),
#    'lasso': Lasso(random_state=42),
#    'ridge': Ridge(random_state=42),
#    'enet': ElasticNet(random_state=42),
   'knn': KNeighborsRegressor(),
   'dt': DecisionTreeRegressor(random_state=42),
   'rf': RandomForestRegressor(random_state=42)
}

score_list = []

for model in model_dict:
    model_dict[model].fit(X_train_scaled, y_train)
    y_pred_train = model_dict[model].predict(X_train_scaled)
    y_pred_valid = model_dict[model].predict(X_valid_scaled)
    score_train = r2_score(y_train, y_pred_train)
    score_valid = r2_score(y_valid, y_pred_valid)
    score_list.append((score_train, score_valid))
    msg = "%s: %f (%f)" % (model, np.round(score_train, 4),np.round(score_valid, 4))    
    print(msg)

for model, scores in zip(model_dict.keys(), score_list):
    print(f'{model}: \n\ttrain: {np.round(scores[0], 4)} \n\tvalid: {np.round(scores[1], 4)}')

lr: 
	train: 0.4173 
	valid: -1.7333831902970583e+26
knn: 
	train: 0.5693 
	valid: 0.3578
dt: 
	train: 1.0 
	valid: 0.4659
rf: 
	train: 0.986 
	valid: 0.4874

In [None]:
ensembles = [] 
ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()),('AB', AdaBoostRegressor())]))) 
ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor(random_state=42))]))) 
#ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestRegressor(random_state=42))]))) 
ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()),('ET', ExtraTreesRegressor(random_state=42))]))) 
results = [] 
names = [] 
for name, model in ensembles:    
    kfold = KFold(n_splits=num_folds, random_state=seed)    
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2') # scoring=scoring)    # scoring='r2'
    results.append(cv_results)    
    names.append(name)    
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())    
    print(msg)

ScaledAB: -2.177975 (2.519338)
ScaledGBM: 0.879227 (0.084676)
ScaledET: 0.854543 (0.086850)

In [None]:
Видим, что лучший результат получаем на  RandomForestRegressor(random_state=42)
попробуем еще подобрать параметры

In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

random_grid = {
    'n_estimators': np.arange(200, 501, 20),
    'max_depth': np.arange(2, 51, 2),
    'max_features': [0.5, 0.6, 0.7, 0.8, 0.9],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    scoring='r2',
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)

print(random_search.best_score_)
print(random_search.best_params_)

0.7248443720581363
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': 26}

In [None]:
# измерим качество подобранной модели на отложенной выборке

y_pred_train = random_search.best_estimator_.predict(X_train_scaled)
y_pred_valid = random_search.best_estimator_.predict(X_valid_scaled)

score_train = r2_score(y_train, y_pred_train)
score_valid = r2_score(y_valid, y_pred_valid)

print(f'train: {np.round(score_train, 4)}\nvalid: {np.round(score_valid, 4)}')

train: 0.986 
valid: 0.4934

Сохраним полученную нами модель

In [None]:
import pickle
filename = 'final_model.sav'
pickle.dump(random_search.best_estimator_, open(filename, 'wb'))