In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
 
from matplotlib import pyplot as plt
 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('new_data.csv')
data.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0
2,2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
3,3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10,SingleFamily,1927.0,2411236.0,1277600,94107.0,15149489.0
4,4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0


In [3]:
data.shape

(11330, 19)

Будем использовать линейную регрессию для тестирования качества данных

In [4]:
def get_score(X,y, random_seed=42, model=None):
  if model is None:
    model = LinearRegression()
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed )  
  model.fit(X_train, y_train)
  return model.score(X_test, y_test)

Для поиска выбросов будем использовать boxplot

In [5]:
def get_boxplot(X, columns=[]):
  for i in columns:
    sns.boxplot(x=X[i])
  pass  

def get_pairplot(X, columns=None):
  if columns is None:
    columns = list(X.columns)
  sns.pairplot(X[columns])
  pass    

Для заполнения пропущенных значений будем использовать стандартные способы
- заполнение значением
- max
- min
- mode
- median
- mean
- метод индикатора
- линейная регрессия

In [6]:
def get_value(X, column, mode='mean', value=0, columns_for_reg=None):
  if mode == 'value':
    X.loc[X[X[column].isna()].index ,column] = value
  elif mode == 'max':
    X.loc[X[X[column].isna()].index ,column] = X[column].max()
  elif mode == 'min':
    X.loc[X[X[column].isna()].index ,column] = X[column].min()
  elif mode == 'median':
    X.loc[X[X[column].isna()].index ,column] = X[column].median()    
  elif mode == 'mode':
    X.loc[X[X[column].isna()].index, column] = X[column].mode()[0]  
  elif mode == 'indicator':
    X.loc[X[X[column].isna()].index, column] = 0
    X['ind_'+str(column)] = 0
    X.loc[X[X[column].isna()].index, 'ind_'+str(column)] = 1
  elif mode == 'linreg':
    if columns_for_reg is None:
      cols = list(X.select_dtypes([np.number]).columns) 
      cols.remove(column)
    else:
      cols = columns_for_reg
    X_tmp = X.dropna()  
    m = LinearRegression().fit(X_tmp[cols], X_tmp[column])
    X.loc[X[X[column].isna()].index, column] = m.predict(X[X[column].isna()][cols])
  else:
    X.loc[X[X[column].isna()].index, column] = X[column].mean()
  return X  


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   Unnamed: 0.1   11330 non-null  int64  
 2   address        11330 non-null  object 
 3   z_address      11330 non-null  object 
 4   bathrooms      11330 non-null  float64
 5   bedrooms       11330 non-null  float64
 6   finishedsqft   11330 non-null  float64
 7   lastsolddate   11330 non-null  object 
 8   lastsoldprice  11330 non-null  float64
 9   latitude       11330 non-null  float64
 10  longitude      11330 non-null  float64
 11  neighborhood   11330 non-null  object 
 12  totalrooms     11330 non-null  int64  
 13  usecode        11330 non-null  object 
 14  yearbuilt      11330 non-null  float64
 15  zestimate      11330 non-null  float64
 16  zindexvalue    11330 non-null  object 
 17  zipcode        11330 non-null  float64
 18  zpid  

In [8]:
data.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,bathrooms,bedrooms,finishedsqft,lastsoldprice,latitude,longitude,totalrooms,yearbuilt,zestimate,zipcode,zpid
count,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0,11330.0
mean,5664.5,9171.729214,1.980229,2.614475,1586.962556,1263928.0,37.75971,-122.436517,6.112268,1948.498147,1565695.0,94116.912004,36899730.0
std,3270.833609,4921.941074,1.047358,1.299457,923.45602,1042079.0,0.025578,0.030743,12.125303,37.911196,1229417.0,9.400877,78007410.0
min,0.0,2.0,0.5,0.0,1.0,535.0,37.70817,-122.510726,1.0,1860.0,432385.0,94102.0,15063290.0
25%,2832.25,5039.75,1.0,2.0,1020.0,729250.0,37.739286,-122.455157,4.0,1916.0,905237.5,94110.0,15108470.0
50%,5664.5,9198.5,2.0,2.0,1364.5,990000.0,37.760508,-122.43251,5.0,1940.0,1230758.0,94115.0,15156970.0
75%,8496.75,13374.75,2.0,3.0,1879.0,1450000.0,37.781386,-122.413367,7.0,1986.0,1731170.0,94123.0,59700400.0
max,11329.0,17632.0,14.0,20.0,27275.0,23889000.0,37.806083,-122.381201,1264.0,2016.0,15533250.0,94158.0,2146999000.0


In [None]:
data.columns

Index(['Unnamed: 0', 'address', 'z_address', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',
       'zestimate', 'zindexvalue', 'zipcode', 'zpid'],
      dtype='object')

In [9]:
cols = ['Unnamed: 0', 'Unnamed: 0.1', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsoldprice', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']
cols2 = ['Unnamed: 0', 'Unnamed: 0.1', 'bathrooms', 'bedrooms',
       'finishedsqft', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']

In [10]:
data.dropna().shape

(11330, 19)

In [11]:
get_score(data.dropna()[cols], data.dropna()['zestimate'])

0.8605128012690781

In [12]:
get_score(data.dropna()[cols2], data.dropna()['zestimate'])

0.6556945314107413

In [13]:
for_drop = ['Unnamed: 0', 'Unnamed: 0.1', 'address', 'zindexvalue', 'zpid']
target = ['zestimate']

In [15]:
data['neighborhood'].value_counts()

Mission             540
Bernal Heights      525
South of Market     524
South Beach         461
Pacific Heights     439
                   ... 
North Beach          31
Lakeside             29
Little Hollywood     12
North Waterfront      8
Daly City             3
Name: neighborhood, Length: 71, dtype: int64

In [16]:
for i, el in data[data['latitude'].isna()].iterrows():
  data.loc[i, 'longitude'] = data[data['neighborhood'] == el['neighborhood']]['longitude'].mean()
  data.loc[i, 'latitude'] = data[data['neighborhood'] == el['neighborhood']]['latitude'].mean()


In [17]:
data[data.totalrooms - (data.bedrooms + data.bathrooms) < 0]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
2,2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
4,4,11,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0
39,39,74,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5,SingleFamily,1932.0,1195679.0,1565000,94116.0,15122857.0
46,46,93,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10,MultiFamily2To4,1927.0,1524759.0,1447700,94118.0,15083824.0
60,60,121,Address: 2017 Buchanan Street,2017 Buchanan St,4.00,4.0,2850.0,02/05/2016,2800000.0,37.788913,-122.430728,Lower Pacific Heights,7,Condominium,1890.0,3032479.0,1178700,94115.0,21699068.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11255,11255,17466,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,Condominium,2003.0,1685676.0,1264000,94107.0,64972847.0
11257,11257,17470,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,SingleFamily,1900.0,1899916.0,1214400,94110.0,15161978.0
11292,11292,17533,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5,SingleFamily,1932.0,1228368.0,1551600,94116.0,15122857.0
11299,11299,17552,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10,MultiFamily2To4,1927.0,1760133.0,1419600,94118.0,15083824.0


In [18]:
get_score(data.dropna()[['finishedsqft']], data.dropna()['zestimate'])

0.5695437259604978

In [20]:
data['totalrooms'] = data['totalrooms'].apply(lambda x: round(x))

In [21]:
data['usecode'].value_counts(dropna=False) 

SingleFamily        5785
Condominium         4782
MultiFamily2To4      484
Duplex               146
Townhouse             66
other                 40
Miscellaneous         17
Cooperative            3
Apartment              3
Mobile                 2
MultiFamily5Plus       2
Name: usecode, dtype: int64

In [22]:
_ = get_value(data, 'usecode', mode='value', value='other')

In [23]:
data.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   Unnamed: 0.1   11330 non-null  int64  
 2   address        11330 non-null  object 
 3   z_address      11330 non-null  object 
 4   bathrooms      11330 non-null  float64
 5   bedrooms       11330 non-null  float64
 6   finishedsqft   11330 non-null  float64
 7   lastsolddate   11330 non-null  object 
 8   lastsoldprice  11330 non-null  float64
 9   latitude       11330 non-null  float64
 10  longitude      11330 non-null  float64
 11  neighborhood   11330 non-null  object 
 12  totalrooms     11330 non-null  int64  
 13  usecode        11330 non-null  object 
 14  yearbuilt      11330 non-null  float64
 15  zestimate      11330 non-null  float64
 16  zindexvalue    11330 non-null  object 
 17  zipcode        11330 non-null  float64
 18  zpid  

In [25]:
data.to_csv('new_data.csv')

 ## Вторая часть

In [26]:
data = pd.read_csv('new_data.csv')

In [27]:
def get_one_hot(X, cols):
    for each in cols:
        dummies = pd.get_dummies(X[each], prefix=each, drop_first=False)
        X = pd.concat([X, dummies], axis=1)
    return X

def get_woe_v1(X, col, target_col):
    all_one = X[col].value_counts() / X[col].value_counts()
    all_good = len(X[X[target_col] == 1][col])
    all_bad = len(X[X[target_col] == 0][col])
    odds_series = (
        (((all_one*X[df_train[target_col] == 1][col].value_counts()).fillna(0)+0.5)/all_good)
        /
        (((all_one*X[X[target_col] == 0][col].value_counts()).fillna(0)+0.5)/all_bad)
    )
    category_woe_dict = np.log(odds_series).to_dict()
    X[col + '_woe'] = X[col].apply(category_woe_dict.get)
    return X

def get_woe_cols(X, cols, target_col):
    for col in cols:
        X = get_woe_v1(X, col, target_col)
    return X    



In [28]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,0,0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,1,1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0
2,2,2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
3,3,3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10,SingleFamily,1927.0,2411236.0,1277600,94107.0,15149489.0
4,4,4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0


In [29]:
columns = [
    'bathrooms',
    'bedrooms',
    'finishedsqft',
    'latitude',
    'longitude',
    'totalrooms'
]

In [30]:
get_score(data[columns], data['zestimate'])

0.6462356869965434

In [31]:
data_processed = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [32]:
data_processed = get_one_hot(data_processed, cols=['usecode'])
del data_processed['usecode']
data_processed.head()

Unnamed: 0,Unnamed: 0.1.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,yearbuilt,zestimate,zindexvalue,zipcode,zpid,usecode_Apartment,usecode_Condominium,usecode_Cooperative,usecode_Duplex,usecode_Miscellaneous,usecode_Mobile,usecode_MultiFamily2To4,usecode_MultiFamily5Plus,usecode_SingleFamily,usecode_Townhouse,usecode_other
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,2007.0,1167508.0,975700,94103.0,83152781.0,0,1,0,0,0,0,0,0,0,0,0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,2004.0,823719.0,975700,94107.0,69819817.0,0,1,0,0,0,0,0,0,0,0,0
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,2003.0,1708594.0,1277600,94107.0,64972847.0,0,1,0,0,0,0,0,0,0,0,0
3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10,1927.0,2411236.0,1277600,94107.0,15149489.0,0,0,0,0,0,0,0,0,1,0,0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,1900.0,1918539.0,1248000,94110.0,15161978.0,0,0,0,0,0,0,0,0,1,0,0


In [33]:
data_processed.columns

Index(['Unnamed: 0.1.1', 'address', 'z_address', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'yearbuilt', 'zestimate',
       'zindexvalue', 'zipcode', 'zpid', 'usecode_Apartment',
       'usecode_Condominium', 'usecode_Cooperative', 'usecode_Duplex',
       'usecode_Miscellaneous', 'usecode_Mobile', 'usecode_MultiFamily2To4',
       'usecode_MultiFamily5Plus', 'usecode_SingleFamily', 'usecode_Townhouse',
       'usecode_other'],
      dtype='object')

In [34]:
exclude_columns = ['address', 'z_address', 'lastsoldprice', 'zestimate', 'lastsolddate', 'zindexvalue', 'zpid', 'neighborhood']

In [35]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.6756336241915848

In [36]:
get_score(get_one_hot(data_processed, cols=['neighborhood']).drop(columns=exclude_columns), data_processed['zestimate'])

0.7503237716340019

In [37]:
data_processed = get_one_hot(data_processed, cols=['neighborhood'])

In [38]:
data_processed.head()

Unnamed: 0,Unnamed: 0.1.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,yearbuilt,zestimate,zindexvalue,zipcode,zpid,usecode_Apartment,usecode_Condominium,usecode_Cooperative,usecode_Duplex,usecode_Miscellaneous,usecode_Mobile,usecode_MultiFamily2To4,usecode_MultiFamily5Plus,usecode_SingleFamily,usecode_Townhouse,usecode_other,neighborhood_Bayview,neighborhood_Bernal Heights,neighborhood_Buena Vista Park,neighborhood_Central Richmond,neighborhood_Central Sunset,neighborhood_Central Waterfront - Dogpatch,neighborhood_Corona Heights,neighborhood_Cow Hollow,neighborhood_Crocker Amazon,neighborhood_Daly City,neighborhood_Diamond Heights,neighborhood_Downtown,...,neighborhood_Lower Pacific Heights,neighborhood_Marina,neighborhood_Merced Heights,neighborhood_Midtown Terrace,neighborhood_Miraloma Park,neighborhood_Mission,neighborhood_Mission Terrace,neighborhood_Mount Davidson Manor,neighborhood_Nob Hill,neighborhood_Noe Valley,neighborhood_North Beach,neighborhood_North Panhandle,neighborhood_North Waterfront,neighborhood_Oceanview,neighborhood_Outer Mission,neighborhood_Outer Parkside,neighborhood_Outer Richmond,neighborhood_Outer Sunset,neighborhood_Pacific Heights,neighborhood_Parkside,neighborhood_Parnassus - Ashbury,neighborhood_Portola,neighborhood_Potrero Hill,neighborhood_Presidio Heights,neighborhood_Russian Hill,neighborhood_Sea Cliff,neighborhood_Silver Terrace,neighborhood_South Beach,neighborhood_South of Market,neighborhood_St. Francis Wood,neighborhood_Sunnyside,neighborhood_Telegraph Hill,neighborhood_Twin Peaks,neighborhood_Van Ness - Civic Center,neighborhood_Visitacion Valley,neighborhood_West Portal,neighborhood_Western Addition,neighborhood_Westwood Highlands,neighborhood_Westwood Park,neighborhood_Yerba Buena
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,2007.0,1167508.0,975700,94103.0,83152781.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,2004.0,823719.0,975700,94107.0,69819817.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,2003.0,1708594.0,1277600,94107.0,64972847.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10,1927.0,2411236.0,1277600,94107.0,15149489.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,1900.0,1918539.0,1248000,94110.0,15161978.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7503237716340019

In [40]:
data_processed['lastsoldmonth'] = [t.month for t in pd.to_datetime(data['lastsolddate'])]

In [41]:
data_processed['lastsolddate'] = pd.to_datetime(data_processed['lastsolddate'])

In [42]:
data_processed['lastsolddate'] = [t.timestamp() for t in data_processed['lastsolddate']]

In [43]:
exclude_columns = ['address',
 'z_address',
 'lastsoldprice',
 'zestimate',
 'zindexvalue',
 'zpid',
 'neighborhood']

In [44]:
data_processed['lastsolddate'] -= data_processed['lastsolddate'].mean()
data_processed['lastsolddate'] /= data_processed['lastsolddate'].std()

In [45]:
data_processed['lastsolddate']

0        1.230869
1        1.230869
2        1.230869
3        1.230869
4        1.230869
           ...   
11325    1.204338
11326    1.202443
11327    1.202443
11328    1.202443
11329    1.200548
Name: lastsolddate, Length: 11330, dtype: float64

In [46]:
data_processed['lastsoldmonth']

0        2
1        2
2        2
3        2
4        2
        ..
11325    2
11326    2
11327    2
11328    2
11329    2
Name: lastsoldmonth, Length: 11330, dtype: int64

In [47]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7498217414517561

In [48]:
for month in data_processed['lastsoldmonth'].unique():
    print(month)
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].count())
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].mean())
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].std())

2
817
1502253.364749082
1092435.3831177324
8
1165
1503064.9261802575
1109609.7229528266
6
605
1571185.1933884297
1164659.359867279
1
759
1493958.184453228
1132562.6507245207
3
863
1653870.4716106604
1366803.147227975
11
1088
1575785.7858455882
1229419.3377919737
12
988
1622167.2358299596
1380084.9981488846
7
1075
1534187.913488372
1276952.7007302358
10
1433
1631256.240055827
1293183.5300640482
9
983
1433455.92878942
939399.9710280234
5
682
1628287.4164222875
1251367.900759141
4
872
1634824.0699541285
1380372.8745794734


In [49]:
get_score(get_one_hot(data_processed, cols=['lastsoldmonth']).drop(columns=exclude_columns), data_processed['zestimate'])

0.749833691994559

In [50]:
get_score(get_one_hot(data_processed[['lastsoldmonth']], cols=['lastsoldmonth']), data_processed['zestimate'])

0.0032757813975679007

In [51]:
get_score(get_one_hot(data_processed, cols=['zipcode']).drop(columns=exclude_columns), data_processed['zestimate'])

0.7559962465551118

## Создадим свои районы

In [52]:
from sklearn.cluster import DBSCAN, KMeans

In [53]:
len(data_processed['zipcode'].unique())

25

In [54]:
kmeans = KMeans(n_clusters=25)
cluster = kmeans.fit_predict(data_processed[['latitude', 'longitude']])

In [55]:
data_processed['cluster'] = cluster

In [56]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7497802267671136