In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
 
from matplotlib import pyplot as plt
 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [30]:
data = pd.read_csv('data/final_data.csv', delimiter=';')
data.head(5)

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4.0,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3.0,Condominium,2004.0,823719.0,975700,94107.0,69819817.0
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10.0,SingleFamily,1927.0,2411236.0,1277600,94107.0,15149489.0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0


In [6]:
data.shape

(11330, 19)

Будем использовать линейную регрессию для тестирования качества данных

In [7]:
def get_score(X,y, random_seed=42, model=None):
  if model is None:
    model = LinearRegression()
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed )  
  model.fit(X_train, y_train)
  return model.score(X_test, y_test)

Для поиска выбросов будем использовать boxplot

In [8]:
def get_boxplot(X, columns=[]):
  for i in columns:
    sns.boxplot(x=X[i])
  pass  

def get_pairplot(X, columns=None):
  if columns is None:
    columns = list(X.columns)
  sns.pairplot(X[columns])
  pass    

Для заполнения пропущенных значений будем использовать стандартные способы
- заполнение значением
- max
- min
- mode
- median
- mean
- метод индикатора
- линейная регрессия

In [9]:
def get_value(X, column, mode='mean', value=0, columns_for_reg=None):
  if mode == 'value':
    X.loc[X[X[column].isna()].index ,column] = value
  elif mode == 'max':
    X.loc[X[X[column].isna()].index ,column] = X[column].max()
  elif mode == 'min':
    X.loc[X[X[column].isna()].index ,column] = X[column].min()
  elif mode == 'median':
    X.loc[X[X[column].isna()].index ,column] = X[column].median()    
  elif mode == 'mode':
    X.loc[X[X[column].isna()].index, column] = X[column].mode()[0]  
  elif mode == 'indicator':
    X.loc[X[X[column].isna()].index, column] = 0
    X['ind_'+str(column)] = 0
    X.loc[X[X[column].isna()].index, 'ind_'+str(column)] = 1
  elif mode == 'linreg':
    if columns_for_reg is None:
      cols = list(X.select_dtypes([np.number]).columns) 
      cols.remove(column)
    else:
      cols = columns_for_reg
    X_tmp = X.dropna()  
    m = LinearRegression().fit(X_tmp[cols], X_tmp[column])
    X.loc[X[X[column].isna()].index, column] = m.predict(X[X[column].isna()][cols])
  else:
    X.loc[X[X[column].isna()].index, column] = X[column].mean()
  return X  


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   address        11330 non-null  object 
 2   z_address      11330 non-null  object 
 3   bathrooms      11330 non-null  float64
 4   bedrooms       11330 non-null  float64
 5   finishedsqft   11330 non-null  float64
 6   lastsolddate   11330 non-null  object 
 7   lastsoldprice  11330 non-null  float64
 8   latitude       11315 non-null  float64
 9   longitude      11315 non-null  float64
 10  neighborhood   11330 non-null  object 
 11  totalrooms     11281 non-null  float64
 12  usecode        11290 non-null  object 
 13  yearbuilt      11330 non-null  float64
 14  zestimate      11330 non-null  float64
 15  zindexvalue    11330 non-null  object 
 16  zipcode        11330 non-null  float64
 17  zpid           11330 non-null  float64
dtypes: flo

In [15]:
data.describe()

Unnamed: 0.1,Unnamed: 0,bathrooms,bedrooms,finishedsqft,lastsoldprice,latitude,longitude,totalrooms,yearbuilt,zestimate,zipcode,zpid
count,11330.0,11330.0,11330.0,11211.0,11330.0,11315.0,11315.0,11281.0,11330.0,11330.0,11330.0,11330.0
mean,9171.729214,1.980229,2.614475,1586.081349,1263928.0,37.759715,-122.436508,6.113997,1948.498147,1565695.0,94116.912004,36899730.0
std,4921.941074,1.047358,1.299457,922.815803,1042079.0,0.025583,0.030742,12.151195,37.911196,1229417.0,9.400877,78007410.0
min,2.0,0.5,0.0,1.0,535.0,37.70817,-122.510726,1.0,1860.0,432385.0,94102.0,15063290.0
25%,5039.75,1.0,2.0,1019.0,729250.0,37.739284,-122.455149,4.0,1916.0,905237.5,94110.0,15108470.0
50%,9198.5,2.0,2.0,1362.0,990000.0,37.760514,-122.43251,5.0,1940.0,1230758.0,94115.0,15156970.0
75%,13374.75,2.0,3.0,1878.0,1450000.0,37.781386,-122.413353,7.0,1986.0,1731170.0,94123.0,59700400.0
max,17632.0,14.0,20.0,27275.0,23889000.0,37.806083,-122.381201,1264.0,2016.0,15533250.0,94158.0,2146999000.0


In [16]:
data.columns

Index(['Unnamed: 0', 'address', 'z_address', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',
       'zestimate', 'zindexvalue', 'zipcode', 'zpid'],
      dtype='object')

In [17]:
cols = ['Unnamed: 0', 'bathrooms', 'bedrooms',
       'finishedsqft', 'lastsoldprice', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']
cols2 = ['Unnamed: 0', 'bathrooms', 'bedrooms',
       'finishedsqft', 'latitude',
       'longitude', 'totalrooms','yearbuilt','zipcode']

In [18]:
data.dropna().shape

(11108, 18)

In [None]:
-             = 0.8375491685999106, 0.607156739569225
finishedsqft  = 0.8524615199933394, 0.5577844171230759
long and lat  = 0.8517111453083177, 0.6874289038247563
totalrooms    = 0.8598254992362179, 0.6426676296618906

In [None]:
get_score(data.dropna()[cols], data.dropna()['zestimate'])

0.8598254992362179

In [None]:
get_score(data.dropna()[cols2], data.dropna()['zestimate'])

0.6426676296618906

In [None]:
for_drop = ['Unnamed: 0', 'address', 'zindexvalue', 'zpid']
target = ['zestimate']

In [31]:

_ = get_value(data, 'finishedsqft', mode='linreg', value=0, columns_for_reg=['bathrooms', 'bedrooms', 'lastsoldprice','totalrooms'])

In [None]:
data['neighborhood'].value_counts()

Mission             540
Bernal Heights      525
South of Market     524
South Beach         461
Pacific Heights     439
                   ... 
North Beach          31
Lakeside             29
Little Hollywood     12
North Waterfront      8
Daly City             3
Name: neighborhood, Length: 71, dtype: int64

In [32]:
for i, el in data[data['latitude'].isna()].iterrows():
  data.loc[i, 'longitude'] = data[data['neighborhood'] == el['neighborhood']]['longitude'].mean()
  data.loc[i, 'latitude'] = data[data['neighborhood'] == el['neighborhood']]['latitude'].mean()


In [22]:
data[data.totalrooms - (data.bedrooms + data.bathrooms) < 0]

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
4,11,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0
39,74,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5.0,SingleFamily,1932.0,1195679.0,1565000,94116.0,15122857.0
46,93,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10.0,MultiFamily2To4,1927.0,1524759.0,1447700,94118.0,15083824.0
60,121,Address: 2017 Buchanan Street,2017 Buchanan St,4.00,4.0,2850.0,02/05/2016,2800000.0,37.788913,-122.430728,Lower Pacific Heights,7.0,Condominium,1890.0,3032479.0,1178700,94115.0,21699068.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11255,17466,Address: 560 Missouri Street #B,560 Missouri St # B,4.00,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6.0,Condominium,2003.0,1685676.0,1264000,94107.0,64972847.0
11257,17470,Address: 3658 Folsom Street,3658 Folsom St,3.00,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4.0,SingleFamily,1900.0,1899916.0,1214400,94110.0,15161978.0
11292,17533,Address: 2455 14th Avenue,2455 14th Ave,3.75,4.0,1846.0,02/10/2016,1525000.0,37.742271,-122.470518,West Portal,5.0,SingleFamily,1932.0,1228368.0,1551600,94116.0,15122857.0
11299,17552,Address: 784 Spruce Street,784-786 Spruce St,3.00,8.0,2600.0,02/09/2016,1535000.0,37.780606,-122.452663,Lone Mountain,10.0,MultiFamily2To4,1927.0,1760133.0,1419600,94118.0,15083824.0


In [28]:
get_score(data.dropna()[['finishedsqft']], data.dropna()['zestimate'])

0.5788785566874417

In [33]:

_ = get_value(data, 'totalrooms', mode='linreg', value=0, columns_for_reg=['bathrooms', 'bedrooms', 'lastsoldprice','finishedsqft'])

In [34]:
data['totalrooms'] = data['totalrooms'].apply(lambda x: round(x))

In [35]:
data['usecode'].value_counts(dropna=False) 

SingleFamily        5785
Condominium         4782
MultiFamily2To4      484
Duplex               146
Townhouse             66
NaN                   40
Miscellaneous         17
Apartment              3
Cooperative            3
Mobile                 2
MultiFamily5Plus       2
Name: usecode, dtype: int64

In [36]:
_ = get_value(data, 'usecode', mode='value', value='other')

In [37]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11330 entries, 0 to 11329
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     11330 non-null  int64  
 1   Unnamed: 0.1   11330 non-null  int64  
 2   address        11330 non-null  object 
 3   z_address      11330 non-null  object 
 4   bathrooms      11330 non-null  float64
 5   bedrooms       11330 non-null  float64
 6   finishedsqft   11330 non-null  float64
 7   lastsolddate   11330 non-null  object 
 8   lastsoldprice  11330 non-null  float64
 9   latitude       11330 non-null  float64
 10  longitude      11330 non-null  float64
 11  neighborhood   11330 non-null  object 
 12  totalrooms     11330 non-null  int64  
 13  usecode        11330 non-null  object 
 14  yearbuilt      11330 non-null  float64
 15  zestimate      11330 non-null  float64
 16  zindexvalue    11330 non-null  object 
 17  zipcode        11330 non-null  float64
 18  zpid  

In [39]:
data.to_csv('new_data.csv')

 ## Вторая часть

In [10]:
data = pd.read_csv('new_data.csv')

In [11]:
def get_one_hot(X, cols):
    for each in cols:
        dummies = pd.get_dummies(X[each], prefix=each, drop_first=False)
        X = pd.concat([X, dummies], axis=1)
    return X

def get_woe_v1(X, col, target_col):
    all_one = X[col].value_counts() / X[col].value_counts()
    all_good = len(X[X[target_col] == 1][col])
    all_bad = len(X[X[target_col] == 0][col])
    odds_series = (
        (((all_one*X[df_train[target_col] == 1][col].value_counts()).fillna(0)+0.5)/all_good)
        /
        (((all_one*X[X[target_col] == 0][col].value_counts()).fillna(0)+0.5)/all_bad)
    )
    category_woe_dict = np.log(odds_series).to_dict()
    X[col + '_woe'] = X[col].apply(category_woe_dict.get)
    return X

def get_woe_cols(X, cols, target_col):
    for col in cols:
        X = get_woe_v1(X, col, target_col)
    return X    



In [13]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,totalrooms,usecode,yearbuilt,zestimate,zindexvalue,zipcode,zpid
0,0,2,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,4,Condominium,2007.0,1167508.0,975700,94103.0,83152781.0
1,1,5,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,3,Condominium,2004.0,823719.0,975700,94107.0,69819817.0
2,2,7,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,6,Condominium,2003.0,1708594.0,1277600,94107.0,64972847.0
3,3,9,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,10,SingleFamily,1927.0,2411236.0,1277600,94107.0,15149489.0
4,4,11,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,4,SingleFamily,1900.0,1918539.0,1248000,94110.0,15161978.0


In [22]:
columns = [
    'bathrooms',
    'bedrooms',
    'finishedsqft',
    'latitude',
    'longitude',
    'totalrooms'
]

In [23]:
get_score(data[columns], data['zestimate'])

0.6462356869965435

In [41]:
data_processed = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [42]:
data_processed = get_one_hot(data_processed, cols=['usecode'])
del data_processed['usecode']
data_processed.head()

Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,...,usecode_Condominium,usecode_Cooperative,usecode_Duplex,usecode_Miscellaneous,usecode_Mobile,usecode_MultiFamily2To4,usecode_MultiFamily5Plus,usecode_SingleFamily,usecode_Townhouse,usecode_other
0,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,...,1,0,0,0,0,0,0,0,0,0
1,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,...,1,0,0,0,0,0,0,0,0,0
2,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,...,1,0,0,0,0,0,0,0,0,0
3,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,...,0,0,0,0,0,0,0,1,0,0
4,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,...,0,0,0,0,0,0,0,1,0,0


In [29]:
data_processed.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'address', 'z_address', 'bathrooms',
       'bedrooms', 'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',
       'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',
       'zestimate', 'zindexvalue', 'zipcode', 'zpid', 'usecode_Apartment',
       'usecode_Condominium', 'usecode_Cooperative', 'usecode_Duplex',
       'usecode_Miscellaneous', 'usecode_Mobile', 'usecode_MultiFamily2To4',
       'usecode_MultiFamily5Plus', 'usecode_SingleFamily', 'usecode_Townhouse',
       'usecode_other'],
      dtype='object')

In [43]:
exclude_columns = ['address', 'z_address', 'lastsoldprice', 'zestimate', 'lastsolddate', 'zindexvalue', 'zpid', 'neighborhood']

In [44]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.6751665144897492

In [46]:
get_score(get_one_hot(data_processed, cols=['neighborhood']).drop(columns=exclude_columns), data_processed['zestimate'])

0.7500171307003617

In [47]:
data_processed = get_one_hot(data_processed, cols=['neighborhood'])

In [48]:
data_processed.head()

Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,...,neighborhood_Sunnyside,neighborhood_Telegraph Hill,neighborhood_Twin Peaks,neighborhood_Van Ness - Civic Center,neighborhood_Visitacion Valley,neighborhood_West Portal,neighborhood_Western Addition,neighborhood_Westwood Highlands,neighborhood_Westwood Park,neighborhood_Yerba Buena
0,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,02/17/2016,1300000.0,37.778705,-122.412635,South of Market,...,0,0,0,0,0,0,0,0,0,0
1,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,02/17/2016,750000.0,37.777641,-122.393417,South of Market,...,0,0,0,0,0,0,0,0,0,0
2,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,02/17/2016,1495000.0,37.759198,-122.396516,Potrero Hill,...,0,0,0,0,0,0,0,0,0,0
3,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,02/17/2016,2700000.0,37.761886,-122.396769,Potrero Hill,...,0,0,0,0,0,0,0,0,0,0
4,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,02/17/2016,1530000.0,37.740795,-122.413453,Bernal Heights,...,0,0,0,0,0,0,0,0,0,0


In [49]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7500171307003617

In [78]:
data_processed['lastsoldmonth'] = [t.month for t in pd.to_datetime(data['lastsolddate'])]

In [53]:
data_processed['lastsolddate'] = pd.to_datetime(data_processed['lastsolddate'])

In [63]:
data_processed['lastsolddate'] = [t.timestamp() for t in data_processed['lastsolddate']]

In [66]:
exclude_columns = ['address',
 'z_address',
 'lastsoldprice',
 'zestimate',
 'zindexvalue',
 'zpid',
 'neighborhood']

In [68]:
data_processed['lastsolddate'] -= data_processed['lastsolddate'].mean()
data_processed['lastsolddate'] /= data_processed['lastsolddate'].std()

In [69]:
data_processed['lastsolddate']

0        1.230869
1        1.230869
2        1.230869
3        1.230869
4        1.230869
           ...   
11325    1.204338
11326    1.202443
11327    1.202443
11328    1.202443
11329    1.200548
Name: lastsolddate, Length: 11330, dtype: float64

In [82]:
data_processed['lastsoldmonth']

0        2
1        2
2        2
3        2
4        2
        ..
11325    2
11326    2
11327    2
11328    2
11329    2
Name: lastsoldmonth, Length: 11330, dtype: int64

In [85]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7500105605697285

In [98]:
for month in data_processed['lastsoldmonth'].unique():
    print(month)
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].count())
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].mean())
    print(data_processed[data_processed['lastsoldmonth'] == month]['zestimate'].std())

2
817
1502253.364749082
1092435.3831177326
8
1165
1503064.9261802575
1109609.7229528264
6
605
1571185.1933884297
1164659.3598672794
1
759
1493958.184453228
1132562.650724521
3
863
1653870.4716106604
1366803.1472279753
11
1088
1575785.7858455882
1229419.3377919744
12
988
1622167.2358299596
1380084.9981488846
7
1075
1534187.913488372
1276952.7007302365
10
1433
1631256.240055827
1293183.5300640475
9
983
1433455.92878942
939399.9710280234
5
682
1628287.4164222875
1251367.9007591403
4
872
1634824.0699541285
1380372.8745794736


In [93]:
get_score(get_one_hot(data_processed, cols=['lastsoldmonth']).drop(columns=exclude_columns), data_processed['zestimate'])

0.749979955183464

In [97]:
get_score(get_one_hot(data_processed[['lastsoldmonth']], cols=['lastsoldmonth']), data_processed['zestimate'])

0.0032757813975679007

In [101]:
get_score(get_one_hot(data_processed, cols=['zipcode']).drop(columns=exclude_columns), data_processed['zestimate'])

0.7561274146651628

## Создадим свои районы

In [135]:
from sklearn.cluster import DBSCAN, KMeans

In [136]:
len(data_processed['zipcode'].unique())

25

In [137]:
kmeans = KMeans(n_clusters=25)
cluster = kmeans.fit_predict(data_processed[['latitude', 'longitude']])

In [138]:
data_processed['cluster'] = cluster

## Теперь попробуем применить полицейские данные

In [104]:
police_data = pd.read_csv('Police_Department_Incident_Reports__2018_to_Present.csv')

In [105]:
police_data.columns

Index(['Incident Datetime', 'Incident Date', 'Incident Time', 'Incident Year',
       'Incident Day of Week', 'Report Datetime', 'Row ID', 'Incident ID',
       'Incident Number', 'CAD Number', 'Report Type Code',
       'Report Type Description', 'Filed Online', 'Incident Code',
       'Incident Category', 'Incident Subcategory', 'Incident Description',
       'Resolution', 'Intersection', 'CNN', 'Police District',
       'Analysis Neighborhood', 'Supervisor District', 'Latitude', 'Longitude',
       'point', 'SF Find Neighborhoods', 'Current Police Districts',
       'Current Supervisor Districts', 'Analysis Neighborhoods',
       'HSOC Zones as of 2018-06-05', 'OWED Public Spaces',
       'Central Market/Tenderloin Boundary Polygon - Updated',
       'Parks Alliance CPSI (27+TL sites)', 'ESNCAG - Boundary File',
       'Areas of Vulnerability, 2016'],
      dtype='object')

In [106]:
police_data['Police District'].unique()

array(['Taraval', 'Mission', 'Tenderloin', 'Richmond', 'Central',
       'Out of SF', 'Southern', 'Park', 'Bayview', 'Northern',
       'Ingleside'], dtype=object)

In [108]:
police_data['Incident Subcategory'].unique()

array(['Missing Person', 'Stolen Property', 'Other', 'Lost Property',
       'Miscellaneous Investigation', 'Family Offenses',
       'Larceny Theft - Other', 'Non-Criminal', 'Larceny - From Vehicle',
       'Larceny - Auto Parts', 'Warrant', 'Simple Assault', 'Trespass',
       'Theft From Vehicle', 'Fraud', 'Burglary - Residential',
       'Traffic Violation Arrest', 'Burglary - Hot Prowl', 'Loitering',
       'Burglary - Other', 'Weapons Offense', 'Vandalism',
       'Aggravated Assault', 'Motor Vehicle Theft', 'Drug Violation',
       'Larceny Theft - Shoplifting', 'Burglary - Commercial',
       'Recovered Vehicle', 'Robbery - Other', 'Suspicious Occ',
       'Robbery - Carjacking', 'Intimidation', 'Missing Adult',
       'Larceny Theft - From Building', 'Drunkenness', 'Embezzlement',
       'Courtesy Report', 'Motor Vehicle Theft (Attempted)',
       'Child Abuse', 'Robbery - Commercial', 'Other Offenses',
       'Bad Checks', 'Traffic Collision - Hit & Run',
       'Larceny Thef

In [142]:
police_data = police_data.dropna(subset=['Latitude', 'Longitude'])

In [143]:
police_data['cluster'] = kmeans.predict(police_data[['Latitude', 'Longitude']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  police_data['cluster'] = kmeans.predict(police_data[['Latitude', 'Longitude']])


In [144]:
police_data['cluster']

0          5
1          8
2         11
4         16
5          2
          ..
329748     2
329749    12
329750    11
329751    12
329753     2
Name: cluster, Length: 312074, dtype: int32

In [152]:
pivot = get_one_hot(police_data[['Incident Subcategory', 'cluster']], cols=['Incident Subcategory']).pivot_table(columns=['cluster'])

In [153]:
pivot

cluster,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Incident Subcategory_Aggravated Assault,0.045076,0.010232,0.015053,0.015161,0.011163,0.020569,0.023950,0.020383,0.033208,0.036150,...,0.012733,0.013477,0.017912,0.017912,0.013427,0.011065,0.029839,0.008121,0.030846,0.031460
Incident Subcategory_Arrest,0.000150,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000177
Incident Subcategory_Arson,0.005175,0.001659,0.002020,0.001895,0.002402,0.001736,0.003651,0.001973,0.002678,0.003356,...,0.001469,0.001198,0.002382,0.001959,0.001399,0.002438,0.004122,0.001680,0.001902,0.003358
Incident Subcategory_Bad Checks,0.000075,0.000000,0.000111,0.000000,0.000424,0.000000,0.000124,0.000000,0.000214,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000375,0.000000,0.000000,0.000041,0.000177
Incident Subcategory_Bribery,0.000000,0.000000,0.000022,0.000000,0.000000,0.000000,0.000000,0.000047,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000140,0.000083,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Incident Subcategory_Vandalism,0.073127,0.066925,0.059214,0.057170,0.061749,0.066782,0.071477,0.057533,0.074022,0.069860,...,0.050930,0.059599,0.056296,0.061013,0.067133,0.070705,0.078720,0.052506,0.052471,0.063980
Incident Subcategory_Vehicle Impounded,0.003750,0.000000,0.000155,0.000948,0.002402,0.000267,0.000495,0.000470,0.001071,0.001068,...,0.000000,0.001198,0.000265,0.000280,0.000000,0.000938,0.001178,0.000700,0.000165,0.006009
Incident Subcategory_Vehicle Misplaced,0.000300,0.000553,0.000400,0.000632,0.000707,0.000668,0.000681,0.000329,0.000321,0.000000,...,0.000490,0.001198,0.000706,0.000560,0.000839,0.000938,0.000000,0.000280,0.000703,0.000353
Incident Subcategory_Warrant,0.017775,0.010509,0.015364,0.008528,0.010456,0.009617,0.017885,0.013996,0.015533,0.011135,...,0.008080,0.006888,0.018001,0.009796,0.011748,0.008065,0.014134,0.005881,0.026256,0.014493


In [155]:
for intype, row in pivot.iterrows():
    data_processed[intype] = [row[cluster] for cluster in data_processed['cluster']]

In [156]:
data_processed.head()

Unnamed: 0,address,z_address,bathrooms,bedrooms,finishedsqft,lastsolddate,lastsoldprice,latitude,longitude,neighborhood,...,Incident Subcategory_Theft From Vehicle,Incident Subcategory_Traffic Collision,Incident Subcategory_Traffic Collision - Hit & Run,Incident Subcategory_Traffic Violation Arrest,Incident Subcategory_Trespass,Incident Subcategory_Vandalism,Incident Subcategory_Vehicle Impounded,Incident Subcategory_Vehicle Misplaced,Incident Subcategory_Warrant,Incident Subcategory_Weapons Offense
0,Address: 1160 Mission Street #2007,1160 Mission St UNIT 2007,2.0,2.0,1043.0,1.230869,1300000.0,37.778705,-122.412635,South of Market,...,0.006673,0.000638,0.000793,0.015803,0.008085,0.048241,0.00029,0.000464,0.030175,0.012708
1,Address: 260 King Street #475,260 King St UNIT 475,1.0,1.0,903.0,1.230869,750000.0,37.777641,-122.393417,South of Market,...,0.005765,0.000641,0.000838,0.010594,0.013846,0.054474,0.000567,0.000345,0.018281,0.008722
2,Address: 560 Missouri Street #B,560 Missouri St # B,4.0,3.0,1425.0,1.230869,1495000.0,37.759198,-122.396516,Potrero Hill,...,0.02166,0.001238,0.001052,0.013305,0.014543,0.071477,0.000495,0.000681,0.017885,0.012748
3,Address: 350 Missouri Street,350 Missouri St,3.0,3.0,2231.0,1.230869,2700000.0,37.761886,-122.396769,Potrero Hill,...,0.02166,0.001238,0.001052,0.013305,0.014543,0.071477,0.000495,0.000681,0.017885,0.012748
4,Address: 3658 Folsom Street,3658 Folsom St,3.0,3.0,1300.0,1.230869,1530000.0,37.740795,-122.413453,Bernal Heights,...,0.023032,0.0015,0.002464,0.019711,0.009748,0.074022,0.001071,0.000321,0.015533,0.014783


In [157]:
get_score(data_processed.drop(columns=exclude_columns), data_processed['zestimate'])

0.7527214336855328