In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the full labelled dataset; it is split into train/validation parts below.
data = pd.read_csv('train.csv')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [7]:
# Peek at the first rows of the validation split (before any preparation).
valid.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
6252,3702,21,2.0,48.812195,34.679779,5.0,5,5.0,1963,0.034331,B,B,25,5487,3,,0,4,B,181530.459031
4684,12848,61,3.0,81.103039,49.310278,8.0,4,4.0,1960,0.298205,B,B,16,4048,3,,1,3,B,260456.004692
1731,2239,6,3.0,82.882978,3.97765,1.0,8,17.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,219945.30464
4742,15611,23,1.0,33.863324,29.993297,0.0,5,4.0,1977,0.034656,B,B,0,168,0,,0,0,B,66883.280318
4521,5634,52,1.0,43.095135,,1.0,10,17.0,1977,0.371149,B,B,34,7065,1,750.0,2,5,B,114086.065201


In [8]:
# Price statistics used later as fallback-chained features. They are computed
# from the training split only, so validation/test rows never contribute.
mean_price = train['Price'].mean()

# clean() caps Rooms at 5 before these tables are merged, so the grouping keys
# must live in the same capped space. Grouping on raw Rooms would leave the
# prices of 6+ room flats out of the Rooms == 5 bucket they are later matched to.
# The cap is applied on a temporary copy; `train` itself is not modified here.
_train_rooms_capped = train.assign(Rooms=train['Rooms'].clip(upper=5))

# Mean price per room count.
mean_price2 = (
    _train_rooms_capped
    .groupby(['Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_r'})
)
# Mean price per (district, room count) pair.
mean_price1 = (
    _train_rooms_capped
    .groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_dr'})
)

In [9]:
def clean(df, max_rooms=5, min_square=15, max_house_year=2019):
    """Return a copy of ``df`` with outlier values clamped to sane bounds.

    The input frame is never modified. Working on a copy also avoids pandas'
    SettingWithCopyWarning when ``df`` is a slice of another frame (for
    example one half of a train/validation split).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Rooms``, ``Square`` and ``HouseYear``.
    max_rooms : number, default 5
        ``Rooms`` values above this are replaced by it.
    min_square : number, default 15
        ``Square`` values below this are replaced by it.
    max_house_year : number, default 2019
        ``HouseYear`` values above this are replaced by it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    cleaned = df.copy()
    # Comparisons against NaN are False, so missing values are left untouched.
    cleaned.loc[cleaned['Rooms'] > max_rooms, 'Rooms'] = max_rooms
    cleaned.loc[cleaned['Square'] < min_square, 'Square'] = min_square
    cleaned.loc[cleaned['HouseYear'] > max_house_year, 'HouseYear'] = max_house_year
    return cleaned

In [10]:
def add_mean_price(df, mean_price1=mean_price1, mean_price2=mean_price2, mean_price=mean_price):
    """Attach mean-price features to ``df`` with a fallback chain.

    Two columns are added:

    * ``mean_price_r``  - looked up in ``mean_price2`` by ``Rooms``; rows with
      no match fall back to the scalar ``mean_price``.
    * ``mean_price_dr`` - looked up in ``mean_price1`` by
      ``(DistrictId, Rooms)``; rows with no match fall back to that row's
      ``mean_price_r``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DistrictId`` and ``Rooms``.
    mean_price1 : pandas.DataFrame
        Lookup table expected to carry ``DistrictId``, ``Rooms`` and
        ``mean_price_dr`` columns.
    mean_price2 : pandas.DataFrame
        Lookup table expected to carry ``Rooms`` and ``mean_price_r`` columns.
    mean_price : float
        Last-resort fill value.

    Returns
    -------
    pandas.DataFrame
        A new frame. The merges join on columns, so the result carries a
        fresh integer index rather than the row labels of ``df``.
    """
    # Drop feature columns left over from an earlier call. Without this, a
    # second call would make merge emit suffixed duplicates
    # (mean_price_r_x / mean_price_r_y) and the lookups below would raise KeyError.
    df = df.drop(columns=['mean_price_dr', 'mean_price_r'], errors='ignore')

    df = pd.merge(df, mean_price1, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, mean_price2, on=['Rooms'], how='left')

    # Fill order matters: mean_price_r must be complete before it is used as
    # the fallback for mean_price_dr.
    df['mean_price_r'] = df['mean_price_r'].fillna(mean_price)
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    return df

In [11]:
def prepare_data(df, mean_price1=mean_price1, mean_price2=mean_price2, mean_price=mean_price):
    """Run the full preparation pipeline: clamp outliers, then add price features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows; must contain the columns required by ``clean`` and
        ``add_mean_price``.
    mean_price1, mean_price2, mean_price
        Price statistics passed through to ``add_mean_price``. The defaults
        are the module-level values that existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The prepared frame returned by ``add_mean_price``.
    """
    df = clean(df)
    # Forward the statistics explicitly. Previously these arguments were
    # accepted but ignored, so a caller's override had no effect.
    df = add_mean_price(
        df,
        mean_price1=mean_price1,
        mean_price2=mean_price2,
        mean_price=mean_price,
    )
    return df

In [12]:
# Apply cleaning and the mean-price features to the training split.
train = prepare_data(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [13]:
# Same preparation for the validation split; the price statistics are the
# train-derived defaults bound in prepare_data's signature.
valid = prepare_data(valid)

In [14]:
from sklearn.ensemble import RandomForestRegressor as RF

In [15]:
# Columns fed to the model.
# NOTE(review): the engineered 'mean_price_dr' / 'mean_price_r' columns added by
# add_mean_price are not listed here, so the model never sees them - confirm
# whether that is intentional.
feats = ['Rooms', 'Square', 'HouseYear', 'Social_1', 'Ecology_1']

In [16]:
# Random forest regressor: 100 trees, depth capped at 10, seeded for repeatability.
model = RF(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

In [17]:
# Fit on the prepared training rows, using only the selected feature columns.
model.fit(train[feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [18]:
# In-sample predictions, used below for the training R^2.
pred_train = model.predict(train[feats])

In [19]:
# Predictions for the held-out validation rows.
pred_valid = model.predict(valid[feats])

In [20]:
# Display the training-set predictions.
pred_train

array([122371.3785937 , 221828.07819905, 182281.80837876, ...,
       237253.19421065, 217425.48404253, 359500.30842544])

In [21]:
# Display the validation-set predictions.
pred_valid

array([207128.98141774, 368907.32576551, 216447.42386005, ...,
       225049.21554356, 110021.51009548, 267648.85700004])

In [22]:
from sklearn.metrics import r2_score as r2

In [23]:
# R^2 on the validation split (true prices first, predictions second).
r2(y_true=valid['Price'], y_pred=pred_valid)

0.6562161240341158

In [24]:
# R^2 on the training split, for comparison with the validation score.
r2(y_true=train['Price'], y_pred=pred_train)

0.791682005462138

In [38]:
# Load the test set for which predictions are written out below.
test = pd.read_csv('test.csv')

In [40]:
# Run the test rows through the same preparation as train/valid; the price
# statistics are the train-derived defaults bound in prepare_data's signature.
test = prepare_data(test)

In [41]:
# Store the model's predictions in a new 'Price' column of the test frame.
test['Price'] = model.predict(test[feats])

In [44]:
# Write the Id/Price pairs to CSV. index=False is the documented way to omit
# the row index; the previous index=None only worked because None is falsy.
test.loc[:, ['Id', 'Price']].to_csv('TKisteneva_predictions.csv', index=False)