In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso, Ridge

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
pd.options.display.max_columns = None

In [6]:
data = pd.read_csv('input/train.csv')

### train_test_split 

In [7]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [8]:
train.shape, valid.shape

((7000, 20), (3000, 20))

### prepare data

In [9]:
def prepare_rooms(df, source_df):
    df.loc[df['Rooms'] > 5, 'Rooms'] = source_df['Rooms'].median()
    return df

In [10]:
train = prepare_rooms(train, train)

In [11]:
stats1 = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price'})
stats1

Unnamed: 0,DistrictId,Rooms,mean_price
0,0,1.0,145789.889600
1,0,2.0,198451.332808
2,0,3.0,291670.749162
3,1,1.0,147116.367405
4,1,2.0,198151.757027
5,1,3.0,247145.388423
6,1,4.0,277290.705058
7,2,1.0,184560.609064
8,2,2.0,247244.824542
9,2,3.0,338384.676484


In [12]:
def join_stats1(df, stats1, source_df, mode='train'):
    df = pd.merge(df, stats1, on=['DistrictId', 'Rooms'], how='left')
    if mode == 'test':
        df['mean_price'] = df['mean_price'].fillna(source_df['mean_price'].mean())
    return df

In [13]:
def fillna_life_square(df, source_df):
    df['LifeSquare'] = df['LifeSquare'].fillna(source_df['LifeSquare'].mean())
    return df

In [14]:
train = join_stats1(train, stats1, train)
train = fillna_life_square(train, train)

In [15]:
valid = prepare_rooms(valid, train)
valid = join_stats1(valid, stats1, train, mode='test')
valid = fillna_life_square(valid, train)

### Model

In [16]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RF

In [17]:
model = RF(n_estimators=100, max_depth=11, min_samples_leaf=2)

In [18]:
feats = ['Square', 'Rooms', 'mean_price', 'LifeSquare']

In [19]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [20]:
pred_train = model.predict(train.loc[:, feats])

In [21]:
pred_valid = model.predict(valid.loc[:, feats])

In [22]:
pred_train.shape, pred_valid.shape

((7000,), (3000,))

### Evaluation

In [23]:
from sklearn.metrics import r2_score as r2, mean_squared_error as mse

In [24]:
r2(train['Price'], pred_train), mse(train['Price'], pred_train)

(0.8740193194781968, 1087690066.4639924)

In [25]:
r2(valid['Price'], pred_valid), mse(valid['Price'], pred_valid)

(0.625580661106611, 3220663420.5951986)

### Test

In [26]:
test = pd.read_csv('input/test.csv')

In [27]:
test.shape

(5000, 19)

In [28]:
test = prepare_rooms(test, train)
test = join_stats1(test, stats1, train, mode='test')
test = fillna_life_square(test, train)

In [29]:
test['Price'] = model.predict(test.loc[:, feats])

In [30]:
test['Price'].describe()

count      5000.000000
mean     212944.778738
std       76574.545859
min       67691.188704
25%      165380.337413
50%      196862.146104
75%      244277.113170
max      573199.129562
Name: Price, dtype: float64

In [31]:
test.loc[:, ['Id', 'Price']].to_csv('AAnonymous_predictions.csv', index=None)