In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
df = pd.read_csv("DATA/train.csv")

In [34]:
df.columns

Index(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq',
       ...
       'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
       'cafe_count_5000_price_high', 'big_church_count_5000',
       'church_count_5000', 'mosque_count_5000', 'leisure_count_5000',
       'sport_count_5000', 'market_count_5000', 'price_doc'],
      dtype='object', length=292)

In [35]:
len(df.columns)

292

In [36]:
len(df)

30471

In [37]:
# removing outliers and filling null
def rem_out(df, col):
    avg = np.median(df[col][df[col].isnull() == False])
    df[col] = df[col].fillna(avg)
    ulimit = np.percentile(df[col], 95)
    llimit = np.percentile(df[col], 5)
    
    mask = df[col] > ulimit
    df.loc[mask, col]  = ulimit

    mask = df[col] < llimit
    df.loc[mask, col]  = llimit

    return df

df = rem_out(df, "price_doc")
df = rem_out(df, "full_sq")
df = rem_out(df, "metro_km_walk")
df = rem_out(df, "kremlin_km")
df = rem_out(df, "park_km")
df = rem_out(df, "kitch_sq")
df = rem_out(df, "public_healthcare_km")
df = rem_out(df, "school_km")

In [38]:
df = df.join(pd.get_dummies(df["incineration_raion"], prefix = "is_inc"))

In [39]:
#features = ["full_sq", "metro_km_walk", "kremlin_km", "park_km", "sport_count_5000", 
#            "market_count_5000", "kitch_sq", "public_healthcare_km", "school_km", "is_inc_no", "is_inc_yes"]

features = ["full_sq", "metro_km_walk", "kremlin_km", "park_km", "sport_count_5000", 
            "market_count_5000", "kitch_sq", "public_healthcare_km", "school_km",
           "trc_sqm_5000", "prom_part_2000", "green_part_2000", "green_part_5000", "exhibition_km", 
            "workplaces_km", "detention_facility_km", "swim_pool_km", "hospice_morgue_km", "university_km",
           "shopping_centers_km", "preschool_km", "cemetery_km", "hospital_beds_raion", "healthcare_centers_raion",
           "oil_chemistry_km", ]

#features = ["full_sq", "metro_km_walk"]
#features = ["full_sq"]

In [40]:
for f in features:
    df = rem_out(df, f)

In [41]:
X_train = df[features]

In [42]:
Y_train = df["price_doc"]

In [43]:
df_test = pd.read_csv("DATA/test.csv")
df_test = df_test.join(pd.get_dummies(df_test["incineration_raion"], prefix = "is_inc"))

In [44]:
for f in features:
    df_test = rem_out(df_test, f)

#df_test = rem_out(df_test, "price_doc")
df_test = rem_out(df_test, "full_sq")
df_test = rem_out(df_test, "metro_km_walk")
df_test = rem_out(df_test, "kremlin_km")
df_test = rem_out(df_test, "park_km")
df_test = rem_out(df_test, "kitch_sq")
df_test = rem_out(df_test, "public_healthcare_km")
df_test = rem_out(df_test, "school_km")

In [45]:
X_test = df_test[features]

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, y_test = train_test_split(X_train, Y_train, test_size=0.15, random_state=42)

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
X_train = minmax.fit_transform(X_train)
X_test = minmax.transform(X_test)

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

#model = LinearRegression(normalize=False)
#model = RandomForestRegressor()
model = GradientBoostingRegressor(max_depth=7)
model.fit(X_train, Y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [48]:
Y_test = model.predict(X_test)

In [49]:
len(X_test)

4571

In [50]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, Y_test)

1408501.0156787517

In [19]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, Y_test)

1407556.0104772036

In [118]:
from sklearn.metrics import mean_squared_error
mean_absolute_error(y_test, Y_test)

1474679.6027736373

In [354]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, Y_test)

7828267725115.5156

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, Y_test)

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, Y_test)

In [144]:
out = pd.DataFrame()
out['id'] = df_test["id"]
out['price_doc'] = np.round(Y_test, 2)
out.to_csv("out12.csv", index=False)

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)


In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)


In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, Y_train)

In [None]:
out = pd.DataFrame()
out['id'] = df_test["id"]
out['price_doc'] = np.round(Y_test, 2)
out.to_csv("out5.csv", index=False)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.15, random_state=42)

In [None]:
df["num_room"].count()

In [None]:
df["kremlin_km"].count()

In [None]:
df["park_km"].count()

## NEURAL NETWORK ATTEMPT

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Lambda, Flatten, Dense
from keras.layers.convolutional import Conv2D, Cropping2D
from keras.layers.pooling import MaxPooling2D
from keras.layers.core import Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [27]:
def model():
    model = Sequential()
    model.add(Dense(30, input_shape=[X_train.shape[1]], kernel_initializer='normal', activation='relu'))
    #model.add(Dense(25, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.9))
    #model.add(Dense(15, kernel_initializer='normal', activation='relu'))
    #model.add(Dense(12, kernel_initializer='normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(5, kernel_initializer='normal', activation='relu'))
    #model.add(Dropout(0.8))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [28]:
#m = model()
#m.fit(X_train.as_matrix(), Y_train.as_matrix(), epochs=1, batch_size=128, verbose=0)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=model, epochs=25, batch_size=16, verbose=2)))
pipeline = Pipeline(estimators)

#pipeline = Pipeline(estimators)
#kfold = KFold(n_splits=10, random_state=seed)
#results = cross_val_score(pipeline, X, Y, cv=kfold)
#print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))
#estimator = KerasRegressor(build_fn=model, nb_epoch=100, batch_size=128, verbose=0)

In [29]:
pipeline.fit(X_train, Y_train)

Epoch 1/25
6s - loss: 62353983318064.3906
Epoch 2/25
6s - loss: 45346762403070.1406
Epoch 3/25
6s - loss: 42918977626333.4062
Epoch 4/25
6s - loss: 40867944428594.7656
Epoch 5/25
6s - loss: 41597843759295.0391
Epoch 6/25
6s - loss: 40994812367250.9609
Epoch 7/25
6s - loss: 40741533718143.0703
Epoch 8/25
6s - loss: 39877663844352.3125
Epoch 9/25
6s - loss: 39781801607170.5312
Epoch 10/25
6s - loss: 40664457697160.7578
Epoch 11/25
6s - loss: 39512658897589.6328
Epoch 12/25
6s - loss: 40136599767294.6172
Epoch 13/25
6s - loss: 39345408834187.2500
Epoch 14/25
6s - loss: 39110018809858.5312
Epoch 15/25
6s - loss: 38374600983669.1875
Epoch 16/25
6s - loss: 37084463518572.1328
Epoch 17/25
6s - loss: 37288722274005.1016
Epoch 18/25
6s - loss: 37194899896573.9844
Epoch 19/25
6s - loss: 37718008126955.3594
Epoch 20/25
6s - loss: 37263716316144.9766
Epoch 21/25
6s - loss: 36637711640614.2734
Epoch 22/25
6s - loss: 37464357073692.4297
Epoch 23/25
6s - loss: 37118720038843.3672
Epoch 24/25
6s - los

Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x0000019424AF9DD8>)])

In [30]:
y_keras_predict = pipeline.predict(X_test.as_matrix())

In [31]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_keras_predict)

3268035.7674025106

In [34]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_keras_predict)

1647484.5254320716

In [25]:
mean_absolute_error(y_test, Y_test)

1479700.9086362007

In [384]:
out = pd.DataFrame()
out['id'] = df_test["id"]
out['price_doc'] = np.round(y_keras_predict, 2)
out.to_csv("out11.csv", index=False)

In [None]:
kfold = KFold(n_splits=10, random_state=42)
results = cross_val_score(estimator, X_train.as_matrix(), Y_train.as_matrix(), cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [1]:
from xgboost import XGBRegressor



OSError: [WinError 126] The specified module could not be found