### Project

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor

In [54]:
def preproc(df_input):
    """Clean implausible values, impute gaps and encode flags in a housing frame.

    The input is copied first, so ``df_input`` itself is left untouched. All
    medians are computed from the frame being processed.

    Steps, in order:
      1. ``Square``: values below 15 (``Rooms`` < 2) or below 30 (``Rooms`` > 1)
         are replaced by the median ``Square`` for that room count; afterwards
         ``Square`` is raised to ``LifeSquare`` wherever it was smaller.
      2. ``LifeSquare``: missing values get the per-room-count median.
      3. ``KitchenSquare``: set to 0.0 where ``Rooms`` < 1, otherwise replaced
         by the per-room-count median.
      4. ``HouseFloor`` is raised to ``Floor`` wherever it was smaller.
      5. ``HouseYear`` is capped at 2019.
      6. ``Healthcare_1``: missing values get the column median.
      7. ``Ecology_2``, ``Ecology_3``, ``Shops_2``: 1 where the value is 'A',
         else 0.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing at least the columns named above plus ``Rooms`` and
        ``Floor``.

    Returns
    -------
    pandas.DataFrame
        A processed copy with the same index and columns as ``df_input``.
    """
    df_output = df_input.copy()

    # --- Square: replace implausibly small areas by the per-room median. ---
    # The medians are taken once, before any replacement.
    square_by_rooms = df_output.groupby('Rooms')['Square'].median()

    idx = (df_output['Square'] < 15) & (df_output['Rooms'] < 2)
    df_output.loc[idx, 'Square'] = df_output.loc[idx, 'Rooms'].map(square_by_rooms)

    idx = (df_output['Square'] < 30) & (df_output['Rooms'] > 1)
    df_output.loc[idx, 'Square'] = df_output.loc[idx, 'Rooms'].map(square_by_rooms)

    # Total area cannot be smaller than living area.
    idx = df_output['Square'] < df_output['LifeSquare']
    df_output.loc[idx, 'Square'] = df_output.loc[idx, 'LifeSquare']

    # --- LifeSquare: impute missing values with the per-room median. ---
    life_square_by_rooms = df_output.groupby('Rooms')['LifeSquare'].median()
    idx = df_output['LifeSquare'].isnull()
    df_output.loc[idx, 'LifeSquare'] = df_output.loc[idx, 'Rooms'].map(life_square_by_rooms)

    # --- KitchenSquare ---
    idx = df_output['Rooms'] < 1
    df_output.loc[idx, 'KitchenSquare'] = 0.0

    # NOTE(review): this overwrites *every* KitchenSquare value for Rooms > 0
    # with the group median, not only outliers - confirm this is intended.
    kitchen_square_by_rooms = df_output.groupby('Rooms')['KitchenSquare'].median()
    idx = df_output['Rooms'] > 0
    df_output.loc[idx, 'KitchenSquare'] = df_output.loc[idx, 'Rooms'].map(kitchen_square_by_rooms)

    # --- Floors: a flat cannot be above the top floor of its house. ---
    idx = df_output['HouseFloor'] < df_output['Floor']
    df_output.loc[idx, 'HouseFloor'] = df_output.loc[idx, 'Floor']

    # --- HouseYear: cap at 2019. ---
    df_output['HouseYear'] = df_output['HouseYear'].clip(upper=2019)

    # --- Healthcare_1: median imputation. ---
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the in-place form acts on a temporary under pandas
    # Copy-on-Write and would leave the NaNs in df_output.
    healthcare_median = df_output['Healthcare_1'].median()
    df_output['Healthcare_1'] = df_output['Healthcare_1'].fillna(healthcare_median)

    # --- Flags: 'A' -> 1, anything else (including NaN) -> 0. ---
    for flag_column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
        df_output[flag_column] = (df_output[flag_column] == 'A').astype('int64')

    return df_output

In [55]:
# Labelled training set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data = pd.read_csv(
    '/Users/timursasin/Downloads/project_task/train.csv',
    index_col=0,
    encoding='utf-8',
)

In [56]:
# Rows to predict; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
new_data = pd.read_csv(
    '/Users/timursasin/Downloads/project_task/test.csv',
    index_col=0,
    encoding='utf-8',
)

In [57]:
# Drop outlier rows before preprocessing. Rows with NaN in any of these three
# columns are dropped too, because a comparison with NaN is False.
# Missing values are not filled here; preproc() imputes LifeSquare and
# Healthcare_1 with medians.
is_plausible = (
    (data['Rooms'] < 10)
    & (data['Square'] < 250)
    & (data['HouseFloor'] < 100)
)
cleared_data = data[is_plausible]
processed_data = preproc(cleared_data)

In [58]:
# Target vector and feature matrix; DistrictId is excluded from the features.
y = processed_data['Price'].values
X = processed_data.drop(columns=['DistrictId', 'Price'])

In [59]:
# Reproducible 70/30 shuffled hold-out split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [60]:
# Hyper-parameters shared by the hold-out check and the final model.
rf_params = dict(
    n_estimators=400,
    max_depth=26,
    max_features=0.5,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Hold-out check: fit the scaler and the forest on the training split only so
# the validation rows stay unseen, then report R^2 on them.
valid_scaler = MinMaxScaler()
valid_model = RandomForestRegressor(**rf_params)
valid_model.fit(valid_scaler.fit_transform(X_train), y_train)
valid_r2 = valid_model.score(valid_scaler.transform(X_valid), y_valid)
print('Hold-out R^2: {:.4f}'.format(valid_r2))

# Final model: refit scaler and forest on all labelled rows for the submission.
scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)

model = RandomForestRegressor(**rf_params)

model.fit(X_scaled, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=26,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [61]:
# Apply the same cleaning and encoding to the rows to predict.
# NOTE(review): preproc() recomputes its medians from new_data itself rather
# than reusing the training-set medians - confirm this is intended.
processed_new_data = preproc(new_data)

In [62]:
# Feature matrix for prediction; DistrictId is excluded, as in training.
X_new = processed_new_data.drop(columns=['DistrictId'])

In [63]:
# Reuse the already-fitted scaler (transform only, no refit on the new rows).
X_new_scaled = scaler.transform(X_new)

In [64]:
# Predict a price for each row of the scaled new features.
y_pred_new = model.predict(X_new_scaled)

In [65]:
# Submission table: one row per id from X_new's index with its predicted price.
predictions = pd.DataFrame({'Id': X_new.index, 'Price': y_pred_new})

In [66]:
# Write the submission file to the working directory, without the row index.
predictions.to_csv(
    'TSasin_predictions.csv',
    sep=',',
    index=False,
    encoding='utf-8',
)