In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the two training tables from CSV; they are joined on `id` later,
# inside preproccessing().
train_main_df = pd.read_csv('train_main_data.csv')
train_additional_df = pd.read_csv('train_additional_data.csv')

In [4]:
# Load the matching pair of test tables from CSV.
test_main_df = pd.read_csv('test_main_data.csv')
test_additional_df = pd.read_csv('test_additional_data.csv')

In [5]:
# Preview the first rows of the main test table to check columns and missing values.
test_main_df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,apartment condition,sub_area
0,74544,2014-03-11,39,,6.0,1.0,1.0,,1.0,1.0,1.0,72
1,78384,2014-12-31,34,,2.0,17.0,1.0,,1.0,0.0,,86
2,30355,2012-08-16,29,18.0,2.0,,,,,,,12
3,16306,2013-05-02,55,37.0,3.0,,,,,,,118
4,48126,2013-03-07,32,16.0,9.0,,,,,,,28


In [6]:
# Chronological hold-out split: order the rows by timestamp, then send the
# earliest rows to training and the remainder to validation.
# NOTE(review): the cut position is 70% of the number of *distinct ids*, not of
# the number of rows; the two only coincide when ids are unique - confirm that
# this is intended.
train_main_df = train_main_df.sort_values(by='timestamp')
train_ids = train_main_df['id'].drop_duplicates()
split_at = round(0.7 * len(train_ids))
train_df = train_main_df.iloc[:split_at]
valid_df = train_main_df.iloc[split_at:]

In [7]:
# Sanity check: (rows, columns) of the training split.
train_df.shape

(20300, 13)

In [8]:
# Sanity check: (rows, columns) of the validation split.
valid_df.shape

(8700, 13)

Рекомендуемые этапы анализа данных
- Preprocessing (missing values, data types, ...)
- EDA (univariate and multivariate analysis)
- Feature engineering

- Hyperparameter tuning (tuning of all available params)
- Feature selection
- Post analysis on test dataset

# Data Preprocessing 

In [8]:
def preproccessing(df, add_df, model_for_na=None):
    """Clean the main table, join the additional table and fill missing values.

    Steps, in order:
      1. keep only rows with a positive ``full_sq``;
      2. left-join ``add_df`` on ``id``;
      3. parse ``timestamp`` and derive integer ``year`` / ``month`` columns;
      4. handle missing ``life_sq`` (see ``model_for_na``);
      5. replace every remaining missing value with ``-20``.

    Parameters
    ----------
    df : pandas.DataFrame
        Main table; must contain ``id``, ``timestamp``, ``full_sq`` and
        ``life_sq``.  It is not modified.
    add_df : pandas.DataFrame
        Additional table that shares the ``id`` key with ``df``.
    model_for_na : object with a ``predict`` method, optional
        When given, a ``life_sq_predicted`` column is added: it equals
        ``life_sq`` where that is known and ``model_for_na.predict`` applied
        to ``full_sq`` (as a single-column 2-D array) where it is missing.
        ``life_sq`` itself is left for the generic ``-20`` fill.
        When omitted, missing ``life_sq`` values are set to ``0``.

    Returns
    -------
    pandas.DataFrame
        A new frame without missing values.
    """
    df = df[df['full_sq'] > 0]
    df = df.merge(add_df, how='left', on='id')
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year.astype(int)
    df['month'] = df['timestamp'].dt.month.astype(int)

    # Compare against None explicitly: some estimator objects define __len__,
    # so their truth value is not a reliable "was a model passed" signal.
    if model_for_na is not None:
        missing_life_sq = df['life_sq'].isnull()
        df['life_sq_predicted'] = df['life_sq']
        # Only call the model when there is something to impute; estimators
        # commonly reject an empty input array.
        if missing_life_sq.any():
            full_sq_2d = df.loc[missing_life_sq, 'full_sq'].values.reshape(-1, 1)
            predicted = np.ravel(model_for_na.predict(full_sq_2d))
            df.loc[missing_life_sq, 'life_sq_predicted'] = predicted
    else:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary and leaves `df`
        # unchanged when pandas Copy-on-Write is active.
        df['life_sq'] = df['life_sq'].fillna(0)

    # All genuine values are positive, so a negative placeholder lets the
    # model distinguish rows that were originally missing.
    df = df.fillna(-20)
    return df

## Feature Engineering

In [9]:
# Clean the training split and left-join the additional training table onto it by `id`.
training_set = preproccessing(train_df, train_additional_df)

In [10]:
# The validation split was carved out of the training file, so it is joined
# with the same additional training table.
valid_set = preproccessing(valid_df,train_additional_df)

In [11]:
def feature_gen(df):
    """Derive calendar, ratio and area-difference features.

    The function works on a copy, so the frame passed in is left untouched
    and the call can be repeated on the same input (the original dropped
    ``timestamp`` in place, which made a second call fail).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``timestamp`` column plus ``full_sq``,
        ``life_sq``, ``kitch_sq``, ``build_year`` and ``year``.  A
        ``life_sq_predicted`` column is used as well when present.

    Returns
    -------
    pandas.DataFrame
        New frame with the feature columns added and ``timestamp`` removed.
    """
    df = df.copy()

    # ISO week number of the year and day of the week (Monday == 0).
    df['week_of_year'] = df['timestamp'].dt.isocalendar().week.astype(int)
    df['day_of_week'] = df['timestamp'].dt.weekday

    # Living / kitchen area as a share of the full area.
    df["ratio_life_dash_full_sq"] = df["life_sq"] / df["full_sq"]
    df["ration_kitchen_dash_full_sq"] = df["kitch_sq"] / df["full_sq"]

    # Age of the building at the time of the record: record year minus
    # construction year (the operands were previously swapped, which produced
    # negative ages).
    # NOTE(review): rows whose build_year holds a fill value instead of a real
    # year still yield a meaningless age - consider handling them explicitly.
    df['age'] = df['year'] - df["build_year"]

    # Area that is not living area, from the recorded and (if available) the
    # model-imputed living area.
    df['some_extra_sqr_1'] = df["full_sq"] - df["life_sq"]
    if "life_sq_predicted" in df.columns:
        df['some_extra_sqr_2'] = df["full_sq"] - df['life_sq_predicted']

    # The raw timestamp is replaced by the numeric calendar features above.
    return df.drop(columns=['timestamp'])

In [12]:
# Apply the same feature generation to the training and validation frames.
training_set_1  = feature_gen(training_set)
valid_set_1  = feature_gen(valid_set)

# Modeling

## Building regression model

In [13]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


In [14]:
class lin_model:
    """Linear regression on polynomial, standardized features.

    Pipeline: ``PolynomialFeatures`` -> ``StandardScaler`` -> linear model.

    Parameters
    ----------
    degree : int, default 1
        Degree passed to ``PolynomialFeatures`` (bias column excluded).
    regularization : any, default None
        When truthy a ``Ridge`` model is used, otherwise plain
        ``LinearRegression``.
    lambda_ : float, default 0
        Ridge penalty (``alpha``); only used when ``regularization`` is truthy.
    """

    def __init__(self, degree=1, regularization=None, lambda_=0):
        if regularization:
            self.linear_model = Ridge(alpha=lambda_)
        else:
            self.linear_model = LinearRegression()

        self.poly = PolynomialFeatures(degree, include_bias=False)
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train, X_valid, y_valid, score=False):
        """Fit the feature transformers and the linear model on the training data.

        The polynomial mapping and the scaler are fitted on ``X_train`` only.
        When ``score`` is true, train and validation metrics are printed side
        by side; the validation data is not touched otherwise.
        """
        X_train_mapped = self.poly.fit_transform(X_train)
        X_train_mapped_scaled = self.scaler.fit_transform(X_train_mapped)

        self.linear_model.fit(X_train_mapped_scaled, y_train)

        # Predictions are only needed for the report, so skip both passes
        # unless scoring was requested.
        if score:
            yhat_valid = self.predict(X_valid, preprocessed=False)
            # The training matrix is already mapped and scaled; reuse it.
            yhat_train = self.predict(X_train_mapped_scaled)
            valid_score = self.scores(y_valid, yhat_valid, name='_valid')
            train_score = self.scores(y_train, yhat_train, name='_train')
            print(pd.concat([train_score, valid_score], axis=1))

    def predict(self, X, preprocessed=True):
        """Predict targets for ``X``.

        Pass ``preprocessed=False`` for raw features, which are then run
        through the fitted polynomial mapping and scaler first.  With the
        default ``preprocessed=True`` ``X`` is fed to the model as is.
        """
        if not preprocessed:
            X_mapped = self.poly.transform(X)
            X = self.scaler.transform(X_mapped)

        yhat = self.linear_model.predict(X)
        return yhat

    def scores(self, y, yhat, name=''):
        """Return a one-row DataFrame with the columns ``RMSE{name}`` and ``MSE{name}``.

        Both values are halved, as in the original implementation.
        NOTE(review): halving the RMSE is not the square root of the halved
        MSE - confirm which convention is wanted.
        """
        # `mean_squared_error(..., squared=False)` was removed from
        # scikit-learn, so the root is taken here.  Working from the
        # per-output errors reproduces the old averaging exactly.
        per_output_mse = mean_squared_error(y, yhat, multioutput='raw_values')
        mse = per_output_mse.mean() / 2
        rms = np.sqrt(per_output_mse).mean() / 2
        return pd.DataFrame({f'RMSE{name}': [rms], f'MSE{name}': [mse]})
    

In [15]:
# Instantiate the model with its default settings.
# NOTE(review): this rebinds the name `lin_model` from the class to an
# instance, so running this cell a second time calls the instance and fails;
# consider a distinct instance name (and update the cells that use it).
lin_model = lin_model()

In [16]:
%%time

# Fit on the training features with `price` as the target and print
# train / validation scores.
# NOTE(review): every column except `price` is used as a feature, including
# the `id` join key - confirm identifier columns are meant to be model inputs.
lin_model.fit(training_set_1.drop(columns='price'),
              training_set_1['price'],
              valid_set_1.drop(columns='price'),
              valid_set_1['price'],
              score=True)

     RMSE_train     MSE_train    RMSE_valid     MSE_valid
0  1.597508e+06  5.104065e+12  2.092741e+06  8.759129e+12
CPU times: total: 15.6 ms
Wall time: 130 ms


## Tasks regarding the above model

1) Deal with overfitting (variance)

## Building Neural Network for Regression

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu,linear
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

In [32]:
from functools import partial
class NN_model:
    """Small dense neural network for regression on polynomial, standardized features.

    Pipeline: ``PolynomialFeatures`` -> ``StandardScaler`` -> Keras
    ``Sequential`` network (25 -> 15 -> 1 units, ReLU hidden layers, linear
    output) trained with Adam on mean squared error.

    Parameters
    ----------
    degree : int, default 3
        Degree passed to ``PolynomialFeatures`` (bias column excluded).
    regularization : any, default None
        When truthy, ``lambda_`` is used as the L2 kernel penalty.  When
        falsy, the penalty stays at 0.01, which is what the network was
        built with before.  Previously a truthy value left the instance
        without a model at all.
    lambda_ : float, default 0
        L2 penalty strength; only used when ``regularization`` is truthy.
    """

    def __init__(self, degree=3, regularization=None, lambda_=0):
        l2_strength = lambda_ if regularization else 0.01

        # Every layer shares the same activation / initializer / regularizer
        # settings unless overridden in the call.
        RegularizedDense = partial(
            Dense,
            activation="relu",
            kernel_initializer="he_normal",
            kernel_regularizer=tf.keras.regularizers.L2(l2=l2_strength),
        )
        self.model = Sequential(
            [
                RegularizedDense(25),
                RegularizedDense(15),
                RegularizedDense(1, activation="linear"),
            ],
            name='NN_model',
        )
        self.model.compile(
            loss='mean_squared_error',
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        )

        self.poly = PolynomialFeatures(degree, include_bias=False)
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train, X_valid, y_valid, score=False, epochs=40):
        """Fit the feature transformers and train the network.

        The polynomial mapping and the scaler are fitted on ``X_train`` only.
        ``epochs`` is forwarded to the Keras ``fit`` call.  When ``score`` is
        true, train and validation metrics are printed side by side; the
        validation data is not touched otherwise.
        """
        X_train_mapped = self.poly.fit_transform(X_train)
        X_train_mapped_scaled = self.scaler.fit_transform(X_train_mapped)

        self.model.fit(X_train_mapped_scaled, y_train, epochs=epochs)

        # Predictions are only needed for the report, so skip both passes
        # unless scoring was requested.
        if score:
            yhat_valid = self.predict(X_valid, preprocessed=False)
            # The training matrix is already mapped and scaled; reuse it.
            yhat_train = self.predict(X_train_mapped_scaled)
            valid_score = self.scores(y_valid, yhat_valid, name='_valid')
            train_score = self.scores(y_train, yhat_train, name='_train')
            print(pd.concat([train_score, valid_score], axis=1))

    def predict(self, X, preprocessed=True):
        """Predict targets for ``X``.

        Pass ``preprocessed=False`` for raw features, which are then run
        through the fitted polynomial mapping and scaler first.  With the
        default ``preprocessed=True`` ``X`` is fed to the network as is.
        """
        if not preprocessed:
            X_mapped = self.poly.transform(X)
            X = self.scaler.transform(X_mapped)

        yhat = self.model.predict(X)
        return yhat

    def scores(self, y, yhat, name=''):
        """Return a one-row DataFrame with the columns ``RMSE{name}`` and ``MSE{name}``.

        Both values are halved, as in the original implementation.
        NOTE(review): halving the RMSE is not the square root of the halved
        MSE - confirm which convention is wanted.
        """
        # `mean_squared_error(..., squared=False)` was removed from
        # scikit-learn, so the root is taken here.  Working from the
        # per-output errors reproduces the old averaging exactly.
        per_output_mse = mean_squared_error(y, yhat, multioutput='raw_values')
        mse = per_output_mse.mean() / 2
        rms = np.sqrt(per_output_mse).mean() / 2
        return pd.DataFrame({f'RMSE{name}': [rms], f'MSE{name}': [mse]})
    

In [33]:
%%time
# Bind the network to its own name instead of overwriting the NN_model class,
# so this cell can be re-run without a "object is not callable" error.
nn_model = NN_model()
# Train on every column except `price` (the target) and print train /
# validation scores.
nn_model.fit(
    training_set_1.drop(columns='price'),
    training_set_1['price'],
    valid_set_1.drop(columns='price'),
    valid_set_1['price'],
    score=True,
)



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


ValueError: y_true and y_pred have different number of output (1!=10)

In [None]:
# prediction = model.predict(image_of_two.reshape(1,400))  # prediction\