In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, truncnorm, randint
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('.\models\SVM\output.csv')
df.describe()

Unnamed: 0,floor,total_floor_num,rent_price,room_num,area,year,year_renovation,top_floor,oldtown
count,2302.0,2302.0,2302.0,2302.0,2302.0,2302.0,80.0,2302.0,2302.0
mean,3.377932,5.809296,523.900956,2.02563,53.301108,1990.491746,2013.175,0.216334,0.205039
std,2.249265,3.226175,235.495612,0.789294,21.369041,25.717765,7.577072,0.411834,0.403818
min,0.0,1.0,155.0,1.0,16.0,1903.0,1965.0,0.0,0.0
25%,2.0,4.0,360.0,1.0,38.0,1975.0,2011.75,0.0,0.0
50%,3.0,5.0,460.0,2.0,50.0,2000.5,2016.0,0.0,0.0
75%,4.0,7.0,650.0,2.0,65.0,2011.0,2017.25,0.0,0.0
max,20.0,29.0,1480.0,6.0,150.0,2019.0,2019.0,1.0,1.0


In [3]:
df.head()

Unnamed: 0,Adresas,floor,total_floor_num,rent_price,room_num,year_full,house_type,area,heating,year,year_renovation,top_floor,oldtown,district
0,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,4.0,6.0,890,3.0,2017,Monolitinis,76.0,Centrinis kolektorinis,2017,,0,1.0,senamiestyje
1,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,4.0,440,2.0,2008,Mūrinis,68.0,Centrinis kolektorinis,2008,,0,0.0,lazdyneliuose
2,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,2.0,3.0,811,3.0,1940,Mūrinis,64.0,Dujinis,1940,,0,1.0,senamiestyje
3,https://www.aruodas.lt/butu-nuoma-vilniuje-jus...,4.0,5.0,310,2.0,1985,Blokinis,50.0,Centrinis,1985,,0,0.0,justiniskese
4,https://www.aruodas.lt/butu-nuoma-vilniuje-nau...,4.0,9.0,380,2.0,2017,Mūrinis,34.0,Centrinis kolektorinis,2017,,0,0.0,naujamiestyje


## encoder one-hot

In [4]:
# http://www.insightsbot.com/python-one-hot-encoding-with-scikit-learn/
df["district"].fillna("None", inplace = True)
district_ohe = OneHotEncoder()

X = district_ohe.fit_transform(df.district.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = [district_ohe.get_feature_names()[i].replace("x0_", "") for i in range(X.shape[1])])
df = pd.concat([df, dfOneHot], axis=1)

## pipline and model

In [5]:
Pipeline_svm = Pipeline(steps = [
    ("impute", SimpleImputer(missing_values=np.nan)),
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("svm", LinearSVR())    
])

## train_test_split

In [6]:
df_train = df.select_dtypes(include=['float64','int64'])
df_features = df_train.drop(columns="rent_price")
df_target = df_train["rent_price"]
X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, random_state=42)

In [11]:
n_kernel = ["linear","rbf", "sigmoid", "precomputed"]
n_degree = [0.1, 0.5, 1,2,3 ,5, 10]
n_C = [1,10,50, 100, 200, 250 , 300, 350, 400]
n_gamma = [0.001, 0.005, 0.01, 0.1, 1, 5, 10, 50]

random_grid = {"svm__C": n_C
              }


svm_random = RandomizedSearchCV(Pipeline_svm, param_distributions = random_grid, cv = 5, scoring="neg_mean_squared_error", n_iter=100)

In [12]:
svm_random.fit(X_train, y_train)
svm_random.best_estimator_

Pipeline(memory=None,
         steps=[('impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svm',
                 LinearSVR(C=50, dual=True, epsilon=0.0, fit_intercept=True,
                           intercept_scaling=1.0, loss='epsilon_insensitive',
                           max_iter=1000, random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [13]:
print("RSME:", int((mean_squared_error(y_test, svm_random.predict(X_test)))**0.5))
print("MAE:", int((mean_absolute_error(y_test, svm_random.predict(X_test)))))

RSME: 116
MAE: 82


In [14]:
print("RSME:", int((mean_squared_error(y_train, svm_random.predict(X_train)))**0.5))
print("MAE:", int((mean_absolute_error(y_train, svm_random.predict(X_train)))))

RSME: 126
MAE: 85
