In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import warnings
import pickle
warnings.filterwarnings("ignore")

 ## Data Cleaning

In [None]:
# Key of the Google Sheet holding the scraped rental listings.
gsheetkey = '1r4Q0OFPHCFGvd0lzbDQtQHaylc9NY1oSaVnWFzfwcUw'
# CSV export endpoint for that sheet.
data_url = 'https://docs.google.com/spreadsheet/ccc?key={}&output=csv'.format(gsheetkey)
# Download the sheet straight into a DataFrame.
data = pd.read_csv(data_url)

In [None]:
# Preview the first five rows of the freshly loaded sheet.
data.head(n=5)

Unnamed: 0,city,division,description,link,House No.,Flat No.,Area,Price per month,Number of rooms,Floor,No. of floors,Build year,Building type,Heating system,energy_class,Nearest kindergarten,Nearest educational institution,Nearest shop,Public transport stop
0,,,,,,,,,,,,,,,,,,,
1,Vilnius,Žvėrynas,"Vilnius, Žvėrynas, Lūšių g., 4 rooms flat for ...",https://en.aruodas.lt/butu-nuoma-vilniuje-zver...,25.0,19.0,90 m²,850€,4.0,3.0,4.0,2003.0,Brick,Gas,,290 m,350 m,340 m,290 m
2,Vilnius,Naujamiestis,"Vilnius, Naujamiestis, Savanorių pr., 2 rooms ...",https://en.aruodas.lt/butu-nuoma-vilniuje-nauj...,1.0,,90 m²,1 000 €,2.0,8.0,27.0,2007.0,Monolithic,"Central, central thermostat",1 Class Good (1-3),240 m,180 m,460 m,80 m
3,Vilnius,Šnipiškės,"Vilnius, Šnipiškės, Juozo Balčikonio g., 1 roo...",https://en.aruodas.lt/butu-nuoma-vilniuje-snip...,19.0,,36 m²,530€,1.0,3.0,5.0,2020.0,Brick,Central thermostat,,270 m,880 m,520 m,380 m
4,Vilnius,Šnipiškės,"Vilnius, Šnipiškės, Juozo Balčikonio g., 1 roo...",https://en.aruodas.lt/butu-nuoma-vilniuje-snip...,19.0,,24 m²,450€,1.0,3.0,5.0,2020.0,Brick,Central thermostat,,270 m,880 m,520 m,380 m


In [None]:
# List the raw column names as a NumPy array.
data.columns.to_numpy()

array(['city', 'division', 'description', 'link', 'House No.', 'Flat No.',
       'Area', 'Price per month', 'Number of rooms', 'Floor',
       'No. of floors', 'Build year', 'Building type', 'Heating system',
       'energy_class', 'Nearest kindergarten',
       'Nearest educational institution', 'Nearest shop',
       'Public transport stop'], dtype=object)

In [None]:
# Discard rows in which every column is missing.
data = data.dropna(how='all')

In [None]:
# Keep only listings that have all four distance fields filled in.
data = data.dropna(
    subset=[
        'Nearest kindergarten',
        'Nearest educational institution',
        'Nearest shop',
        'Public transport stop',
    ]
)

In [None]:
# Strip the ' m²' unit and switch decimal commas to dots; missing values pass through untouched.
data['Area'] = data['Area'].map(
    lambda area: area if pd.isnull(area) else area.replace(' m²', '').replace(',', '.')
)

In [None]:
# Columns holding scraped distances such as '290 m' or '1,2 km'.
DISTANCE_COLUMNS = ['Nearest kindergarten', 'Nearest educational institution', 'Nearest shop', 'Public transport stop']


def _distance_to_metres(value):
    """Normalise one scraped distance to metres.

    Missing values are returned unchanged. Metre values ('290 m') come back as
    the cleaned numeric string, ready for a later float cast. Kilometre values
    ('1,2 km') are converted to a float number of metres so that both units end
    up on the same scale instead of 1.2 km being read as 1.2 m.
    """
    if pd.isnull(value):
        return value
    text = value.replace(',', '.').strip()
    if text.endswith('km'):
        # Convert kilometres to metres rather than just dropping the unit.
        return float(text[:-2]) * 1000
    return text.replace(' m', '')


# Column-wise Series.map avoids DataFrame.applymap, which newer pandas deprecates.
data[DISTANCE_COLUMNS] = data[DISTANCE_COLUMNS].apply(lambda column: column.map(_distance_to_metres))

In [None]:
# Remove the euro sign and thousands-separating spaces; missing values pass through untouched.
data['Price per month'] = data['Price per month'].map(
    lambda price: price if pd.isnull(price) else price.replace('€', '').replace(' ', '')
)

In [None]:
def _first_year(value):
    """Return the first four-digit run in ``value`` as a string, or NaN if there is none.

    Working on ``str(value)`` lets this accept both text such as
    '1975 construction, 2010 renovation' and numbers such as 2003.0. Taking the
    first four-digit run (instead of stripping every non-digit) keeps a float
    like 2003.0 from turning into '20030'.
    """
    found = re.search(r"\d{4}", str(value))
    return found.group(0) if found else np.nan


data['Build year'] = data['Build year'].map(_first_year)

In [None]:
# data['Build year'] = data['Build year'].apply(lambda x: re.sub('(\d\d\d\d [a-zA-Z,]+) ', repl= '', string=x) if not pd.isnull(x) else x)

In [None]:
# data['Build year'] = data['Build year'].apply(lambda x: re.sub('[a-zA-Z,]+', repl= '', string=x) if not pd.isnull(x) else x)

In [None]:
# Keep only the first whitespace-separated token of the energy class; missing values pass through.
data['energy_class'] = data['energy_class'].map(
    lambda label: label if pd.isnull(label) else label.split()[0]
)

In [None]:
# Cast the cleaned text columns to numeric dtypes: everything to float except the build year (int).
data = data.astype({
    **dict.fromkeys(
        [
            'Area',
            'Nearest kindergarten',
            'Nearest educational institution',
            'Nearest shop',
            'Public transport stop',
            'Price per month',
            'energy_class',
        ],
        'float',
    ),
    'Build year': 'int',
})

In [None]:
# Show the column labels before they are renamed to snake_case.
data.columns

Index(['city', 'division', 'description', 'link', 'House No.', 'Flat No.',
       'Area', 'Price per month', 'Number of rooms', 'Floor', 'No. of floors',
       'Build year', 'Building type', 'Heating system', 'energy_class',
       'Nearest kindergarten', 'Nearest educational institution',
       'Nearest shop', 'Public transport stop'],
      dtype='object')

In [None]:
# Switch the scraped column labels to snake_case identifiers.
data = data.rename(
    columns={
        'House No.': 'house_no',
        'Flat No.': 'flat_no',
        'Area': 'area',
        'Price per month': 'price_per_month',
        'Number of rooms': 'no_of_rooms',
        'Floor': 'floor',
        'No. of floors': 'no_of_floors',
        'Build year': 'build_year',
        'Building type': 'building_type',
        'Heating system': 'heating_system',
        'Nearest kindergarten': 'nearest_kindergarten',
        'Nearest educational institution': 'nearest_educational_institution',
        'Nearest shop': 'nearest_shop',
        'Public transport stop': 'public_transport_stop',
    }
)

## Data Preparation

In [None]:
# Feature columns fed to the model (the target, price_per_month, is selected separately).
model_data = data.loc[
    :,
    [
        'division',
        'no_of_rooms',
        'area',
        'floor',
        'no_of_floors',
        'build_year',
        'building_type',
        'nearest_kindergarten',
        'nearest_educational_institution',
        'nearest_shop',
        'public_transport_stop',
    ],
]

In [None]:
# Names of the numeric feature columns, selected through the public select_dtypes API
# rather than the private DataFrame._get_numeric_data helper.
num_attribs = model_data.select_dtypes(include='number').columns.to_list()

In [None]:
# Names of the object-dtype feature columns, treated as categorical.
cat_attribs = model_data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Numeric features: fill missing values with the column median.
num_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy="median"))])
# Categorical features: fill missing values with the most frequent label, then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as all zeros
# instead of raising, so scoring or predicting on an unseen division/building type still works.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Route numeric and categorical columns through their respective preprocessing pipelines.
preprocess = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)

In [None]:
# Full model: preprocessing followed by ordinary least-squares regression.
# The step names match those make_pipeline derives from the class names.
model = Pipeline(
    steps=[
        ('columntransformer', preprocess),
        ('linearregression', LinearRegression()),
    ]
)

In [None]:
# Features and target (monthly rent) for the regression.
x, y = model_data, data['price_per_month']

In [None]:
# Hold out 10% of the listings for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)

In [None]:
# Show the feature columns present in the training split.
X_train.columns

Index(['division', 'no_of_rooms', 'area', 'floor', 'no_of_floors',
       'build_year', 'building_type', 'nearest_kindergarten',
       'nearest_educational_institution', 'nearest_shop',
       'public_transport_stop'],
      dtype='object')

In [None]:
# Fit the preprocessing steps and the regression on the training split.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['no_of_rooms', 'area',
                                                   'floor', 'no_of_floors',
                                                   'build_year',
                                                   'nearest_kindergarten',
                                                   'nearest_educational_institution',
                                                   'nearest_shop',
                                                   'public_transport_stop']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   

In [None]:
# Score the fitted pipeline on the held-out split.
model.score(X=X_test, y=y_test)

0.8143224766178463

In [None]:
# One hand-written listing used as a smoke test for the fitted pipeline.
example_flat = {
    "division": "Šnipiškės",
    "area": 71.78,
    "no_of_rooms": 2,
    "build_year": 1940,
    "floor": 1,
    "nearest_kindergarten": 120,
    "nearest_educational_institution": 310.0,
    "nearest_shop": 170.0,
    "public_transport_stop": 80.0,
    "building_type": "Brick",
    "no_of_floors": 3,
}
# Wrap the record in a single-row DataFrame so the pipeline can select columns by name.
model.predict(pd.DataFrame([example_flat]))

array([804.30882851])

In [None]:
# Persist the fitted pipeline (preprocessing + regression) to disk.
with open("linear_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)