**Verilerin lineer regresyon ile tahmin edilmesi**

In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [64]:
df=pd.read_csv('/content/data_cleand.csv')

In [65]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          3586 non-null   object
 1   district      3586 non-null   object
 2   neighborhood  3586 non-null   object
 3   room          3586 non-null   int64 
 4   living-room   3586 non-null   int64 
 5   area          3586 non-null   int64 
 6   age           3586 non-null   int64 
 7   floor         3586 non-null   int64 
 8   price         3586 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 252.3+ KB
None


In [66]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living-room'] = df['living-room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [67]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          3586 non-null   category
 1   district      3586 non-null   category
 2   neighborhood  3586 non-null   category
 3   room          3586 non-null   int64   
 4   living-room   3586 non-null   int64   
 5   area          3586 non-null   int64   
 6   age           3586 non-null   int64   
 7   floor         3586 non-null   int64   
 8   price         3586 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 205.2 KB
None


In [68]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living-room', 'area', 'age', 'floor']

In [69]:
'''
standardScaler ile ortalamaya civarında dagılım için,
one hot encoder ile kategorik degişkenler örn sehirler bulunma durumuna göre 0 0 1 0 gibi degerler alır
eger egitim modelinin görmedigi veriler testte karsımıza cıkarsa modelin görmezden gelmesini istiyoruz ignore
'''
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [71]:
X = df.drop('price', axis=1)
y = df['price']

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [73]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [74]:
model.fit(X_train, y_train)

In [75]:
y_pred=model.predict(X_test)
mse=mean_squared_error(y_test,y_pred)#hataların kareleri ort
rmse=np.sqrt(mse)
r2=r2_score(y_test,y_pred)

In [76]:
print(f"MSE:{mse}")
print(f"RMSE:{rmse}")
print(f"R^2: {r2}")#dusuk dogruluk

MSE:45861594.09372739
RMSE:6772.11887770197
R^2: 0.4800527335293512


In [77]:
feature_importance=model.named_steps['model'].coef_
print(feature_importance)#az katogorimiz olmasına ragmen her benzersiz deger sutun olarak eklenmiş

[ 4.77738382e+02  0.00000000e+00  4.02488273e+03 -2.94176347e+03
  1.79303782e+02 -2.16727249e+03 -3.54047096e+03  3.63368365e+03
  2.37524036e+02  1.83653576e+03 -5.98454196e+03  1.77754194e+03
 -2.57242326e+03 -8.43732561e+03  1.46379895e+04 -1.62524442e+03
 -4.58204804e+03  9.46583736e+03 -6.26511292e+03 -3.44385055e+03
 -3.37780271e+03  2.71981436e+04  1.36081816e+03  2.56645000e+03
 -1.20965665e+03  3.38789058e+03  1.31645459e+04  2.61784328e+03
 -7.92600937e+03  5.90541854e+03 -6.39787319e+03 -4.34112440e+03
  2.65178236e+04  2.64703491e+03 -1.65556351e+04 -4.68963473e+02
  3.94375763e+03 -6.20511891e+03  2.15183599e+04  2.94589493e+03
  6.47325796e+03 -6.02480351e+03  3.79924862e+03  2.37524036e+02
 -3.20116626e+03  2.47395701e+03 -5.29113413e+03 -4.59307398e+03
 -1.10979316e+04 -8.25390515e+03 -3.39304692e+02 -3.78580446e+03
 -2.68694243e+03 -9.93721314e+03 -1.47598684e+03 -3.82979649e+03
 -9.97924043e+03  7.10992748e+02  5.52660974e+03 -8.98790621e+03
 -5.15483350e+03  3.94612

In [79]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importance[i])
#önem degerleri asagıdaki gibi

Numerical Features
room 477.7383822808313
living-room 0.0
area 4024.8827343390117
age -2941.76346508019
floor 179.303782386625


In [81]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importance[len(numerical_features) + j])
#En degerli evler izmir mugla aydın ,afyon  denizli daha dusuk

Categorical Features
aydin -2167.2724903801895
denizli -3540.470956086505
izmir 3633.683646102207
kutahya 237.52403629110717
mugla 1836.5357641491723
aliaga -2167.2724903801895
balcova -3540.470956086505
bayrakli 3633.683646102207
bergama 237.52403629110717
bodrum 1836.5357641491723
bornova -5984.541963558784
buca 1777.5419361112392
cesme -2572.423256214912
cigli -8437.325605600296
cine 14637.989492041826
dalaman -1625.2444152358362
datca -4582.048037541318
didim 9465.837362604538
dikili -6265.112916354263
efeler -3443.850550679455
fethiye -3377.8027061749235
foca 27198.14361808869
gaziemir 1360.8181620465552
germencik 2566.450001410197
guzelbahce -1209.6566526797385
incirliova 3387.890575003928
karabaglar 13164.545908477052
karaburun 2617.8432812084475
karsiyaka -7926.009370485816
kavaklidere 5905.418535447164
kemalpasa -6397.873193472481
konak -4341.124396745104
koycegiz 26517.823553848044
kusadasi 2647.034911693383
marmaris -16555.635120295516
menderes -468.9634730805645
menemen 394

In [84]:
new_data = pd.DataFrame({
    'city': ['izmir'],
    'district': ['urla'],
    'neighborhood': ['gulbahce'],
    'room': [4],
    'living-room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[41058.36703586]


In [85]:
print(df[(df['city'] == 'izmir') & (df['district'] == 'urla') & (df['neighborhood'] == 'gulbahce')])


       city district neighborhood  room  living-room  area  age  floor  price
1451  izmir     urla     gulbahce     2            1    90    6      0  30000
1919  izmir     urla     gulbahce     1            1    70    7      0  22000
2320  izmir     urla     gulbahce     2            1    70    0      1  25000
2349  izmir     urla     gulbahce     2            1    70    6      1  21000
2372  izmir     urla     gulbahce     1            1    45    7      0  19000
2604  izmir     urla     gulbahce     2            1    90   12      0  18000
2606  izmir     urla     gulbahce     1            1    35   15      0  18000
2878  izmir     urla     gulbahce     1            1    40    7      0  22000
3131  izmir     urla     gulbahce     2            1   120   26      1  35000
3403  izmir     urla     gulbahce     2            1   120   30      2  35000
3421  izmir     urla     gulbahce     1            1    50   10      1  25000


**dogrulugu arttırmak için fiyat aralıgı yapılabilir**