<a href="https://colab.research.google.com/github/tomPlus353/google-colab-notebooks/blob/main/%5BGrune_account%5DRestaurant_rent_neural_net(Taberuba).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@title no. 1 setup project



```
-- sql query to export data from taberuba:
SELECT
    surface_area,
    rent_amount,
    latitude,
    longitude,
    date_built,
    city_id
FROM
    properties
WHERE
    rent_amount IS NOT NULL
    AND surface_area IS NOT NULL
    AND date_built IS NOT NULL
    AND latitude IS NOT NULL
    AND longitude IS NOT NULL;
```



In [3]:
from google.colab import files
import os

# Ask the user to pick the exported CSV through Colab's upload dialog.
uploaded = files.upload()

# If nothing was uploaded the result is empty; fail with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the exported CSV.")

# Only the first uploaded entry is used.
file_name = next(iter(uploaded))
file_path = os.path.join(os.getcwd(), file_name)

print("Uploaded file path:", file_path)

KeyboardInterrupt: 

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# NOTE(review): this cell does not assign file_path (the line below is commented
# out), yet the data-loading cell reads file_path. Uncomment and adapt it when
# loading the CSV from Drive.
# file_path = '/content/drive/MyDrive/Your_Folder_Name/your_data.csv' # Example path


In [None]:
!pip install tensorflow

In [None]:
#@title no. 2 build and save model

In [None]:
import pandas as pd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import math

# Load the exported listings from the CSV selected in the setup section.
with open(file_path) as f:
    data = pd.read_csv(f)

# Drop every row that has a missing value in any column
data.dropna(inplace=True)

# Convert 'date_built' to datetime
data['date_built'] = pd.to_datetime(data['date_built'], format='ISO8601')

# Fixed reference date, so building ages do not change from run to run
comparison_date = pd.to_datetime("2023-11-23")

# Building age in calendar months (the day of the month is ignored)
data['months_diff'] = ((comparison_date.year - data['date_built'].dt.year) * 12 +
                     comparison_date.month - data['date_built'].dt.month)

num_count_unique_city = data['city_id'].nunique()
print(f"Number of unique values in 'city_id': {num_count_unique_city}")

# Cluster listings by coordinates into as many groups as there are distinct
# city ids. random_state pins the centroid initialisation so the "area"
# labels are the same on every run.
from sklearn.cluster import KMeans
latitude = data["latitude"]
longitude = data["longitude"]
kmeans = KMeans(n_clusters=num_count_unique_city, random_state=42)
kmeans.fit(list(zip(latitude, longitude)))
data["area"] = kmeans.labels_

# Show the first rows; a bare `data.head(20)` in the middle of a cell
# displays nothing, so print it explicitly.
print(data.head(20))

# select categorical and numerical features

num_cols = [
    "surface_area",
    "months_diff",
]

coor_cols = [
    "latitude",
    "longitude",
]

cat_cols = [
    # "area",
    "city_id",
]

# Numeric features followed by coordinates, in that order
num_and_coor = num_cols + coor_cols


# check the shape, data type and summary statistics of each column
print(data.shape)
print(data.dtypes)
print(data.describe())


In [None]:
"""
Duplicate the dataset for better testing
"""

# NOTE(review): this cell is disabled -- every line below is commented out.
# If re-enabled, each iteration concatenates `data` with itself, so 3
# iterations produce 8x the rows, and identical rows can then land in both
# the train and the test split of the later train_test_split calls.

# #iterations
# timesToDuplicate = 3
# for _ in range(timesToDuplicate):
#   # Duplicate the dataset
#   data = pd.concat([data, data], ignore_index=True)

# # check the data type of each column
# print(data.dtypes)
# print(data.describe())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def showCorrelation(data, columns, target):
  """Print and plot the correlation matrix of the features and the target.

  Args:
    data: DataFrame containing every name in `columns` as well as `target`.
    columns: Names of the feature columns. The sequence is not modified.
    target: Name of the target column; it becomes the last row/column of
      the matrix.
  """
  # Build a new list rather than appending to `columns`: appending would
  # mutate the caller's list, growing it by one entry on every call.
  selected = [*columns, target]

  # calculate the correlation matrix for the selected columns
  corr_matrix = data[selected].corr()

  # print the correlation matrix
  print(corr_matrix)

  # visualize the correlation matrix using a heatmap
  sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)

# Correlate every candidate feature (including the k-means "area" label)
# with the rent.
showCorrelation(
    data=data,
    columns=[
        "surface_area",
        "months_diff",
        "area",
        "latitude",
        "longitude",
    ],
    target='rent_amount',
)



```
# This is formatted as code
```

# KNN approach

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.colors as colors
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

def visualise(df, vmin, vmax):
    """Scatter-plot the listings by coordinates, coloured by rent.

    Rows are plotted in ascending `rent_amount` order. `vmin` and `vmax`
    are the bounds of the colour scale. Note that this also sets the
    global matplotlib figure size and dpi.
    """
    # Global figure settings used by the scatter plot below
    plt.rcParams['figure.figsize'] = [5, 6]
    plt.rcParams['figure.dpi'] = 100

    ordered = df.sort_values(by='rent_amount')
    colour_scale = colors.Normalize(vmin=vmin, vmax=vmax)

    plt.scatter(
        ordered['longitude'],
        ordered['latitude'],
        s=0.01,
        c=ordered['rent_amount'],
        cmap='plasma_r',
        norm=colour_scale,
        alpha=0.8,
    )
    plt.colorbar()
    plt.show()

# Colour scale bounds: 10,000 to 10,000,000 (digits grouped by four).
visualise(data, vmin=1_0000, vmax=1000_0000)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Features in num_and_coor order (surface_area, months_diff, latitude,
# longitude) and the rent as target
X = data[num_and_coor]
y = data['rent_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# KNN is distance based, so unscaled columns with a large numeric range would
# dominate the neighbour search. Standardise inside a pipeline so the scaler
# is re-fit on the training part of every CV fold.
knn_pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])

# ranges of hyperparameters to try out (prefixed with the pipeline step name)
params = {
    'knn__n_neighbors': range(1, 100, 1),
    'knn__weights': ['uniform', 'distance'],
}

# find ideal hyperparameters with 5-fold cross-validation
model = GridSearchCV(knn_pipeline, params, cv=5, verbose=0)
model.fit(X_train.values, y_train.values)
model.best_params_



In [None]:
# Score the fitted grid search on the held-out split.
# NOTE(review): presumably R^2, since no `scoring` was passed to GridSearchCV -- confirm.
model.score(X_test.values,y_test.values)


In [None]:
def yenToMan(yen):
  """Convert an amount in yen to man (units of 10,000 yen)."""
  return yen / 10_000

def price(description, lat, lon, months, area, estimator=None):
    """Print the predicted rent, in man, for a single listing.

    Args:
        description: Label printed in front of the prediction.
        lat: Latitude of the listing.
        lon: Longitude of the listing.
        months: Building age in months.
        area: Surface area of the listing.
        estimator: Fitted object with a `predict` method. Defaults to the
            notebook-level `model`.
            NOTE(review): `model` is rebound to a neural network in later
            cells; pass the KNN estimator explicitly when calling this
            after those cells have run.
    """
    if estimator is None:
        estimator = model

    # The feature order must match the training columns num_and_coor:
    # surface_area, months_diff, latitude, longitude. The previous order
    # (lat, lon, months, area) fed every value into the wrong column.
    features = [[area, months, lat, lon]]

    # Take the single prediction explicitly; int() on a whole array is
    # deprecated in recent NumPy versions.
    predicted_yen = estimator.predict(features)[0]
    print("{:30s} -> {:5.0f}man ".format(description, yenToMan(int(predicted_yen))))

# Examples of new data: (description, latitude, longitude, age in months, surface area).
# The trailing number on each row is carried over from the original cell
# (presumably the listed rent in man -- confirm).
sample_listings = [
    ('隆美荘貸店舗　1階', 35.7303492, 139.7879446, (58 * 12) + 2, 48.49),  # 18.6
    ('メゾンキュート　1階', 35.7156993, 139.784519, 38 * 12, 49.65),  # 18.7
    ('Bright亀戸天神 1階', 35.7018962, 139.8284366, 0, 32.96),  # 20.9
    ('TVCビル 3階', 35.7014974, 139.8240094, (54 * 12) + 5, 75.7),  # 66
]
for listing in sample_listings:
    price(*listing)

# Standard approach (NN with features as-is)

In [None]:
# Choose the train/test rows BEFORE fitting any transformer, so the encoder
# and the scaler learn their categories / mean / variance from training rows
# only and no test-set statistics leak into the features.
row_positions = np.arange(len(data))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=0)
train_rows = data.iloc[train_pos]

# one-hot encode categorical features; categories that occur only in the
# test rows encode as all zeros because of handle_unknown='ignore'
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_rows[cat_cols])
encoded_cat = encoder.transform(data[cat_cols]).toarray()

# standardize numerical features using training-set statistics
scaler = StandardScaler()
scaler.fit(train_rows[num_cols])
scaled_num = scaler.transform(data[num_cols])

# combine encoded categorical and scaled numerical features
X = np.hstack((encoded_cat, scaled_num))

# define target variable
y = data['rent_amount']

# train-test split using the row positions chosen above
X_train, X_test = X[train_pos], X[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]
print(X_train.shape)

In [None]:
# define model: three hidden layers tapering to 1/2, 1/4 and 1/8 of the
# input width, each followed by dropout, then a single regression output
n_features = X_train.shape[1]
print(n_features / 2)

# max(1, ...) keeps every hidden layer at least one unit wide; with fewer
# than 8 input columns the plain division would truncate to 0 units.
model = Sequential()
model.add(Dense(max(1, n_features // 2), activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.3))
model.add(Dense(max(1, n_features // 4), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(max(1, n_features // 8), activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1))

# compile model
model.compile(loss='mae', optimizer='adam')

# early stopping
# NOTE(review): early_stop is created but left out of `callbacks` below, so
# training always runs for the full 100 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1) #set patience=3 etc for early stopping

# Train model
# NOTE(review): the test split doubles as validation data here.
with tf.device('/GPU:0'):  # Use the first GPU
  model.fit(X_train,
            y_train,
            epochs=100,
            validation_data=(X_test, y_test),
            callbacks=[
                #early_stop
                       #,cp_callback
                       ])

# evaluate model (the loss is MAE, so evaluate() returns the test MAE)
mae = model.evaluate(X_test, y_test)
print(f"Mean absolute error: {mae:.2f}")

# # make predictions
# customerTest = ?
# y_pred = model.predict(customerTest)

In [None]:
# Express the test MAE relative to the median rent (`avg` holds the median).
avg = y.median()
mae_share = (mae / avg) * 100
print(f"Mean absolute error as percentage of median price: {mae_share}%")

# NN with location cluster

In [None]:
# one-hot encode categorical features
# NOTE(review): cat_cols currently holds only "city_id" ("area" is commented
# out where cat_cols is defined), so this section uses the same features as
# the previous one despite its heading.
# NOTE(review): the encoder and scaler are fit on the full dataset before
# the split, so test-set statistics leak into the training features.
encoder = OneHotEncoder(handle_unknown='error')
encoder.fit(data[cat_cols])
encoded_cat = encoder.transform(data[cat_cols]).toarray()

# standardize numerical features
scaler = StandardScaler()
scaled_num = scaler.fit_transform(data[num_cols]) # num_cols excludes the coordinates

# combine encoded categorical and scaled numerical features
X = np.hstack((encoded_cat, scaled_num))

# define target variable
y = data['rent_amount']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)

# define model: hidden layers of full, half and quarter input width.
# Layer widths must be integers; the previous `shape[1] / 2` passed floats.
# max(1, ...) keeps the layers at least one unit wide for narrow inputs.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(n_features, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(max(1, n_features // 2), activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(max(1, n_features // 4), activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(1))

# compile model
model.compile(loss='mae', optimizer='adam')

# early stopping: stop after 10 epochs without val_loss improvement
early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# Train model
with tf.device('/GPU:0'):  # Use the first GPU
  model.fit(X_train,
            y_train,
            epochs=100,
            validation_data=(X_test, y_test),
            callbacks=[early_stop
                       #,cp_callback
                       ]
            )

# evaluate model (the loss is MAE, so evaluate() returns the test MAE)
mae = model.evaluate(X_test, y_test)
print(f"Mean absolute error: {mae:.2f}")

# # make predictions
# customerTest = ?
# y_pred = model.predict(customerTest)