# Import Required Libraries
Import the necessary libraries, including pandas, scikit-learn, and matplotlib.

In [None]:
# Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import re
import matplotlib.pyplot as plt

# Load and Combine Datasets
Load the datasets from CSV files, extract dates from filenames, and combine them into a single DataFrame.

In [None]:
# Load and Combine Datasets

# CSV snapshots to merge. Each filename embeds a date token that the loader
# below parses as day_month_year.
files = [
    r'C:\Users\yagiz\OneDrive\Masaüstü\Uygulamalar\kodlar\ist_rent\dataset\5_9_2022_sahibinden_ev.csv',
    r'C:\Users\yagiz\OneDrive\Masaüstü\Uygulamalar\kodlar\ist_rent\dataset\22_5_2022_sahibinden_ev.csv',
    r'C:\Users\yagiz\OneDrive\Masaüstü\Uygulamalar\kodlar\ist_rent\dataset\26_5_2022_sahibinden_ev.csv'
]


def _read_dated_csv(csv_path):
    """Read one CSV and stamp every row with the date found in its path.

    The first ``d_m_yyyy`` token in ``csv_path`` is interpreted as
    day, month, year. When the path contains no such token the frame is
    returned without a 'date' column.
    """
    frame = pd.read_csv(csv_path)
    found = re.search(r'(\d{1,2})_(\d{1,2})_(\d{4})', csv_path)
    if found is not None:
        day_no, month_no, year_no = (int(part) for part in found.groups())
        frame['date'] = pd.Timestamp(year=year_no, month=month_no, day=day_no)
    return frame


# One DataFrame per input file, in the order listed in `files`
dataframes = [_read_dated_csv(csv_path) for csv_path in files]

# Stack everything into a single DataFrame with a fresh 0..n-1 index
data = pd.concat(dataframes, ignore_index=True)

# Progress message
print("Veri yüklendi ve birleştirildi.")

# Clean the Data
Remove missing values, convert columns to numeric types, and clean invalid data.

In [None]:
# Clean the Data

# First pass: discard every row that has a missing value in any column
data = data.dropna()
print("Eksik veriler temizlendi.")


def _digits_only_numeric(column):
    """Delete all non-digit characters from string values, then parse as numbers.

    Values that still cannot be parsed become NaN (errors='coerce').
    """
    stripped = column.replace('[^0-9]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')


# (target column, source column) pairs; 'rooms' is a new column derived
# from 'numberOfRooms', the other two are converted in place.
# NOTE(review): stripping non-digits also removes separators such as '+' or '.',
# so a raw value like "3+1" would be parsed as 31 — confirm the raw formats.
for target_col, source_col in (
    ('price', 'price'),
    ('area', 'area'),
    ('rooms', 'numberOfRooms'),
):
    data[target_col] = _digits_only_numeric(data[source_col])

# Second pass: rows whose values failed to parse are NaN now, so drop them too
data = data.dropna()
print("Geçersiz veriler temizlendi.")

# Feature and Target Selection
Select features (area, rooms, town, year) and target variable (price) for the model.

In [None]:
# Feature and Target Selection

# Select the feature columns and the target variable (price).
# The explicit .copy() makes X an independent DataFrame: adding the 'year'
# column to a bare selection from `data` is chained assignment and triggers
# pandas' SettingWithCopyWarning.
X = data[['area', 'rooms', 'town', 'date']].copy()  # Features
y = data['price']  # Target variable

# Derive the 'year' feature from the snapshot date
X['year'] = X['date'].dt.year

# The full date is no longer needed once the year has been extracted
X = X.drop(columns=['date'])

# One-hot encode the town; drop_first=True omits the first category, which
# then acts as the implicit baseline (all town_* columns equal to 0)
X = pd.get_dummies(X, columns=['town'], drop_first=True)

# Display the first few rows of the feature set and target variable
X.head(), y.head()

# Encode Categorical Variables
Encode categorical variables like town using one-hot encoding.

In [None]:
# Encode Categorical Variables

# Encode only while a raw 'town' column is still present. Once it has been
# expanded into town_* dummy columns, asking get_dummies for columns=['town']
# again raises KeyError, so this guard also makes the cell safe to re-run.
if 'town' in X.columns:
    X = pd.get_dummies(X, columns=['town'], drop_first=True)

# Display the first few rows of the feature set and target variable
X.head(), y.head()

# Split Data into Training and Test Sets
Split the data into training and test sets using train_test_split.

In [None]:
# Split Data into Training and Test Sets

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Train the Model
Train a RandomForestRegressor model using the training data.

In [None]:
# Train the Model

# Build a 100-tree random forest with a fixed seed and fit it on the
# training split (fit() returns the estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Progress message
print("Model eğitildi.")

# Evaluate the Model
Evaluate the model using mean squared error (MSE) and R² score.

In [None]:
# Evaluate the Model

# Score the held-out test set
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)  # Mean Squared Error
r2 = r2_score(y_test, predictions)  # R² score

# Report both metrics
print(f'Model Mean Squared Error: {mse}')
print(f'Model R² Score: {r2}')

# End points of the y = x reference line, spanning the observed target range
diagonal = [y_test.min(), y_test.max()]

# Figure 1: predictions vs actual values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.5)
ax.plot(diagonal, diagonal, 'k--', lw=2)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Rent Prices')
ax.grid(True)
plt.show()

# Figure 2: same scatter, annotated with the R² score and a legend
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.7, color='blue', label='Predictions')
ax.plot(diagonal, diagonal, 'r--', label='Perfect Fit (y=x)')
ax.set_title(f'Real vs Predicted Values (R²: {r2:.2f})', fontsize=14)
ax.set_xlabel('Real Values (y_test)', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Figure 3: the MSE as a single bar
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(['Mean Squared Error'], [mse])
ax.set_title('Model Mean Squared Error')
ax.set_ylabel('Error')
ax.grid(True)
plt.show()

# Prepare Data for Future Predictions
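Build a feature table for one sample listing (area 85, 3 rooms, town Büyükada) for each year from 2022 to 2025, predict its rent with the trained model, and plot the predictions.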

In [None]:
# Prepare Data for Future Predictions

# Years to predict for, 2022 through 2025 inclusive.
# A plain integer range is used on purpose: pd.date_range(start='2022',
# end='2025', freq='Y') is anchored on year ends, so it stops at 2024-12-31
# and never yields 2025 (and the 'Y' alias is deprecated in recent pandas).
future_years = pd.Index(range(2022, 2026))
n_years = len(future_years)

# One row per year for the same sample listing
future_data = pd.DataFrame({
    'area': [85] * n_years,
    'rooms': [3] * n_years,
    'year': future_years,
    'town_Büyükada': [1] * n_years,  # Example dummy value for a specific town
})

# Align with the training features in a single step: columns the model was
# trained on but that are absent here are added with 0, columns the model
# does not know are dropped, and the order is made identical to X.
# NOTE(review): if 'town_Büyükada' is not among X's columns it is dropped
# here without any message, and the rows then describe the baseline town.
future_data = future_data.reindex(columns=X.columns, fill_value=0)

# Display a message indicating that the future data is ready
print("Gelecek yıllar için veri hazırlandı.")

# NOTE(review): the input files listed in the loading cell are all dated 2022,
# so 'year' appears to be constant in the training data; a tree ensemble cannot
# learn a trend from a constant feature, so expect the same prediction for
# every year — confirm before interpreting this as a forecast.
future_predictions = model.predict(future_data)

# Display a message indicating that predictions have been made
print("Gelecek yıllar için tahminler yapıldı.")

# Plot the future predictions
# NOTE(review): the axis label says "Million TL", but the values are in the
# raw units of the 'price' column — confirm the unit.
plt.figure(figsize=(10, 6))
plt.plot(future_years, future_predictions, label='Predicted Prices')
plt.xlabel('Year')
plt.ylabel('Price (Million TL)')
plt.title('Predicted Rent Prices from 2022 to 2025')
plt.legend()
plt.grid(True)
plt.show()

# Plot Predictions vs Actual Values
Plot the predicted vs actual rent prices for the test set.

In [None]:
# Plot Predictions vs Actual Values

# Scatter of test-set targets against model outputs; the dashed black
# diagonal marks where prediction equals the actual value
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.5)
target_span = [y_test.min(), y_test.max()]
ax.plot(target_span, target_span, 'k--', lw=2)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Rent Prices')
ax.grid(True)
plt.show()

# Plot Future Predictions
Plot the predicted rent prices from 2022 to 2025.

In [None]:
# Plot Future Predictions

# Line chart of the model's prediction for each future year
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(future_years, future_predictions, label='Predicted Prices')
ax.set_xlabel('Year')
ax.set_ylabel('Price (Million TL)')
ax.set_title('Predicted Rent Prices from 2022 to 2025')
ax.legend()
ax.grid(True)
plt.show()

# Plot R² Graph
Plot the R² graph to visualize the model's performance.

In [None]:
# Plot R² Graph

# Real vs predicted scatter with the R² score in the title; the dashed red
# line is the perfect-fit diagonal y = x
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.7, color='blue', label='Predictions')
perfect_fit = [y_test.min(), y_test.max()]
ax.plot(perfect_fit, perfect_fit, 'r--', label='Perfect Fit (y=x)')
ax.set_title(f'Real vs Predicted Values (R²: {r2:.2f})', fontsize=14)
ax.set_xlabel('Real Values (y_test)', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Plot MSE Graph
Plot the mean squared error graph to visualize the model's error.

In [None]:
# Plot MSE Graph

# Single-bar chart showing the model's Mean Squared Error on the test set
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(['Mean Squared Error'], [mse])
ax.set_title('Model Mean Squared Error')
ax.set_ylabel('Error')
ax.grid(True)
plt.show()