In [None]:
import pandas as pd


# Read the raw CSV into a DataFrame.
data = pd.read_csv('Ireland House Price Final.csv')

# Schema overview: DataFrame.info() writes its report to stdout by itself.
print("Basic Info:")
data.info()

# Preview of the leading rows.
print("\nFirst 5 Rows:")
print(data.head(5))

# Per-column count of missing entries.
print("\nMissing Values Count:")
print(data.isna().sum())

# Descriptive statistics for the numerical columns.
print("\nSummary Statistics for Numerical Columns:")
print(data.describe())


In [None]:
#IN THIS CODE I WILL BE USING PLOTS TO PLOT FOR NUMERICAL AND CATEGORICAL COLUMNS

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Coerce total_sqft to numbers so it can be plotted; entries that cannot be
# parsed as a number become NaN (errors='coerce').
data['total_sqft'] = pd.to_numeric(data['total_sqft'], errors='coerce')

# 1. Histogram with a KDE overlay for each numerical column.
numerical_columns = ['total_sqft', 'bath', 'balcony', 'price-per-sqft-$']
for num_col in numerical_columns:
    observed = data[num_col].dropna()
    plt.figure(figsize=(8, 4))
    sns.histplot(observed, bins=30, kde=True)
    plt.title(f'Distribution of {num_col}')
    plt.xlabel(num_col)
    plt.ylabel('Count')
    plt.show()

# 2. Horizontal count plot for each categorical column, bars ordered by
#    descending frequency.
categorical_columns = ['property_scope', 'location', 'size', 'buying or not buying', 'BER', 'Renovation needed']
for cat_col in categorical_columns:
    by_frequency = data[cat_col].value_counts().index
    plt.figure(figsize=(10, 5))
    sns.countplot(data=data, y=cat_col, order=by_frequency)
    plt.title(f'Count Plot for {cat_col}')
    plt.xlabel('Count')
    plt.ylabel(cat_col)
    plt.show()


In [None]:
#IN THIS CODE I WILL BE PERFORMING CORRELATIONS BETWEEN NUMERICAL VARIABLES BEFORE STARTING WITH FURTHER ANALYSIS 

import seaborn as sns
import matplotlib.pyplot as plt


# Pairwise correlation of the numerical columns, drawn as an annotated heatmap.
numerical_cols = ['total_sqft', 'bath', 'balcony', 'price-per-sqft-$']
correlation_matrix = data.loc[:, numerical_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()



In [None]:
#IN THIS PART I HAVE STANDARDIZED ALL THE COLUMNS OF THE DATASET
import pandas as pd
import re
import numpy as np
# Re-read the raw CSV so the cleaning steps below start from the file contents.
data = pd.read_csv('Ireland House Price Final.csv')

# Step 1: record which columns pandas parsed as float64/int64 on load.
numerical_columns = list(data.select_dtypes(include=['float64', 'int64']).columns)

def convert_sqft_to_num(sqft):
    """Convert a ``total_sqft`` entry to a number.

    A string containing ``'-'`` is treated as a ``'low - high'`` range and
    replaced by the midpoint of its first two parts; any other string is
    parsed with ``float``.  A string that cannot be parsed -- including a
    malformed range -- yields ``None`` instead of raising, so the function
    can be used with ``Series.apply`` on dirty data.

    Non-string inputs are returned unchanged.
    """
    if not isinstance(sqft, str):
        return sqft
    try:
        if '-' in sqft:
            bounds = sqft.split('-')
            return (float(bounds[0]) + float(bounds[1])) / 2
        return float(sqft)
    except ValueError:
        # Same "unparseable" signal for plain values and ranges alike.
        return None


# Reduce a free-text 'size' entry to its number using a regular expression.
def extract_numeric_size(value):
    """Return the first number found in ``value`` as a float, or ``None``.

    ``value`` is converted with ``str`` first, so strings, numbers and
    missing values are all accepted.  Only the first run of digits (with an
    optional decimal part) is used, which keeps already-numeric input intact:
    ``3.0`` stays ``3.0`` rather than being read as the digits "30".

    Returns ``None`` when the text contains no digits (e.g. NaN or None).
    """
    found = re.search(r'\d+(?:\.\d+)?', str(value))
    return float(found.group()) if found else None
    
# Replace each free-text 'size' entry with the number extracted from it.
data['size'] = data['size'].apply(extract_numeric_size)

# Normalise 'availability' in one pass: trim surrounding whitespace,
# lower-case, then fold "immediate possession" into "ready to move".
data['availability'] = (
    data['availability']
    .str.strip()
    .str.lower()
    .replace({'immediate possession': 'ready to move'})
)

# Step 3: replace day-month entries such as '18-apr' in the availability
# column with a readable "Available from <Month>" label.
def handle_date_entries(value):
    """Turn a day-month string into ``"Available from <Mon>"``.

    A string that starts with one or two digits, a hyphen and three word
    characters (e.g. ``'18-apr'``) is replaced by ``"Available from "``
    followed by those three characters, capitalised.  Every other value,
    including non-strings, is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    found = re.match(r'(\d{1,2})-(\w{3})', value)
    if found is None:
        return value
    month_abbrev = found.group(2)
    return f"Available from {month_abbrev.capitalize()}"

# Rewrite the date-like 'availability' entries element by element.
data['availability'] = data['availability'].map(handle_date_entries)

# Conversion factors from each supported area unit to square feet.
conversion_factors = {
    'Acres': 43560,
    'Sq. Meter': 10.7639,
    'Sq. Yards': 9,
    'Cents': 435.6,
    'Grounds': 2400,
    'Guntha': 1089,
    'Perch': 272.25
}


def convert_to_sqft(value):
    """Convert a ``total_sqft`` entry to a number of square feet.

    * A non-string value is returned unchanged.
    * A string containing ``'-'`` is treated as a ``'low - high'`` range
      (e.g. ``'1000 - 1285'``); each end is converted with
      ``convert_single_value`` and the mean of the two is returned.  A
      string that does not split into exactly two parts yields NaN.
    * Any other string is converted with ``convert_single_value``.

    The unit text is deliberately left in place until
    ``convert_single_value`` has read it; removing it earlier would make
    every entry look like plain square feet.
    """
    if not isinstance(value, str):
        return value

    if '-' in value:
        ends = value.split('-')
        if len(ends) != 2:
            return float('nan')
        low, high = (convert_single_value(end) for end in ends)
        return (low + high) / 2

    return convert_single_value(value)


def convert_single_value(value):
    """Convert one ``'<number>[<unit>]'`` string to square feet.

    The string must start with a number (surrounding whitespace is
    ignored).  If nothing follows the number it is taken to be square feet
    already.  Otherwise the trailing text is compared, case-insensitively,
    with the keys of ``conversion_factors`` and the number is multiplied by
    the first factor whose unit name occurs in it.

    Returns NaN when the string has no leading number or names a unit that
    is not in ``conversion_factors``, so a bad entry shows up as a missing
    value rather than as a silently wrong area.
    """
    parsed = re.fullmatch(r'\s*(\d+\.?\d*|\.\d+)\s*(.*?)\s*', value)
    if parsed is None:
        return float('nan')

    number = float(parsed.group(1))
    unit_text = parsed.group(2).lower()
    if not unit_text:
        # No unit given: assume the number is already in square feet.
        return number

    for unit, factor in conversion_factors.items():
        if unit.lower() in unit_text:
            return number * factor
    return float('nan')

# Convert every 'total_sqft' entry to a number of square feet.
data['total_sqft'] = data['total_sqft'].apply(convert_to_sqft)

# Re-coerce the columns that were numeric on load; anything that cannot be
# parsed becomes NaN.
data[numerical_columns] = data[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Step 2: clean categorical (object-dtype) columns by trimming surrounding
# whitespace from every string.
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
data[categorical_columns] = data[categorical_columns].apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

# Keep prices at two decimal places.
data['price-per-sqft-$'] = data['price-per-sqft-$'].round(2)

# Step 3: display the cleaned data.
print("\nCleaned Data:")
print(data.head())

# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than wrapped in print().
print("\nData Info After Cleaning:")
data.info()

# Persist the standardized dataset for the next stage. to_csv() returns None
# when given a path, so its result is not kept.
data.to_csv('standardize.csv', index=False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the standardized dataset written by the cleaning stage.
data = pd.read_csv('standardize.csv')

# Columns to plot. Defined here so this cell does not depend on a name that
# is only assigned in a later cell.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

print("\nData before outlier removal:")

print("\nBox plot for numeric columns to detect outliers before excluding to fill missing values:")
for col in numeric_columns:
    plt.figure(figsize=(8, 4))
    # Pass the series as x= so the call does not rely on positional-argument
    # handling, which differs between seaborn versions.
    sns.boxplot(x=data[col], orient='h')
    plt.title(f'Box Plot of {col} (initial outliers in the dataset)')
    plt.xlabel(col)
    plt.show()
    
# Keep only the values of a series that lie within 1.5 * IQR of its quartiles.
def remove_outliers(series):
    """Return the entries of ``series`` inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``series``.
    Surviving entries keep their original index labels.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    inside = series.between(low_fence, high_fence)
    return series[inside]

# Split the columns by dtype: float64/int64 versus object.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Numerical columns: impute with a statistic computed on the observed values
# after dropping IQR outliers, so extreme values do not distort the fill.
for col in numerical_cols:
    if not data[col].isnull().any():
        continue  # nothing to fill in this column
    trimmed = remove_outliers(data[col].dropna())
    # The price column is filled with the median; every other column with the mean.
    if col == 'price-per-sqft-$':
        fill_value = trimmed.median()
    else:
        fill_value = trimmed.mean()
    data[col] = data[col].fillna(fill_value)

# Categorical columns: impute with the most frequent value.
for col in categorical_cols:
    if not data[col].isnull().any():
        continue
    most_common = data[col].mode()[0]
    data[col] = data[col].fillna(most_common)

# Persist the imputed dataset for the following stages.
data.to_csv('missing.csv', index=False)

# Confirm that no missing values remain.
print("Missing values after filling:")
print(data.isnull().sum())


In [None]:
#FEATURE ENGINEERING
import pandas as pd
import numpy as np

# Start from the imputed dataset.
data = pd.read_csv('missing.csv')

# 1. Derive the total house price (area times price per square foot) unless
#    the column already exists.
if 'total_price' not in data.columns:
    data['total_price'] = data['total_sqft'].mul(data['price-per-sqft-$'])

# 2. Persist the dataset including the derived column.
data.to_csv('feature_engineering.csv', index=False)

# 3. Report and preview the result.
print("New features have been created and saved to 'feature_engineering.csv'.")
print(data.head())




In [None]:
# Remove outliers (IQR method) from the dataset whose missing values were already filled, then re-draw the box plots
import pandas as pd

# Outlier removal starts from the imputed dataset.
data = pd.read_csv('missing.csv')

# Names of all float64/int64 columns, as a plain list.
numeric_columns = list(data.select_dtypes(include=['float64', 'int64']).columns)

# Drop the rows of a frame whose value in one column is an IQR outlier.
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both
    inclusive), computed from ``df[column]``.  ``df`` itself is not
    modified; the result is a filtered selection of it.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    keep = values.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return df[keep]

# Apply the IQR filter one numeric column at a time; each pass works on the
# rows that survived the previous passes.
for col in numeric_columns:
    print(f"Removing outliers in column: {col}")
    data = remove_outliers_iqr(data, col)

# Store the count-like columns as integers (unparseable or missing -> 0).
for count_col in ('balcony', 'bath', 'size'):
    data[count_col] = pd.to_numeric(data[count_col], errors='coerce').fillna(0).astype(int)

# Summary statistics of what is left.
print("\nData after outlier removal:")
print(data.describe())

# One horizontal box plot per numeric column to check the result visually.
print("\nBox plot for numeric columns after removing outliers:")
for col in numeric_columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data[col], orient='h')
    plt.title(f'Box Plot of {col} (After Outlier Removal)')
    plt.xlabel(col)
    plt.show()

# Persist the filtered dataset for the modelling cells.
data.to_csv('outliers.csv', index=False)
print("Cleaned data saved to 'outliers.csv'")


In [None]:
#pearson correlation 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Work on the dataset produced by the outlier-removal stage.
data = pd.read_csv('outliers.csv')

# Pearson correlation between every pair of float64/int64 columns.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
correlation_matrix = data[numeric_columns].corr(method='pearson')

# Show the matrix as text ...
print("Pearson Correlation Matrix:")
print(correlation_matrix)

# ... and as an annotated heatmap (two decimals per cell, with colour bar).
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', cbar=True)
plt.title("Pearson Correlation Heatmap")
plt.show()


In [None]:
#predictive analysis techniques used :Random forest, linear regression, SVM to check accuracy
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Work on the dataset produced by the outlier-removal stage.
data = pd.read_csv('outliers.csv')

# Target column and the pool of float64/int64 columns it is predicted from.
target_col = 'price-per-sqft-$'
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Features are every numeric column except the target.
# NOTE(review): this keeps any numeric identifier column (e.g. 'ID') as a
# feature -- confirm that is intended.
X = data[numerical_cols].drop(columns=[target_col])
y = data[target_col]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------- 1. Random Forest Regressor ------------------
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
rf_predictions = rf_model.predict(X_test_scaled)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

# ----------------- 2. Linear Regression ------------------
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_predictions = lr_model.predict(X_test_scaled)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))

# ----------------- 3. Support Vector Machine (SVM) ------------------
svm_model = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X_train_scaled, y_train)
svm_predictions = svm_model.predict(X_test_scaled)
svm_rmse = np.sqrt(mean_squared_error(y_test, svm_predictions))

# ----------------- MAPE Calculation ------------------

def calculate_mape(y_true, y_pred):
    """Mean absolute percentage error of ``y_pred`` against ``y_true``, in percent.

    Both inputs are converted to float arrays and compared position by
    position.  Observations whose true value is zero are left out, because
    their percentage error is undefined (division by zero).

    Returns NaN when no observation has a non-zero true value.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Percentage error of each model on the held-out test rows.
rf_mape = calculate_mape(y_test, rf_predictions)
lr_mape = calculate_mape(y_test, lr_predictions)
svm_mape = calculate_mape(y_test, svm_predictions)

# Report RMSE first ...
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"SVM RMSE: {svm_rmse}")

# ... then "accuracy", defined here as 100 minus MAPE.
print(f"Random Forest Accuracy (MAPE): {100 - rf_mape:.2f}%")
print(f"Linear Regression Accuracy (MAPE): {100 - lr_mape:.2f}%")
print(f"SVM Accuracy (MAPE): {100 - svm_mape:.2f}%")

# ----------------- Save Predictions ------------------
# Score every row (training and test rows alike) with each fitted model.
X_all_scaled = scaler.transform(X)
data['rf_predicted_price'] = rf_model.predict(X_all_scaled)
data['lr_predicted_price'] = lr_model.predict(X_all_scaled)
data['svm_predicted_price'] = svm_model.predict(X_all_scaled)

data.to_csv('predicted_prices_with_svm.csv', index=False)

# Preview the actual price next to the three predictions.
print("\nData with predictions:")
preview_columns = ['ID', 'price-per-sqft-$', 'rf_predicted_price', 'lr_predicted_price', 'svm_predicted_price']
print(data[preview_columns].head())

# ----------------- Plot Accuracy Comparison ------------------
# Horizontal bar chart of the accuracy (100 - MAPE) of each model.
models = ['Random Forest', 'Linear Regression', 'SVM']
accuracies = [100 - rf_mape, 100 - lr_mape, 100 - svm_mape]

plt.figure(figsize=(10, 6))
plt.barh(models, accuracies, color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy (%)')
plt.xlim(0, 100)
plt.show()


In [None]:
# Predictive techniques used: random forest, linear regression and SVM, with PCA and hyperparameter search added to improve model accuracy
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Work on the dataset produced by the outlier-removal stage.
data = pd.read_csv('outliers.csv')

# Target column and the pool of float64/int64 columns it is predicted from.
target_col = 'price-per-sqft-$'
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Features are every numeric column except the target; the target is
# modelled on a log(1 + y) scale and mapped back with expm1 after prediction.
X = data[numerical_cols].drop(columns=[target_col])
y = np.log1p(data[target_col])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Project the scaled features onto 5 principal components.
# NOTE(review): n_components=5 needs at least 5 feature columns in X -- confirm.
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# ----------------- Random Forest with Reduced Hyperparameter Grid ------------------
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
}

# Sample 5 settings from the grid, scored by 2-fold cross-validated MSE.
rf_model = RandomizedSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_distributions=param_grid_rf,
    n_iter=5,
    cv=2,
    scoring='neg_mean_squared_error',
    random_state=42,
).fit(X_train_pca, y_train)
rf_best = rf_model.best_estimator_
rf_predictions = np.expm1(rf_best.predict(X_test_pca))

# ----------------- Linear Regression ------------------
lr_model = LinearRegression().fit(X_train_pca, y_train)
lr_predictions = np.expm1(lr_model.predict(X_test_pca))

# ----------------- SVM with Reduced Hyperparameter Grid ------------------
param_grid_svm = {
    'C': [0.1, 1],
    'kernel': ['linear'],
}

# Both grid points are tried (n_iter=2), scored by 2-fold cross-validated MSE.
svm_model = RandomizedSearchCV(
    SVR(),
    param_distributions=param_grid_svm,
    n_iter=2,
    cv=2,
    scoring='neg_mean_squared_error',
    random_state=42,
).fit(X_train_pca, y_train)
svm_best = svm_model.best_estimator_
svm_predictions = np.expm1(svm_best.predict(X_test_pca))

# ----------------- Evaluation ------------------
def evaluate_model(y_true, y_pred, model_name):
    """Print and return the RMSE and MAPE-based accuracy of one model.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted target values, passed straight to
        ``mean_squared_error`` and ``mean_absolute_percentage_error``.
    model_name : str
        Label used as the prefix of the two printed lines.

    Returns
    -------
    tuple
        ``(rmse, accuracy)`` where ``accuracy`` is ``100 - MAPE`` with MAPE
        expressed in percent.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(squared_error)
    mape = 100 * mean_absolute_percentage_error(y_true, y_pred)
    accuracy = 100 - mape
    print(f"{model_name} RMSE: {rmse:.2f}")
    print(f"{model_name} Accuracy: {accuracy:.2f}%")
    return rmse, accuracy

# Undo the log1p transform so errors are measured on the original price scale.
y_test_exp = np.expm1(y_test)

rf_rmse, rf_accuracy = evaluate_model(y_test_exp, rf_predictions, "Random Forest")
lr_rmse, lr_accuracy = evaluate_model(y_test_exp, lr_predictions, "Linear Regression")
svm_rmse, svm_accuracy = evaluate_model(y_test_exp, svm_predictions, "SVM")

# ----------------- Save Predictions ------------------
# Score every row (training and test rows alike): scale, project with PCA,
# predict on the log scale, then map back with expm1.
X_all_pca = pca.transform(scaler.transform(X))
data['rf_predicted_price'] = np.expm1(rf_best.predict(X_all_pca))
data['lr_predicted_price'] = np.expm1(lr_model.predict(X_all_pca))
data['svm_predicted_price'] = np.expm1(svm_best.predict(X_all_pca))

data.to_csv('optimized_predictions_fast.csv', index=False)

print("\nOptimized predictions saved to 'optimized_predictions_fast.csv'.")

# ----------------- Plot Accuracy Comparison ------------------
# Horizontal bar chart of the accuracy (100 - MAPE) of each model.
models = ['Random Forest', 'Linear Regression', 'SVM']
accuracies = [rf_accuracy, lr_accuracy, svm_accuracy]

plt.figure(figsize=(10, 6))
plt.barh(models, accuracies, color=['blue', 'green', 'red'])
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy (%)')
plt.xlim(0, 100)
plt.show()