# In [152]:
import pandas as pd
import csv
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

def data_prep(start_year=2005, end_year=2024, p_price=False, valid=False):
    '''
    Prepare the real-estate dataset for the machine learning/deep learning models.

    Steps:
    1. Load the CSV (validation file when ``valid`` is truthy, otherwise the
       training file) and drop duplicate rows.
    2. Keep only rows inside the year / price / size / build-year ranges.
    3. Feature engineering: add ``Age`` (2023 - ``Build_year``).
    4. Optionally replace ``Price`` with ``Predicted_Price`` and drop ``Year``.
    5. Drop helper columns, non-numeric columns and columns containing NaN.
    6. Keep the known feature columns in a fixed order.
    7. (training only) Remove IQR outliers, split 80/20 and min-max scale.

    Parameters:
        start_year: first ``Year`` kept (inclusive).
        end_year: first ``Year`` excluded (exclusive upper bound).
        p_price: when truthy, use ``Predicted_Price`` as the target ``Price``
            and drop the ``Year`` feature.
        valid: when truthy, load the validation file and return the cleaned
            DataFrame without outlier removal, splitting or scaling.

    Returns:
        If ``valid`` is truthy: the cleaned ``DataFrame``.
        Otherwise the tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test)``
        where the scaled arrays come from a ``MinMaxScaler`` fitted on
        ``X_train`` only.
    '''
    # Load the dataset
    print("Prep")
    csv_path = "../Data/Test2.csv" if valid else "../Data/Nadlan_clean.csv"
    df = pd.read_csv(csv_path, index_col=0)

    df = df.drop_duplicates()
    print(df.shape)

    df['Year'] = df['Year'].astype(np.int32)
    df['Size'] = df['Size'].astype(np.int32)

    # Remove rows outside the specified time range and the hard-coded
    # plausibility ranges for price, size and construction year.
    keep = (
        (df['Year'] >= start_year) & (df['Year'] < end_year)
        & (df['Price'] > 800000) & (df['Price'] < 12000000)
        & (df['Size'] < 400) & (df['Size'] > 25)
        & (df['Build_year'] > 1910)
    )
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of the loaded data.
    df = df[keep].copy()

    # Feature engineering: age is measured against a fixed reference year.
    df['Age'] = 2023 - df['Build_year']

    if p_price:
        df['Price'] = df['Predicted_Price']
        df = df.drop(columns=['Year'])

    df = df.dropna(subset=['Long', 'Lat', 'Rooms', 'Floor', 'Floors']).copy()

    # NOTE(review): casting Lat/Long to int32 truncates any fractional part.
    # That is harmless for projected (metre-based) coordinates but would
    # destroy decimal-degree coordinates -- confirm the coordinate system.
    for col in ('Floors', 'Floor', 'Lat', 'Long', 'Rooms'):
        df[col] = df[col].astype(np.int32)

    # Helper / identifier columns that must not reach the model. Columns that
    # are absent from the file are skipped silently.
    unused_columns = ['Home_number', 'Rebuilt', 'Gush', 'Helka', 'Tat',
                      'Percentage_Change', 'Predicted_Price', 'Neighborhood',
                      'street_id']
    df = df.drop(columns=unused_columns, errors='ignore')

    # Drop every column that is non-numeric or still contains missing values.
    non_numeric_cols = set(df.select_dtypes(exclude=['number']).columns)
    na_cols = set(df.columns[df.isna().any()])
    df = df.drop(columns=list(non_numeric_cols | na_cols))

    # Fix the feature order. Only columns that survived the steps above are
    # selected: re-creating a removed column (e.g. 'Year' when p_price is set)
    # would add an all-NaN column, and the later NaN/outlier filtering would
    # then discard every row.
    feature_order = ["Rooms", "Floor", "Size", "Price", "Build_year", "Floors",
                     "Long", "Lat", "Year", "Distance_sea", "Train", 'Age']
    df = df[[col for col in feature_order if col in df.columns]].copy()

    if valid:
        # For the validation data: return the cleaned frame as-is.
        return df.dropna()

    df = _drop_iqr_outliers(df)

    y = df['Price']
    X = df.drop('Price', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Fit the scaler on the training split only so no information from the
    # test split leaks into the scaling parameters.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test


def _drop_iqr_outliers(df):
    '''
    Remove rows lying outside 1.5 * IQR of the quartiles, column by column.

    The filter is applied sequentially, so the quartiles of each column are
    computed on the rows that survived the previous columns.
    '''
    for col in df.columns:
        q1, q3 = np.percentile(df[col], [25, 75])
        iqr = q3 - q1
        if iqr == 0:
            # With a zero IQR both bounds equal q1, and the strict
            # comparisons below would remove every row; skip the column.
            continue
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
    return df