In [11]:


import numpy as np
import pandas as pd
import category_encoders as ce
import warnings

# Ignore all warnings for the rest of the kernel session.
# NOTE(review): no category/module is given, so this filter is not limited to
# any one library — deprecation and chained-assignment warnings raised by the
# code below are hidden as well. Consider narrowing it.
warnings.filterwarnings("ignore")


def preprocess(train_data_path: str, test_data_path: str):
    """Load the train/test CSVs, engineer features and return the training set.

    Both files are stacked into one frame so that the imputation medians and
    the title ranking are computed over all rows; only the training rows are
    returned.

    Args:
        train_data_path: Path to the training CSV. The code reads the columns
            ``ID``, ``loc``, ``title``, ``bedroom``, ``bathroom``,
            ``parking_space`` and ``price`` from it.
        test_data_path: Path to the test CSV. Its rows take part in the
            medians and in the set of known titles; a ``price`` column is not
            needed (it becomes NaN for these rows after stacking).

    Returns:
        A tuple ``(X, y)``: ``X`` is the training feature frame (everything
        except ``ID``, ``price``, ``loc``, ``title`` and the internal ``ind``
        flag) and ``y`` is the training ``price`` Series.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    df_train = pd.read_csv(train_data_path)
    df_test = pd.read_csv(test_data_path)

    # Flag the origin of every row so the training rows can be recovered later.
    df_train["ind"] = 1
    df_test["ind"] = 0

    # ignore_index gives the stacked frame unique row labels; without it the
    # train and test labels overlap and label-based alignment is ambiguous.
    combined_df = pd.concat([df_train, df_test], ignore_index=True)

    # Discard rows that lack a location or a title (rows, not columns).
    combined_df = combined_df.dropna(subset=["loc", "title"]).reset_index(drop=True)

    # Impute the numeric columns with their median over train + test rows.
    # Plain assignment is used instead of `df[col].fillna(..., inplace=True)`:
    # that form works on a temporary and is a no-op under Copy-on-Write.
    for column in ("bedroom", "bathroom", "parking_space"):
        combined_df[column] = combined_df[column].fillna(combined_df[column].median())

    # --- feature engineering -------------------------------------------------
    combined_df["is_lagos"] = (combined_df["loc"] == "Lagos").astype("int64")
    combined_df["is_mansion"] = (combined_df["title"] == "Mansion").astype("int64")

    # NOTE(review): a row with bathroom == 0 produces inf (or NaN for 0 / 0)
    # here; this matches the original behaviour and is not masked.
    combined_df["comfort_ind"] = combined_df["bedroom"] / combined_df["bathroom"]
    combined_df["size"] = (
        combined_df["bedroom"] + combined_df["bathroom"] + combined_df["parking_space"]
    )
    combined_df["comfort_by_size"] = combined_df["comfort_ind"] * combined_df["size"]

    # Hand-assigned density level per state; any location not listed gets 0.
    states_by_density_level = {
        10: ["Anambra", "Enugu", "Imo", "Lagos"],
        9: ["Abia", "Kano", "Rivers"],
        8: ["Akwa Ibom", "Ebonyi", "Ekiti", "Osun"],
        7: ["Katsina", "Ogun", "Ondo"],
        6: ["Bauchi", "Delta", "Jigawa", "Oyo"],
        5: ["Bayelsa", "Edo", "Gombe"],
        4: ["Cross River", "Kaduna", "Plateau", "Sokoto"],
        3: ["Kebbi", "Kogi", "Zamfara"],
        2: ["Adamawa", "Benue", "Kwara", "Nasarawa"],
        1: ["Borno", "Niger", "Taraba", "Yobe"],
    }
    density_level_by_state = {
        state: level
        for level, states in states_by_density_level.items()
        for state in states
    }
    combined_df["population_density_level"] = (
        combined_df["loc"].map(density_level_by_state).fillna(0).astype("int64")
    )

    # --- target-encode `loc` -------------------------------------------------
    is_train = combined_df["ind"] == 1

    # Fit on the training rows only: the test rows carry NaN prices after
    # stacking and must not take part in learning the encoding.
    encoder = ce.TargetEncoder(cols=["loc"])
    encoder.fit(combined_df.loc[is_train, ["loc"]], combined_df.loc[is_train, "price"])

    # Rescale the encoded value by 10,000,000 and keep two decimals.
    loc_encoded = encoder.transform(combined_df[["loc"]])["loc"]
    combined_df["loc_encoded"] = (loc_encoded / 10_000_000).round(2)

    # --- rank titles by mean price -------------------------------------------
    # mean() skips the NaN prices of the test rows; titles without any price
    # sort last and therefore receive the highest ranks.
    title_average_price = combined_df.groupby("title")["price"].mean().sort_values()
    title_rank = pd.Series(
        range(1, len(title_average_price) + 1), index=title_average_price.index
    )
    combined_df["title_rank"] = combined_df["title"].map(title_rank)

    # The raw categorical columns are replaced by the features derived above.
    combined_df = combined_df.drop(columns=["loc", "title"])

    # Keep the training rows; drop() returns a new frame, so no slice of
    # combined_df is mutated in place.
    train = combined_df.loc[is_train].drop(columns="ind")

    X = train.drop(columns=["ID", "price"])
    y = train["price"]

    return X, y




In [12]:
# Build the training feature frame X and the price target y from the two CSV files.
X, y = preprocess("data/Housing_dataset_train.csv", "data/Housing_dataset_test.csv")


In [13]:
X


Unnamed: 0,bedroom,bathroom,parking_space,is_lagos,is_mansion,comfort_ind,size,comfort_by_size,population_density_level,loc_encoded,title_rank
0,2.0,2.0,1.0,0,0,1.000000,5.0,5.000000,7,0.20,5
1,4.0,2.0,4.0,0,0,2.000000,10.0,20.000000,7,0.23,2
2,5.0,2.0,4.0,0,0,2.500000,11.0,27.500000,10,0.24,8
3,4.0,5.0,6.0,0,0,0.800000,15.0,12.000000,3,0.18,6
4,4.0,1.0,3.0,0,1,4.000000,8.0,32.000000,1,0.17,10
...,...,...,...,...,...,...,...,...,...,...,...
10526,8.0,1.0,6.0,0,0,8.000000,15.0,120.000000,1,0.19,8
10527,4.0,7.0,3.0,0,0,0.571429,14.0,8.000000,5,0.23,4
10528,4.0,7.0,5.0,0,0,0.571429,16.0,9.142857,4,0.19,2
10529,8.0,6.0,5.0,0,0,1.333333,19.0,25.333333,4,0.19,4


In [15]:
y

0        1149999.565
1        1672416.689
2        2410306.756
3        2600700.898
4        1341750.867
            ...     
10526    2837199.086
10527    2367927.861
10528    2228516.471
10529    2406812.693
10530    3348918.718
Name: price, Length: 10531, dtype: float64