# Project: Predicting Apartment Prices in Mexico City

## Import Libraries

In [None]:
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import VimeoVideo
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

## Prepare Data

## Import

The `wrangle` function takes the name of a CSV file as input and returns a DataFrame. The function performs the following steps:

1. Subset the data in the CSV file and return only apartments in Mexico City (`"Distrito Federal"`) that cost less than \$100,000.
2. Remove outliers by trimming the bottom and top 10\% of properties in terms of `"surface_covered_in_m2"`.
3. Create separate `"lat"` and `"lon"` columns.
4. Mexico City is divided into [16 boroughs](https://en.wikipedia.org/wiki/Boroughs_of_Mexico_City). Create a `"borough"` feature from the `"place_with_parent_names"` column.
5. Drop columns that are more than 50\% null values.
6. Drop columns containing low- or high-cardinality categorical values. 
7. Drop any columns that would constitute leakage for the target `"price_aprox_usd"`.
8. Drop any columns that would create issues of multicollinearity. 

In [None]:
# Build `wrangle` function
def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned, model-ready DataFrame.

    The cleaning steps are:

    1. Keep apartments whose ``place_with_parent_names`` mentions
       "Distrito Federal" and whose ``price_aprox_usd`` is below 100,000.
    2. Keep rows whose ``surface_covered_in_m2`` lies between the 10th and
       90th percentiles (inclusive) of the remaining rows.
    3. Split ``lat-lon`` into float ``lat`` and ``lon`` columns.
    4. Derive ``borough`` from the second ``|``-separated field of
       ``place_with_parent_names``.
    5. Drop the source columns used above plus the mostly-null,
       low/high-cardinality, leaky and collinear columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with the remaining original columns followed by
        ``lat``, ``lon`` and ``borough``.

    Raises
    ------
    KeyError
        If the CSV lacks any of the columns referenced here.
    """
    df = pd.read_csv(filepath)

    # Subset data: apartments in "Distrito Federal", less than 100,000 USD.
    # `na=False` makes rows with a missing place name count as non-matches
    # instead of putting NaN into the boolean mask.
    mask_city = df["place_with_parent_names"].str.contains(
        "Distrito Federal", na=False
    )
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_city & mask_apt & mask_price]

    # Subset data: remove outliers for "surface_covered_in_m2".
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a slice of the frame that was read.
    df = df[mask_area].copy()

    # Split "lat-lon" column into numeric coordinates.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Get borough name (second field of the "|"-delimited place string).
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    columns_to_drop = [
        # Source columns that have been replaced by derived features
        "lat-lon",
        "place_with_parent_names",
        # Features with a high share of null values
        "floor",
        "expenses",
        # Low- and high-cardinality categorical variables
        "operation",
        "property_type",
        "currency",
        "properati_url",
        # Leaky columns (other expressions of the price)
        "price",
        "price_aprox_local_currency",
        "price_usd_per_m2",
        "price_per_m2",
        # Columns dropped to avoid multicollinearity
        "surface_total_in_m2",
        "rooms",
    ]
    return df.drop(columns=columns_to_drop)

In [None]:
## Use this cell to test your wrangle function and explore the data
#df.info()

In [None]:
# Use glob to create the list `files`. It contains the filenames of all the
# Mexico City real estate CSVs in the `./data` directory; the pattern does not
# match `mexico-city-test-features.csv`.
# `glob` returns matches in arbitrary order, so sort them to keep the row order
# of the combined DataFrame reproducible across runs and machines.
files = sorted(glob("data/mexico-city-real-estate-*.csv"))
files

In [None]:
# Combine the `wrangle` function, a list comprehension, and `pd.concat` to
# create the DataFrame `df`. It contains the cleaned properties from every CSV
# listed in `files`.
frames = [wrangle(file) for file in files]
df = pd.concat(frames, ignore_index=True)
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in `print` would add a stray "None" line.
df.info()
df.head()

## Explore

In [None]:
# histogram showing the distribution of apartment prices (`"price_aprox_usd"`) in `df`.

In [None]:
# Fraction of missing values per column (mean of the boolean null mask)
df.isna().mean()

In [None]:
# Number of distinct values in each object-typed (categorical) column
df.select_dtypes(include="object").nunique()

In [None]:
# Pairwise correlation between the numeric features (target excluded)
corr = (
    df.select_dtypes(include="number")
    .drop("price_aprox_usd", axis="columns")
    .corr()
)
sns.heatmap(data=corr)
# Highly correlated features (e.g. area, rooms) should be dropped

In [None]:
# Plot distribution of price
plt.hist(df["price_aprox_usd"])
# The histogram bins prices, so the x-axis is labelled in USD (not area).
plt.xlabel("Price [USD]")
plt.ylabel("Count")
plt.title("Distribution of Apartment Prices");

In [None]:
# Scatter plot of apartment price ("price_aprox_usd") as a function of
# apartment size ("surface_covered_in_m2")
plt.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])
plt.title("Mexico City: Price vs Area")
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]");

In [None]:
# Mapbox scatter plot of apartment locations, with price encoded as color
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    color="price_aprox_usd",
    hover_data=["price_aprox_usd"],  # show the price on hover
    width=600,
    height=600,
)

# Use the OpenStreetMap tiles as the base map
fig.update_layout(mapbox_style="open-street-map")

fig.show()

## Split

In [None]:
# Separate the data into the feature matrix `X_train` and the target
# vector `y_train`.

features = ["surface_covered_in_m2", "lat", "lon", "borough"]
target = "price_aprox_usd"
X_train = df[features]
y_train = df[target]

## Build Model

### Baseline

In [None]:
# Baseline model: predict the mean training price for every apartment
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

### Iterate

In [None]:
# Assemble the pipeline: one-hot encode categoricals, impute missing values,
# then fit a ridge regression. The step names are the ones `make_pipeline`
# would generate (lower-cased class names).
model = Pipeline(
    steps=[
        ("onehotencoder", OneHotEncoder(use_cat_names=True)),
        ("simpleimputer", SimpleImputer()),
        ("ridge", Ridge()),
    ]
)
# Train on the full training set
model.fit(X_train, y_train)

## Evaluate

In [None]:
# Load the held-out test features
X_test = pd.read_csv("data/mexico-city-test-features.csv")
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly; wrapping it in `print` would add a stray "None" line.
X_test.info()
X_test.head()

In [None]:
# Predict prices for the test features and wrap them in a Series
y_test_pred = pd.Series(data=model.predict(X_test))
y_test_pred.head(n=5)

## Communicate Results

In [None]:
# Pull the fitted parameters out of the pipeline's ridge step
intercept = model.named_steps["ridge"].intercept_
coefficients = model.named_steps["ridge"].coef_
# Column names produced by the encoder step, used to label the coefficients.
# NOTE(review): newer category_encoders releases expose these through
# `get_feature_names_out`; fall back to the legacy `get_feature_names` when
# that method is absent - confirm against the installed version.
encoder = model.named_steps["onehotencoder"]
if hasattr(encoder, "get_feature_names_out"):
    features = encoder.get_feature_names_out()
else:
    features = encoder.get_feature_names()
feat_imp = pd.Series(coefficients, index=features)
feat_imp

### 10 Most Influential Coefficients for the Model

In [None]:
# Horizontal bar chart of the ten coefficients with the largest magnitude
feat_imp.sort_values(key=abs).tail(n=10).plot.barh()
plt.title("Feature Importance for Apartment Price")
plt.xlabel("Importance [USD]")
plt.ylabel("Feature")
# Don't delete the code below 👇
plt.savefig("images/2-5-14.png", dpi=150)
