In [7]:
import warnings
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import wqet_grader
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted



In [None]:
# Build your `wrangle` function
def wrangle(filepath):
    """Read one Mexico City real-estate CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Keep rows where ``property_type`` is ``"apartment"``,
         ``place_with_parent_names`` contains ``"Distrito Federal"`` and
         ``price_aprox_usd`` is below 100,000.
      2. Keep rows whose ``surface_covered_in_m2`` lies between the 10th and
         90th percentile (inclusive) of the rows left after step 1.
      3. Split ``"lat-lon"`` on ``","`` into float ``lat`` / ``lon`` columns.
      4. Store the second ``"|"``-separated field of
         ``place_with_parent_names`` as ``borough``.
      5. Drop the helper, sparse, categorical and price-derived columns
         listed below.

    Parameters
    ----------
    filepath : str, path or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The original row index is kept (not reset).

    Raises
    ------
    KeyError
        If any of the columns referenced here is missing from the CSV.
    """
    df = pd.read_csv(filepath)

    # Subset: apartments only.
    mask_apt = df["property_type"] == "apartment"

    # Subset: listings located in Distrito Federal. `na=False` makes rows
    # with a missing place name count as "no match" instead of putting NaN
    # into the boolean mask.
    mask_cdmx = df["place_with_parent_names"].str.contains(
        "Distrito Federal", na=False
    )

    # Subset: price below 100,000 USD.
    mask_price = df["price_aprox_usd"] < 100_000

    # `.copy()` gives an independent frame, so the column assignments further
    # down never write into a view of the raw data (SettingWithCopyWarning).
    df = df[mask_cdmx & mask_apt & mask_price].copy()

    # Remove area outliers: keep the 10th-90th percentile band (inclusive).
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area].copy()

    # Split "lat-lon" into two float columns.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Borough is the second "|"-separated field of the place string.
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]

    # One non-mutating drop instead of several `inplace=True` calls.
    # NOTE: no separate multicollinearity step is applied; nothing beyond the
    # columns listed here is removed.
    columns_to_drop = [
        # helper columns that were split above
        "lat-lon",
        "place_with_parent_names",
        # features noted by the author as having many null values
        "floor",
        "expenses",
        "rooms",
        "price_usd_per_m2",
        "surface_total_in_m2",
        # categorical features noted as low- or high-cardinality
        "operation",
        "property_type",
        "currency",
        "properati_url",
        # columns noted as leaking the target
        "price",
        "price_aprox_local_currency",
        "price_per_m2",
    ]
    return df.drop(columns=columns_to_drop)

In [None]:
# Wrangle the first CSV on its own and inspect the cleaned result.
frame1 = wrangle('data/mexico-city-real-estate-1.csv')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
frame1.info()
frame1.head()

In [None]:
# Load the first CSV as-is (no wrangling applied) and preview its first rows
# to explore the raw columns.
df = pd.read_csv(filepath_or_buffer='data/mexico-city-real-estate-1.csv')
df.head(n=5)

In [None]:
# Collect every CSV matching the pattern. glob() returns matches in an
# arbitrary, filesystem-dependent order, so the list is sorted to keep the
# row order of the combined frame reproducible between runs and machines.
files = sorted(glob('data/mexico-city-real-estate-*.csv'))
files

In [None]:
# Clean every file with `wrangle` and stack the results into one frame with a
# fresh 0..n-1 index.
frames = [wrangle(path) for path in files]
df = pd.concat(frames, ignore_index=True)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
df.head()

In [None]:
# Build histogram of the target column
plt.hist(df['price_aprox_usd'])

# Label axes
plt.xlabel("Price [$]")
plt.ylabel("Count")

# Add title; the trailing semicolon suppresses the `Text(...)` repr that the
# notebook would otherwise print below the cell (same convention as the
# feature-importance chart at the end of the notebook).
plt.title("Distribution of Apartment Prices");


In [None]:
# Build scatter plot of covered area against price
plt.scatter(x=df["surface_covered_in_m2"], y=df["price_aprox_usd"])

# Label axes
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")

# Add title; the trailing semicolon suppresses the `Text(...)` repr that the
# notebook would otherwise print below the cell.
plt.title("Mexico City: Price vs. Area");


In [None]:
# Plot Mapbox location and price
fig = px.scatter_mapbox(
    df,  # Our DataFrame
    lat='lat',
    lon='lon',
    width=600,  # Width of map
    height=600,  # Height of map
    color='price_aprox_usd',
    hover_data=["price_aprox_usd"],  # Display price when hovering over a listing
)

# Use a tile style that works without a Mapbox access token, so the base map
# actually renders.
fig.update_layout(mapbox_style="open-street-map")

# The figure was previously only assigned, never displayed.
fig.show()

In [None]:
# Separate the cleaned frame into the target vector `y_train` and the
# feature matrix `X_train`.
target = "price_aprox_usd"
feature = [
    'surface_covered_in_m2',
    'lat',
    'lon',
    'borough',
]

X_train = df[feature]
y_train = df[target]

In [None]:
# Baseline model: always predict the mean training price, then score that
# constant prediction with mean absolute error.
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

In [None]:
# Assemble the model as a three-step pipeline:
#   1. one-hot encode, using category names in the generated column names
#   2. impute missing values (SimpleImputer with its default settings)
#   3. Ridge regression (default settings)
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge(),
)

# Train the whole pipeline on the training data.
model.fit(X_train, y_train)



In [None]:
# Import the feature matrix used for testing
X_test = pd.read_csv('data/mexico-city-test-features.csv')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
X_test.info()
X_test.head()

In [None]:
# Predict prices for the test features and wrap them in a Series for display.
y_test_pred = pd.Series(data=model.predict(X_test))
y_test_pred.head(n=5)

In [None]:
# Pair each Ridge coefficient with the name of the encoded feature it belongs to.
coefficients = model.named_steps['ridge'].coef_

# Newer category_encoders releases expose `get_feature_names_out()` and
# deprecate/remove `get_feature_names()`; prefer the new method and fall back
# to the old one so the cell runs on either version.
encoder = model.named_steps['onehotencoder']
features = (
    encoder.get_feature_names_out()
    if hasattr(encoder, 'get_feature_names_out')
    else encoder.get_feature_names()
)

feat_imp = pd.Series(coefficients, index=features)
feat_imp

In [None]:
# Horizontal bar chart of the 15 coefficients with the largest absolute value
# (sorted ascending by |coefficient|, so the largest ends up on top).
feat_imp.sort_values(key=abs).tail(15).plot.barh()

# Axis labels and title; the trailing semicolon hides the `Text(...)` repr.
plt.xlabel('Importance USD')
plt.ylabel('Feature')
plt.title('Feature Importance For apartment Price');
