# Assignment 1: Housing in Brazil 

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

## Prepare Data

### Import

In [None]:
# Load the first listings file into `df1` and preview its first rows.
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.head()

In [None]:
# Dimensions of `df1` as (rows, columns), before any cleaning.
df1.shape

In [None]:
# Column names, dtypes and non-null counts for `df1`, before any cleaning.
df1.info()

In [None]:
# Preview the first rows of `df1` again (same view as right after loading).
df1.head()

In [None]:
# Drop every row that has a NaN in any column; `df1` is modified in place.
df1.dropna(inplace = True)

In [None]:
# Dimensions of `df1` after the NaN rows were removed.
df1.shape

In [None]:
# Split the comma-separated "lat-lon" strings into two parts, convert both
# parts to float, and store them as the new "lat" and "lon" columns.
lat_lon_parts = df1["lat-lon"].str.split(",", expand=True)
df1[["lat", "lon"]] = lat_lon_parts.astype(float)
df1.head()

In [None]:
# Re-check dtypes and non-null counts now that "lat" and "lon" exist.
df1.info()

In [None]:
# Create the "state" column: split "place_with_parent_names" on "|" and keep
# the piece at position 2 of the expanded result.
place_parts = df1["place_with_parent_names"].str.split("|", expand=True)
df1["state"] = place_parts[2]

In [None]:
# Convert "price_usd" from text to float: remove the "$" sign and the ","
# separators (both treated as literal characters), then cast.
price_text = df1["price_usd"].str.replace("$", "", regex=False)
price_text = price_text.str.replace(",", "", regex=False)
df1["price_usd"] = price_text.astype(float)

In [None]:
# Drop the two source columns that were already split into "state" and
# "lat"/"lon"; `df1` is modified in place.
parsed_source_cols = ["place_with_parent_names", "lat-lon"]
df1.drop(columns=parsed_source_cols, inplace=True)
df1.head()

In [None]:
# Load the second listings file into `df2` and preview its first rows.
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
df2.head()

In [None]:
# Dimensions of `df2` as (rows, columns), before any cleaning.
df2.shape

In [None]:
# Column names, dtypes and non-null counts for `df2`, before any cleaning.
df2.info()

In [None]:
# Preview the first rows of `df2` again (same view as right after loading).
df2.head()

In [None]:
# Derive "price_usd" from "price_brl" by dividing by a fixed rate of
# 3.19 (presumably BRL per USD -- confirm against the assignment brief).
brl_per_usd = 3.19
df2["price_usd"] = df2["price_brl"] / brl_per_usd

In [None]:
# Drop every row that has a NaN in any column; `df2` is modified in place.
df2.dropna(inplace = True)
# Drop the "price_brl" column (in place); "price_usd" was derived from it above.
df2.drop(columns=["price_brl"], inplace=True)
df2.head()


In [None]:
# Concatenate `df1` and `df2` to create a new DataFrame named `df`.
# Rows of `df2` are stacked below the rows of `df1`. Row labels from both
# inputs are kept as-is (no `ignore_index`), so index values may repeat.
df = pd.concat([df1, df2])
print("df shape:", df.shape)

### Explore

In [None]:
# Plot every listing in `df` at its latitude/longitude on a 600x600 map;
# hovering over a point displays that home's price.
brazil_center = {"lat": -14.2, "lon": -51.9}  # Map will be centered on Brazil
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    center=brazil_center,
    hover_data=["price_usd"],
    width=600,
    height=600,
)

# Use the OpenStreetMap tiles as the base layer.
fig.update_layout(mapbox_style="open-street-map")

fig.show()

In [None]:
# Descriptive statistics (as produced by `describe`) for home area and price.
stat_columns = ["area_m2", "price_usd"]
summary_stats = df[stat_columns].describe()
summary_stats

In [None]:
# Create a histogram of `"price_usd"`.
home_prices = df["price_usd"]
plt.hist(home_prices)
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title("Distribution of Home Prices");

In [None]:
# Horizontal boxplot of `"area_m2"`.
home_sizes = df["area_m2"]
plt.boxplot(home_sizes, vert=False)
plt.title("Distribution of Home Sizes")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.xlabel("Area [sq. meters]");

In [None]:
# Mean "price_usd" for each "region", ordered from highest to lowest mean.
mean_price_by_region = (
    df.groupby("region")["price_usd"]
    .mean()
    .sort_values(ascending=False)
)
mean_price_by_region

In [None]:
# Bar chart of `mean_price_by_region`, one bar per region.
# The trailing semicolon keeps the notebook from echoing the return value.
mean_price_by_region.plot.bar(
    title="Mean Home Price By Region",
    xlabel="Region",
    ylabel="Mean Price [USD]",
);

In [None]:
# Create a DataFrame `df_south` that contains all the homes from `df` that
# are in the `"South"` region.
in_south_region = df["region"] == "South"
df_south = df[in_south_region]
df_south.head()

In [None]:
# Count how many rows of `df_south` belong to each "state".
homes_by_state = df_south["state"].value_counts()
homes_by_state

In [None]:
# Subset `df` to include only observations from `"Rio Grande do Sul"`,
# then scatter-plot price against area for that state.
in_rio_grande = df["state"] == "Rio Grande do Sul"
df_Rio_Grande = df[in_rio_grande]
plt.scatter(x=df_Rio_Grande["area_m2"], y=df_Rio_Grande["price_usd"])
plt.title("Rio Grande do Sul: Price vs. Area")
plt.xlabel("Area [sq meters]")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.ylabel("Price [USD]");

In [None]:
# Subset `df` for the two other states that are correlated below.
in_santa_catarina = df["state"] == "Santa Catarina"
in_paraná = df["state"] == "Paraná"
df_Santa_Catarina = df[in_santa_catarina]
df_Paraná = df[in_paraná]

# Correlation coefficient between "area_m2" and "price_usd" within each state
# (`Series.corr` with its default method).
p_correlation_Rio = df_Rio_Grande["area_m2"].corr(
    df_Rio_Grande["price_usd"]
)
p_correlation_Santa_Catarina = df_Santa_Catarina["area_m2"].corr(
    df_Santa_Catarina["price_usd"]
)
p_correlation_Paraná = df_Paraná["area_m2"].corr(
    df_Paraná["price_usd"]
)

In [None]:
# Create a dictionary `south_states_corr`: each key is the name of one of the
# three states in the `"South"` region, and each value is the correlation
# coefficient between `"area_m2"` and `"price_usd"` in that state.
south_state_names = ['Rio Grande do Sul', 'Santa Catarina', 'Paraná']
south_state_values = [
    p_correlation_Rio,
    p_correlation_Santa_Catarina,
    p_correlation_Paraná,
]
south_states_corr = dict(zip(south_state_names, south_state_values))

south_states_corr