In [1]:
# Análise Exploratória de Dados (EDA)
# Projeto: Kaggle House Prices
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning),
# so real problems in later cells will not surface — consider narrowing it.
warnings.filterwarnings("ignore")


In [2]:
# Load the Kaggle House Prices training set into `df`.
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# folder exists; consider a configurable data directory.
df = pd.read_csv("C:/house-price-analysis-ml/data/raw/train.csv")

# A cell renders only its last expression, so the preview and the shape must be
# shown explicitly; otherwise they are computed and silently discarded.
display(df.head())
print(df.shape)

# info() prints the per-column dtype / non-null summary itself.
df.info()

# Last expression: summary statistics of the numeric columns.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sale price.
df["SalePrice"].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [4]:
# Histogram of sale prices, split into 50 bins.
px.histogram(
    data_frame=df,
    x="SalePrice",
    nbins=50,
    title="Distribuição do preço das casas",
)

In [None]:
def price_segment(price, low=100000, high=200000):
    """Classify a sale price into one of three price bands.

    The band labels mention the mean ("média"), but the bands are fixed
    cut-offs, not values derived from the data's mean.

    Parameters
    ----------
    price : number
        Sale price to classify.
    low : number, default 100000
        Inclusive upper bound of the lowest band.
    high : number, default 200000
        Inclusive upper bound of the middle band.

    Returns
    -------
    str or None
        "Preço abaixo da média" when ``price <= low``,
        "Preço médio" when ``low < price <= high``,
        "Preço acima da média" when ``price > high``,
        and ``None`` when the price is missing (None / NaN / NA).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # price would fall through to the last branch and be labelled as expensive.
    if pd.isna(price):
        return None

    if price <= low:
        return "Preço abaixo da média"
    if price <= high:
        return "Preço médio"
    return "Preço acima da média"
    
# Label every house with its price band by applying price_segment to each
# SalePrice value; the result is stored as a new "price_segment" column on df.
df["price_segment"] = df["SalePrice"].apply(price_segment)

In [6]:
# Number of houses in each price band.
df["price_segment"].value_counts()

price_segment
Preço médio              910
Preço acima da média     427
Preço abaixo da média    123
Name: count, dtype: int64

In [7]:
# Names of the columns whose dtype is int64 or float64.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns

In [8]:
# Remove the sale price from the numeric column list.
# errors="ignore" keeps this self-referential cell idempotent: re-running it
# after "SalePrice" has already been dropped is a no-op instead of a KeyError.
numeric_cols = numeric_cols.drop("SalePrice", errors="ignore")

In [9]:
# Mean of every numeric column per price band, transposed so that features are
# rows and price bands are columns.
grouped_means = df.groupby("price_segment")[numeric_cols].mean().transpose()
grouped_means

price_segment,Preço abaixo da média,Preço acima da média,Preço médio
Id,698.747967,698.393443,749.857143
MSSubClass,66.01626,50.175644,58.818681
LotFrontage,55.389381,81.264957,66.956581
LotArea,7110.349593,13988.313817,9348.336264
OverallQual,4.398374,7.519906,5.662637
OverallCond,4.99187,5.386417,5.742857
YearBuilt,1941.203252,1992.140515,1965.537363
YearRemodAdd,1964.390244,1999.145199,1980.932967
MasVnrArea,30.886179,198.947867,69.23484
BsmtFinSF1,151.536585,634.751756,393.446154


In [12]:
# Features examined in the per-band comparisons and box plots below.
main_features = [
    "OverallQual", "GrLivArea", "GarageCars", "GarageArea",
    "YearBuilt", "YearRemodAdd", "TotalBsmtSF", "FullBath",
    "BedroomAbvGr", "WoodDeckSF", "LotArea", "LotFrontage",
]


In [13]:
# One box plot per feature, showing its distribution within each price band.
for col in main_features:
    fig = px.box(
        data_frame=df,
        x="price_segment",
        y=col,
        title=f"{col} por Faixa de preço",
    )
    fig.show()

In [20]:
# Which factors set a cheap house apart from an expensive one?
# "Abaixo da média" (AbM) = lowest price band
# "Acima da média" (AcM) = highest price band

# Mean of each main feature per price band (features as rows, bands as columns).
grouped_means = df.groupby("price_segment")[main_features].mean().T

# .copy() makes `comparison` an independent frame, so adding a column below is
# not a write on a subset of grouped_means (pandas reports that pattern with
# SettingWithCopyWarning, which the notebook's global warnings filter hides).
comparison = grouped_means[["Preço abaixo da média", "Preço acima da média"]].copy()

# Relative gap of the highest band over the lowest band, in percent.
comparison["diff_percent"] = (
    (comparison["Preço acima da média"] - comparison["Preço abaixo da média"])
    / comparison["Preço abaixo da média"]
) * 100

# Largest relative differences first.
comparison.sort_values("diff_percent", ascending=False)

price_segment,Preço abaixo da média,Preço acima da média,diff_percent
WoodDeckSF,30.01626,142.709602,375.440981
GarageArea,234.934959,643.864169,174.060604
GarageCars,0.894309,2.360656,163.964232
TotalBsmtSF,609.788618,1393.077283,128.45249
GrLivArea,995.422764,1983.981265,99.310417
LotArea,7110.349593,13988.313817,96.731731
FullBath,1.113821,1.978923,77.669704
OverallQual,4.398374,7.519906,70.970144
LotFrontage,55.389381,81.264957,46.715772
BedroomAbvGr,2.341463,3.025761,29.225215


In [33]:
# Look up the very large house (GrLivArea above 5000) that still sits in the
# mid-price band, just to see which property it is.
# NOTE(review): the original note calls it house "313" — presumably its
# LotFrontage value; confirm.

df.loc[
    (df["price_segment"] == "Preço médio") & (df["GrLivArea"] > 5000),
    [
        "SalePrice",
        "GrLivArea",
        "YearBuilt",
        "OverallQual",
        "GarageCars",
        "Neighborhood",
        "GarageArea",
        "YearRemodAdd",
        "Id",
        "OverallCond",
        "SaleCondition",
    ],
]

Unnamed: 0,SalePrice,GrLivArea,YearBuilt,OverallQual,GarageCars,Neighborhood,GarageArea,YearRemodAdd,Id,OverallCond,SaleCondition
1298,160000,5642,2008,10,2,Edwards,1418,2008,1299,5,Partial


In [34]:
# Hold OverallQual fixed at 7 and compare the main-feature means across bands.
(
    df.loc[df["OverallQual"] == 7]
    .groupby("price_segment")[main_features]
    .mean()
)

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,YearBuilt,YearRemodAdd,TotalBsmtSF,FullBath,BedroomAbvGr,WoodDeckSF,LotArea,LotFrontage
price_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Preço abaixo da média,7.0,1411.0,2.0,544.0,1977.0,1977.0,1386.0,2.0,3.0,192.0,11900.0,85.0
Preço acima da média,7.0,1884.186335,2.173913,583.850932,1991.248447,1998.279503,1243.484472,1.956522,3.068323,121.447205,13848.875776,79.053435
Preço médio,7.0,1520.751592,1.949045,480.477707,1985.286624,1994.44586,968.33758,1.828025,2.802548,79.318471,8226.821656,61.485075


In [36]:
# Sale price distribution for each neighborhood.
px.box(
    data_frame=df,
    x="Neighborhood",
    y="SalePrice",
    title="Distribuição de preços por bairro",
)