# 0.0 Imports 

In [8]:
import os
from datetime import datetime, time

import docutils.transforms.peps
import folium
import geopandas
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from folium.plugins    import MarkerCluster
from streamlit_folium  import folium_static

# 0.1 Helper Functions

# 0.2 Loading Data 

In [13]:
# Location of the raw CSV. Set the HOUSE_ROCKET_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset, the original
# local path is used so existing runs keep working unchanged.
DATA_PATH = os.environ.get(
    'HOUSE_ROCKET_DATA_PATH',
    r"C:\Users\ferki\repos\project_house_rocket\data\kc_house_data.csv",
)

# low_memory=False makes pandas read the file in a single pass instead of
# in chunks, so each column's dtype is inferred from all of its values.
df_raw = pd.read_csv( DATA_PATH, low_memory=False )

# 1.0 Data Description

In [14]:
# Work on an independent copy so the raw frame stays untouched by this stage.
df1 = df_raw.copy( deep=True )

## 1.2 Data Dimensions

In [15]:
# Report the size of the dataset: shape is (row count, column count).
n_rows, n_cols = df1.shape
print( 'Number of Rows: {}'.format( n_rows ) )
print( 'Number of Cols: {}'.format( n_cols ) )

Number of Rows: 21613
Number of Cols: 21


## 1.3 Data Types

In [16]:
# Parse the 'date' column into pandas datetimes, then list every column's dtype.
date_as_datetime = pd.to_datetime( df1['date'] )
df1['date'] = date_as_datetime
df1.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
sqft_above              float64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
dtype: object

## 1.4 Check NA

In [17]:
# Count the missing values in each column.
missing_per_column = df1.isna().sum()
missing_per_column

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       2
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## 1.5 Fill Out NA

In [19]:
# Re-count the missing values per column.
# NOTE(review): nothing in this section imputes or drops missing values, so
# the counts are expected to match the previous check -- confirm whether a
# fill step for sqft_above is still pending.
df1.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       2
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

## 1.6 Descriptive Statistics

In [20]:
# Split the columns by dtype: int/float columns are treated as numerical;
# anything that is neither int, float nor datetime64[ns] is treated as categorical.
numeric_dtypes = ['int', 'float']
num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

### 1.6.1 Numerical Attributes

In [22]:
# Build a summary table with one row per numerical attribute and one column
# per statistic.
#
# pandas reductions are used instead of np.median / builtin min / builtin max
# inside DataFrame.apply: they skip NaN by default, so a column with missing
# values (sqft_above has 2) still gets finite statistics, and they run
# vectorised instead of iterating over every value in Python.

# Central Tendency - mean, median
ct1 = pd.DataFrame( num_attributes.mean() ).T
ct2 = pd.DataFrame( num_attributes.median() ).T

# Dispersion - std, min, max, range, skew, kurtosis
# ddof=0 matches NumPy's np.std default (population standard deviation);
# pandas would otherwise default to the sample estimate (ddof=1).
d1 = pd.DataFrame( num_attributes.std( ddof=0 ) ).T
d2 = pd.DataFrame( num_attributes.min() ).T
d3 = pd.DataFrame( num_attributes.max() ).T
d4 = pd.DataFrame( num_attributes.max() - num_attributes.min() ).T
d5 = pd.DataFrame( num_attributes.skew() ).T
d6 = pd.DataFrame( num_attributes.kurtosis() ).T

# Concatenate the single-row frames, then transpose so that each attribute
# becomes a row; reset_index turns the attribute names into a regular column.
m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']

m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1000102.0,9900000000.0,9899000000.0,4580302000.0,3904930000.0,2876499000.0,0.243329,-1.260542
1,price,75000.0,7700000.0,7625000.0,540088.1,450000.0,367118.7,4.024069,34.58554
2,bedrooms,0.0,33.0,33.0,3.370842,3.0,0.9300403,1.9743,49.063653
3,bathrooms,0.0,8.0,8.0,2.114757,2.25,0.7701453,0.511108,1.279902
4,sqft_living,290.0,13540.0,13250.0,2079.9,1910.0,918.4196,1.471555,5.243093
5,sqft_lot,520.0,1651359.0,1650839.0,15106.97,7618.0,41419.55,13.060019,285.07782
6,floors,1.0,3.5,2.5,1.494309,1.5,0.5399764,0.616177,-0.484723
7,waterfront,0.0,1.0,1.0,0.007541757,0.0,0.0865152,11.385108,127.632494
8,view,0.0,4.0,4.0,0.2343034,0.0,0.7662998,3.39575,10.893022
9,condition,1.0,5.0,4.0,3.40943,3.0,0.650728,1.032805,0.525764


In [None]:
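# TODO: plot the distribution of each numerical column, e.g.
# sns.histplot( df1['column'], kde=False )  # needs `import seaborn as sns`; sns.distplot is deprecated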

### 1.6.2 Categorical Attributes

In [23]:
# Count the distinct values found in each categorical column.
cat_attributes.apply( lambda col: len( col.unique() ) )

Series([], dtype: float64)

# 2.0 Feature Engineering 

In [24]:
# Begin the feature-engineering stage from its own copy of df1, so changes
# made to df2 do not alter the frame described in section 1.
df2 = df1.copy( deep=True )

## 2.1 Hypotheses Mind Map

In [None]:
# TODO: render the hypotheses mind map:
# Image('img/MindMapHypothesis.png')  # needs `from IPython.display import Image`

## 2.2 Hypotheses Building

### 2.2.1 Hypotheses

### 2.2.2 Hypotheses

### 2.2.3 Hypotheses

## 2.3 Final List of Hypotheses

## 2.4 Feature Engineering

# 3.0 Variable Filtering

## 3.1 Line Filtering

## 3.2 Columns Selection

# 4.0 Exploratory Data Analysis

## 4.1 Univariate Analysis

### 4.1.1 Response Variable

### 4.1.2 Numerical Variable

### 4.1.3 Categorical Variable

## 4.2 Bivariate Analysis

## H1. .......

### True or False, and why

## H2. .......


### True or False, and why

## 4.3 Multivariate Analysis