# House Prices Prediction

## Importing:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# get_description() looks up the description of a column name or of a value.
# It accepts a single column/value name, or a list of them:
from data_description import get_description

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'darkgrid' style for the plots.
sns.set_style('darkgrid')

### Importing the 'train.csv' and 'test.csv' files:

In [2]:
# Read both data splits; the 'Id' column becomes the row index of each frame.
train, test = (
    pd.read_csv('data/train.csv', index_col='Id'),
    pd.read_csv('data/test.csv', index_col='Id'),
)

In [4]:
# y holds the target variable [SalePrice] as a one-column DataFrame.
y = train['SalePrice'].to_frame()
# X_train holds the training features (everything except the target).
X_train = train.drop(columns=['SalePrice'])

# X stacks the feature rows of 'train' on top of those of 'test'.
# This combined frame is where the NaN values will be dealt with.
X = pd.concat([X_train, test])


## Exploratory Data Analysis

Explore which data types appear among the features:

In [None]:
# Gather the dtype of every feature column, then print the distinct dtypes.
types = [X[col].dtype for col in X.columns]
print(np.unique(types))



### Dividing the features into categorical and numerical:

In [12]:
# NOTE(review): every line of this cell is commented out, so `categorical` and
# `numerical` are NOT defined by it. TODO confirm whether the split below should
# be re-enabled or the cell removed.
# categorical = [x for x in X.columns if X[x].dtype == 'object']
# numerical = [x for x in X.columns if X[x].dtype != 'object']

# print(f'Numerical variables [{len(numerical)}]:\n{numerical}')
# print()
# print(f'Categorical variables [{len(categorical)}]:\n{categorical}')

[dtype('int64') dtype('float64') dtype('O')]


### The features sliced into 5 sets:

In [5]:
# First slice of the feature names: column positions 0-15.
feat_set1 = list(X.columns[:16])
feat_set1

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle']

In [6]:
# Second slice of the feature names: column positions 16-31.
feat_set2 = list(X.columns[16:32])
feat_set2

['OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure']

In [7]:
# Third slice of the feature names: column positions 32-47.
feat_set3 = list(X.columns[32:48])
feat_set3

['BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath']

In [8]:
# Fourth slice of the feature names: column positions 48-63.
feat_set4 = list(X.columns[48:64])
feat_set4

['FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond']

In [9]:
# Fifth and last slice of the feature names: column positions 64 onwards
# (the slice simply stops at the last column if there are fewer than 80).
# It is stored as `feat_set5` so that it does not overwrite `feat_set4`
# (positions 48-63), which would leave only four of the five sets available.
feat_set5 = X.columns[64:80].tolist()
feat_set5

['PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

### Analysis:

## Preparing the Data:

### Missing Values:

In [14]:
# Number of missing entries per column, and the share of rows they represent.
missing_count = X.isna().sum(axis=0)
missing_percent = missing_count / len(X) * 100

# Summary table: sorted by missing count (largest first), index labelled
# 'Column Name', and restricted to columns that have at least one NaN.
missing_data = (
    pd.DataFrame({'NaN Count': missing_count, 'Percentage [%]': missing_percent})
    .sort_values(by='NaN Count', ascending=False)
    .rename_axis('Column Name')
    .loc[lambda table: table['NaN Count'] > 0]
)
missing_data

Unnamed: 0_level_0,NaN Count,Percentage [%]
Column Name,Unnamed: 1_level_1,Unnamed: 2_level_1
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
FireplaceQu,1420,48.646797
LotFrontage,486,16.649538
GarageYrBlt,159,5.447071
GarageFinish,159,5.447071
GarageQual,159,5.447071
GarageCond,159,5.447071
