In [None]:
import sys
import os

# Add the parent directory to sys.path so local modules can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from _src.data_ingestion import SelectDataIngestor, ZipfileDataIngestor
from data_inspection import DataInspectorSelector, SummaryDataInspectionStrategy
from missing_values_analysis import MissingValueAnalyzer, MissingValuesAnalysis
from univariante_analysis import SelectorUnivarianeAnalyzer
from bivariante_analysis import SelectorBivarianeAnalyzer
from multivariante_analysis import SelectMUltivarianteAnalyzer, NumericalMultivarianteAnalysis

from _src.feature_engineering import selectFeatureEngineeringStrtegy, x_FetureEngineering, y_FetureEngineering
from _src.data_splitting import SelectSplitter, TestTrainSplit

In [None]:
# Load the raw dataset: select the zip-file ingestion strategy and run it on
# the archive (a relative path). The result is bound to `df`, which every
# later cell reads.
archive_path = 'data/zip_data/archive.zip'

ingestor = SelectDataIngestor()
ingestor.set_ingestor(ZipfileDataIngestor)
df = ingestor.execute_ingestor(archive_path)


In [None]:
# Data inspection: select the summary inspection strategy and run it on `df`.
# The call's return value is not captured.
# NOTE(review): `execute_stratgy` is spelled differently from the
# `execute_strategy` method used on the feature-engineering selector in this
# notebook — confirm it matches the method name defined in `data_inspection`.
inspector = DataInspectorSelector()
inspector.set_strategy(SummaryDataInspectionStrategy)
inspector.execute_stratgy(df)

DATA SUMMARY

The dataset contains 13 distinct features with 6 of them being numeric and 7 of them being objects.
1. Numeric features include: price, area, bedrooms, bathrooms, stories, parking.
2. Categorical features include: mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus.
3. The target feature of the dataset is "price".


In [None]:
# Missing values and duplicates: select the MissingValuesAnalysis strategy and
# run it on `df`. The call's return value is not captured.
# Note: the name `analyzer` is rebound to other analyzer objects by later cells.
analyzer = MissingValueAnalyzer()
analyzer.set_analyzer(MissingValuesAnalysis)
analyzer.execute_analyzer(df)

MISSING VALUES AND DUPLICATES

Our data set has no missing values and no duplicates.

In [None]:
# Univariate analysis: run the analyzer once per column, passing the column
# name together with that column's pandas dtype.
analyzer = SelectorUnivarianeAnalyzer()
columns = [
    'price', 'area', 'bedrooms', 'bathrooms', 'stories',
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'parking', 'prefarea', 'furnishingstatus',
]
for column in columns:
    analyzer.execute_analyzer(df, column, df[column].dtype)

UNIVARIATE ANALYSIS ON THE DATASET

From the histograms and pandas' `skew` function, most of the numerical features are skewed to the right.
1. Only `bedrooms` has a skewness close to that of a normal distribution (skewness < 0.5).
2. The rest of the numeric features are further from a normal distribution (skewness > 0.8).

We will tackle this skewness as follows:
1. We will leverage other methods, such as the log function, to bring these features closer to a normal distribution and avoid bias in the model.
2. We will also perform feature engineering on most of the object features, since they contain only 2 or at most 3 unique values.

Some of the skewed variables, with skewness above 1.2, include:
1. price variable
2. area variable
3. bathrooms variable

In [None]:
# Fix skewness: apply the power-transformation strategy to the listed numeric
# columns and rebind `df` to the result.
#
# `PowerTransformation` is not imported by the notebook's import cell, so this
# cell would raise NameError on a fresh kernel. It is imported here on the
# assumption that it is defined next to `selectFeatureEngineeringStrtegy` in
# `_src.feature_engineering` — TODO confirm, then hoist into the import cell.
from _src.feature_engineering import PowerTransformation

# NOTE: this cell overwrites `df`, so re-running it without re-running the
# ingestion cell feeds already-transformed data back into the transformation.
strategy = selectFeatureEngineeringStrtegy()
strategy.set_strategy(PowerTransformation)
df = strategy.execute_strategy(
    df=df,
    columns=['price', 'area', 'parking', 'stories', 'bathrooms'],
)

In [None]:
# Univariate analysis, second pass: repeat the per-column analysis on the
# current `df` so the distributions can be compared with the first pass.
analyzer = SelectorUnivarianeAnalyzer()
columns = [
    'price', 'area', 'bedrooms', 'bathrooms', 'stories',
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'parking', 'prefarea', 'furnishingstatus',
]
for column in columns:
    analyzer.execute_analyzer(df, column, df[column].dtype)

In [None]:
# one hot encoding
strategy = selectFeatureEngineeringStrtegy()
strategy.set_strategy(OneHotEncodingTransformation)
df = strategy.execute_strategy(df=df,columns=['mainroad','guestroom','airconditioning','hotwaterheating','basement','prefarea','furnishingstatus'])

In [None]:
# Handle outliers: apply the robust-scaler strategy to the numeric columns and
# rebind `df` to the result.
#
# `RobustScalerTransformation` is not imported by the notebook's import cell,
# so this cell would raise NameError on a fresh kernel. It is imported here on
# the assumption that it is defined next to `selectFeatureEngineeringStrtegy`
# in `_src.feature_engineering` — TODO confirm, then hoist into the import cell.
from _src.feature_engineering import RobustScalerTransformation

# NOTE(review): if this strategy wraps scikit-learn's RobustScaler, it rescales
# with outlier-resistant statistics but does not remove or clip outliers —
# confirm that this is what "handle outliers" is meant to achieve.
strategy = selectFeatureEngineeringStrtegy()
strategy.set_strategy(RobustScalerTransformation)
df = strategy.execute_strategy(
    df=df,
    columns=['price', 'area', 'parking', 'stories', 'bathrooms', 'bedrooms'],
)

In [None]:
# Standardize the data: apply the standard-scaler strategy and rebind `df` to
# the result.
#
# `StandardScalerTransformation` is not imported by the notebook's import cell,
# so this cell would raise NameError on a fresh kernel. It is imported here on
# the assumption that it is defined next to `selectFeatureEngineeringStrtegy`
# in `_src.feature_engineering` — TODO confirm, then hoist into the import cell.
from _src.feature_engineering import StandardScalerTransformation

# NOTE(review): unlike the other feature-engineering cells, no `columns`
# argument is passed here; presumably the strategy then scales every column
# (including the encoded columns and the target) — verify this is intended.
strategy = selectFeatureEngineeringStrtegy()
strategy.set_strategy(StandardScalerTransformation)
df = strategy.execute_strategy(df=df)

In [None]:
# Bivariate analysis: analyze every feature column against the target column,
# passing the feature's pandas dtype to the analyzer.
# The target itself is excluded from the loop: analyzing "price" against
# "price" carries no information.
analyzer = SelectorBivarianeAnalyzer()
target = "price"
columns = [name for name in df.columns.tolist() if name != target]
for column in columns:
    col_type = df[column].dtype
    analyzer.execute_analyzer(df, column, target, col_type)

BIVARIATE ANALYSIS

Almost all the features, when plotted against 'price', show a lot of outliers. If left unchecked, these will affect the model's performance.

SOLUTION

We will explore the outliers and decide whether to eliminate or transform them.

In [None]:
# Preview the transformed frame; head() keeps the rendered output small
# instead of displaying the whole DataFrame.
df.head()

In [None]:
# Multivariate analysis: select the numerical multivariate strategy and run it
# on `df`. The call's return value is not captured.
# NOTE(review): `execute_nalyzer` differs from the `execute_analyzer` method
# called on the other analyzers in this notebook — confirm it matches the
# method name defined in `multivariante_analysis`; if not, this line raises
# AttributeError.
analyzer = SelectMUltivarianteAnalyzer()
analyzer.set_analyzer(NumericalMultivarianteAnalysis)
analyzer.execute_nalyzer(df)

MULTIVARIATE ANALYSIS

We check for the existence of multicollinearity, that is, whether any pair of variables is so highly correlated that it could cause overfitting or other model problems.

1. The correlation between every pair of variables lies between 0 and 0.6, which suggests there is no severe multicollinearity among the features.
2. The pair plot indicates outliers between the area and price variables, which we need to address.