In [None]:
#Step - Import libraries required for various operations
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistics like skewness and kurtosis

# Plotting configuration: render figures inside the notebook output and pick the visual style
%matplotlib inline
sns.set_style('darkgrid') # seaborn axes style; other options are 'whitegrid', 'dark', 'white', and 'ticks'
plt.style.use('fivethirtyeight') # matplotlib style sheet; applied after the seaborn style, so overlapping settings come from this one

# Ignore warnings
# NOTE(review): no category or module is given, so this filter hides every warning
# raised after this point (including deprecation notices) - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

# Optional: display settings for pandas - allow up to 500 rows / 500 columns before output is truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Confirmation message printed once the setup cell has run to the end
print("Python libraries imported successfully")


Python libraries imported successfully


In [None]:
# Step - Load the dataset from the Colab runtime's local file system.
# On success `df` holds the DataFrame; if the CSV is missing, `df` is set to None
# so that later cells can detect the failed load and skip their work.
try:
    df = pd.read_csv('/content/finalProjectReportDataset.csv')
except FileNotFoundError:
    # Sentinel value checked by the downstream `if df is not None` guards
    df = None
    print("Dataset not found. Please ensure the file is in the correct directory.")
else:
    # Only reached when the read succeeded: report success and the (rows, columns) shape
    print("Training data loaded successfully.")
    print(f"Dataset shape: {df.shape}")


Training data loaded successfully.
Dataset shape: (2197, 82)


In [None]:
# Step - Data Inspection
# Prints a quick structural overview of `df`; does nothing but report a skip
# message when the load cell left `df` as None.
if df is None:
    print("\nSkipping initial inspection due to data loading issues.")
else:
    # Preview: first 5 rows
    print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")

    # Column names, non-null counts and dtypes (info() writes its own output)
    print("\nDataset Info:")
    df.info()

    # Summary statistics. The header text mentions numerical features, but
    # include='all' also summarises object columns (count, unique, top, freq).
    print("\nDescriptive Statistics (Numerical Features):", df.describe(include='all'), sep="\n")

    # Number of missing values per column
    print("\nMissing Values:", df.isnull().sum(), sep="\n")

    # Number of fully duplicated rows
    print("\nDuplicate Rows:", df.duplicated().sum(), sep="\n")



First 5 rows of the dataset:
   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0    534  531363010           20        RL          80.0      9605   Pave   
1    803  906203120           20        RL          90.0     14684   Pave   
2    956  916176030           20        RL           NaN     14375   Pave   
3    460  528180130          120        RL          48.0      6472   Pave   
4    487  528290030           80        RL          61.0      9734   Pave   

  Alley Lot Shape Land Contour Utilities Lot Config Land Slope Neighborhood  \
0   NaN       Reg          Lvl    AllPub     Corner        Gtl      SawyerW   
1   NaN       IR1          Lvl    AllPub    CulDSac        Gtl      SawyerW   
2   NaN       IR1          Lvl    NoSeWa    CulDSac        Gtl       Timber   
3   NaN       Reg          Lvl    AllPub     Inside        Gtl      NridgHt   
4   NaN       IR1          Lvl    AllPub     Inside        Gtl      Gilbert   

  Condition 1 Condition 2 Bldg T

In [None]:
# Step - Explore the target variable SalePrice using a histogram plot
# Runs only when the dataset was loaded and actually contains a 'SalePrice' column;
# otherwise a skip message is printed instead of raising a KeyError.
if df is not None and 'SalePrice' in df.columns:
    # Work on the target column only. Passing the whole DataFrame to histplot
    # would plot the other columns as well, which contradicts the title and
    # axis labels set below.
    sale_price = df['SalePrice']

    # info() prints its report itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the report.
    df.info()

    plt.figure(figsize=(4,2))
    sns.histplot(sale_price)
    plt.title('Distribution of SalePrice')
    plt.xlabel('Sale Price ($)')
    plt.ylabel('Frequency')
    plt.show()

    # Calculate and print skewness and kurtosis of the target.
    # On a single column these are scalars, so the ':.2f' format is valid
    # (on the whole DataFrame they are Series and the format would fail).
    print(f"Skewness: {sale_price.skew():.2f}")
    print(f"Kurtosis: {sale_price.kurt():.2f}")
else:
    print("\nSalePrice analysis skipped.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197 entries, 0 to 2196
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2197 non-null   int64  
 1   PID              2197 non-null   int64  
 2   MS SubClass      2197 non-null   int64  
 3   MS Zoning        2197 non-null   object 
 4   Lot Frontage     1835 non-null   float64
 5   Lot Area         2197 non-null   int64  
 6   Street           2197 non-null   object 
 7   Alley            143 non-null    object 
 8   Lot Shape        2197 non-null   object 
 9   Land Contour     2197 non-null   object 
 10  Utilities        2197 non-null   object 
 11  Lot Config       2197 non-null   object 
 12  Land Slope       2197 non-null   object 
 13  Neighborhood     2197 non-null   object 
 14  Condition 1      2197 non-null   object 
 15  Condition 2      2197 non-null   object 
 16  Bldg Type        2197 non-null   object 
 17  House Style   