# Data Preprocessing

## Data Loading

In [1]:
!pip install python-dotenv google-api-python-client

from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_drive")
file_id = os.getenv("GDRIVE_FILE_ID")
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_github")
github_pat = os.getenv("GITHUB_PAT")

if not file_id:
    raise ValueError("❌ Error: 'GDRIVE_FILE_ID' is missing or invalid in your .env file.")
if not github_pat:
    raise ValueError("❌ Error: 'GITHUB_PAT' is missing or invalid in your .env file.")

!git clone https://{github_pat}@github.com/vmagdale2/data-split-model-performance-analysis.git

import sys
sys.path.append('/content/data-split-model-performance-analysis/scripts')
%cd /content/data-split-model-performance-analysis/scripts
!pwd
!ls

from utils import authenticate_and_load_env, load_data_from_drive

service = authenticate_and_load_env()

file_id = os.getenv("GDRIVE_FILE_ID")
if not file_id:
    raise ValueError("❌ Error: 'GDRIVE_FILE_ID' is missing or invalid in your .env file.")

df = load_data_from_drive(service, file_id)
print("✅ Data loaded successfully!" if df is not None else "❌ Failed to load data.")


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Mounted at /content/drive
Cloning into 'data-split-model-performance-analysis'...
remote: Enumerating objects: 64, done.
remote: Counting objects: 100% (64/64), done.
remote: Compressing objects: 100% (45/45), done.
remote: Total 64 (delta 28), reused 46 (delta 13), pack-reused 0 (from 0)
Receiving objects: 100% (64/64), 102.16 KiB | 547.00 KiB/s, done.
Resolving deltas: 100% (28/28), done.
/content/data-split-model-performance-analysis/scripts
/content/data-split-model-performance-analysis/scripts
data_preprocessing.py  readme.md	    train_test_split.py  visualization.py
modeling.py	       scaling_analysis.py  utils.py
✅ Google Drive API authenticated successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!


## Objectives
- Load the Ames Housing dataset
- Perform exploratory data analysis (EDA)
- Handle missing data
- Apply one-hot encoding for categorical features
- Save processed data for future steps

In [2]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1379 non-null   float64
 1   2ndFlrSF       1379 non-null   float64
 2   3SsnPorch      1379 non-null   float64
 3   Alley          82 non-null     object 
 4   BedroomAbvGr   1379 non-null   int64  
 5   BldgType       1379 non-null   object 
 6   BsmtCond       953 non-null    object 
 7   BsmtExposure   953 non-null    object 
 8   BsmtFinSF1     1379 non-null   float64
 9   BsmtFinSF2     1379 non-null   float64
 10  BsmtFinType1   953 non-null    object 
 11  BsmtFinType2   952 non-null    object 
 12  BsmtFullBath   1379 non-null   int64  
 13  BsmtHalfBath   1379 non-null   int64  
 14  BsmtQual       953 non-null    object 
 15  BsmtUnfSF      1379 non-null   float64
 16  CentralAir     1379 non-null   object 
 17  Condition1     1379 non-null   object 
 18  Conditio

In [3]:
# Preview the leading rows of the loaded frame using the rich table renderer.
preview_rows = df.head()
display(preview_rows)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,SalePrice
0,856.0,854.0,0.0,,3,1Fam,TA,No,706.0,0.0,...,0.0,Pave,8,856.0,AllPub,0.0,2003,2003,2008,208500.0
1,1262.0,0.0,0.0,,3,1Fam,TA,Gd,978.0,0.0,...,0.0,Pave,6,1262.0,AllPub,298.0,1976,1976,2007,181500.0
2,920.0,866.0,0.0,,3,1Fam,TA,Mn,486.0,0.0,...,0.0,Pave,6,920.0,AllPub,0.0,2001,2002,2008,223500.0
3,961.0,756.0,0.0,,3,1Fam,Gd,No,216.0,0.0,...,0.0,Pave,7,756.0,AllPub,0.0,1915,1970,2006,140000.0
4,1145.0,1053.0,0.0,,4,1Fam,TA,Av,655.0,0.0,...,0.0,Pave,9,1145.0,AllPub,192.0,2000,2000,2008,250000.0


In [4]:
# Tally how many columns there are of each dtype; the bare last expression is
# what the notebook renders as this cell's output.
dtype_counts = df.dtypes.value_counts()
dtype_counts

Unnamed: 0,count
object,43
float64,21
int64,16


In [5]:
import pandas as pd
from utils import handle_missing_values, one_hot_encode

In [8]:
# Data-quality scan for numbers stored as text.
# Columns whose dtype is already float64/int64 cannot contain str objects, so
# scanning those can never report anything. The scan therefore targets object
# columns and flags any in which some, but not all, non-null values parse as
# numbers — a sign that a numeric column picked up stray text on load.
for col in df.select_dtypes(include=['object']).columns:
    present = df[col].dropna()
    parses_as_number = pd.to_numeric(present, errors='coerce').notna()
    non_numeric_values = (~parses_as_number).sum()
    # Columns where nothing parses are ordinary text categories; skip them.
    if parses_as_number.any() and non_numeric_values > 0:
        print(f"Column '{col}' has {non_numeric_values} non-numeric values.")

In [6]:
# Run the project's handle_missing_values helper (imported from utils) on the
# loaded frame and rebind `df` to whatever it returns, so later cells work on
# the result rather than the raw data.
# NOTE(review): the actual missing-value strategy lives in utils.py and is not
# visible in this notebook — confirm there before relying on it. Because `df`
# is overwritten, re-running this cell feeds it already-processed data.
df = handle_missing_values(df)

In [7]:
# Pass `df` through the project's one_hot_encode helper (imported from utils)
# and keep the result under a separate name, leaving `df` itself bound to the
# pre-encoding frame.
# NOTE(review): which columns get encoded is decided inside utils.py and is
# not visible here — confirm against that module.
df_encoded = one_hot_encode(df)

In [14]:
# Write the encoded frame to the project's Drive folder as CSV (row index left
# out) and echo the destination. The target path is built once and reused for
# both the write and the confirmation message.
drive_path = '/content/drive/My Drive/Professional/Portfolio/test_split'
encoded_csv_path = f'{drive_path}/encoded_data.csv'
df_encoded.to_csv(encoded_csv_path, index=False)
print(f"File successfully saved at: {encoded_csv_path}")

File successfully saved at: /content/drive/My Drive/Professional/Portfolio/test_split/encoded_data.csv
