# Train and Test Split

## Data Loading

In [1]:
!pip install python-dotenv google-api-python-client

from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_drive")
file_id = os.getenv("GDRIVE_FILE_ID")
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_github")
github_pat = os.getenv("GITHUB_PAT")

if not file_id:
    raise ValueError("❌ Error: 'GDRIVE_FILE_ID' is missing or invalid in your .env file.")
if not github_pat:
    raise ValueError("❌ Error: 'GITHUB_PAT' is missing or invalid in your .env file.")

!git clone https://{github_pat}@github.com/vmagdale2/data-split-model-performance-analysis.git

import sys
sys.path.append('/content/data-split-model-performance-analysis/scripts')
%cd /content/data-split-model-performance-analysis/scripts
!pwd
!ls

from utils import authenticate_and_load_env, load_data_from_drive

# Build the Drive API client via the project helper.
# NOTE(review): the helper's name suggests it also loads environment variables,
# which is presumably why GDRIVE_FILE_ID is re-read below — confirm in utils.py.
service = authenticate_and_load_env()

file_id = os.getenv("GDRIVE_FILE_ID")
if not file_id:
    raise ValueError("❌ Error: 'GDRIVE_FILE_ID' is missing or invalid in your .env file.")

df = load_data_from_drive(service, file_id)

# Stop here on failure: previously the failure message was printed and the cell
# then crashed on `df.head()` with an unrelated AttributeError.
if df is None:
    raise RuntimeError("❌ Failed to load data.")

print("✅ Data loaded successfully!")
display(df.head())

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Mounted at /content/drive
Cloning into 'data-split-model-performance-analysis'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 64 (delta 28), reused 46 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 102.16 KiB | 2.92 MiB/s, done.
Resolving deltas: 100% (28/28), done.
/content/data-split-model-performance-analysis/scripts
/content/data-split-model-performance-analysis/scripts
data_preprocessing.py  readme.md	    train_test_split.py  visualization.py
modeling.py	       scaling_analysis.py  utils.py
✅ Google Drive API authenticated successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,856.0,854.0,0.0,3,706.0,0.0,1,0,150.0,0.0,...,False,False,False,False,False,False,False,True,True,False
1,1262.0,0.0,0.0,3,978.0,0.0,0,1,284.0,0.0,...,False,False,False,False,False,False,False,True,True,False
2,920.0,866.0,0.0,3,486.0,0.0,1,0,434.0,0.0,...,False,False,False,False,False,False,False,True,True,False
3,961.0,756.0,0.0,3,216.0,0.0,1,0,540.0,272.0,...,False,False,False,False,False,False,False,True,True,False
4,1145.0,1053.0,0.0,4,655.0,0.0,1,0,490.0,0.0,...,False,False,False,False,False,False,False,True,True,False


## Objective
- Create train/test splits for both encoded and non-encoded datasets
- Ensure reproducibility by setting a random seed (no seed is passed in this notebook, so it must be fixed inside `create_train_test_splits`)

In [2]:
from utils import create_train_test_splits

In [3]:
# Separate the target column from the feature matrix, then hand both to the
# project helper that produces the train/test partitions.
# NOTE(review): no seed or test size is passed here, so both come from the
# helper's own defaults — confirm in utils.py.
target_col = 'SalePrice'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = create_train_test_splits(X, y)

In [4]:
# Persist the four splits next to the project's other files on Google Drive.
# Uses the same 'MyDrive' spelling as the dotenv paths in the first cell, which
# is the mount path that cell is known to have read from successfully.
drive_path = '/content/drive/MyDrive/Professional/Portfolio/test_split'

# Output file name -> object to write; one loop instead of four copy-pasted calls.
splits_to_save = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, split in splits_to_save.items():
    # index=False: the row index is not written to the CSV.
    split.to_csv(os.path.join(drive_path, csv_name), index=False)