"""
Objective:
- Fit a linear regression model on the training data
- Evaluate the model on train and test sets using MSE
"""

from sklearn.linear_model import LinearRegression
from utils import calculate_mse

# Fit the linear regression model on the training split.
# (fit() returns the estimator itself, so construction and fitting can be chained.)
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits, in train-then-test order.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# Score each split by comparing actual targets with the predictions.
train_mse = calculate_mse(y_train, y_train_pred)
test_mse = calculate_mse(y_test, y_test_pred)

# Report both errors, one per line.
print(f'Training MSE: {train_mse}', f'Test MSE: {test_mse}', sep='\n')


# Data Modeling

## Data Loading

In [2]:
!pip install python-dotenv google-api-python-client

from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_X_train")
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_X_test")
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_y_train")
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_y_test")
x_train_id = os.getenv("X_TRAIN_ID")
x_test_id = os.getenv("X_TEST_ID")
y_train_id = os.getenv("Y_TRAIN_ID")
y_test_id = os.getenv("Y_TEST_ID")

load_dotenv("/content/drive/MyDrive/Professional/Portfolio/test_split/.env_github")
github_pat = os.getenv("GITHUB_PAT")

if not x_train_id or not x_test_id or not y_train_id or not y_test_id:
    raise ValueError("❌ Error: One or more environment variables are missing or invalid.")
if not github_pat:
    raise ValueError("❌ Error: 'GITHUB_PAT' is missing or invalid in your .env file.")

!git clone https://{github_pat}@github.com/vmagdale2/data-split-model-performance-analysis.git

import sys
sys.path.append('/content/data-split-model-performance-analysis/scripts')
%cd /content/data-split-model-performance-analysis/scripts
!pwd
!ls

from utils import authenticate_and_load_env, load_data_from_drive

service = authenticate_and_load_env()

x_train_id = os.getenv("X_TRAIN_ID")
x_test_id = os.getenv("X_TEST_ID")
y_train_id = os.getenv("Y_TRAIN_ID")
y_test_id = os.getenv("Y_TEST_ID")

df_X_train = load_data_from_drive(service, x_train_id)
df_X_test = load_data_from_drive(service, x_test_id)
df_y_train = load_data_from_drive(service, y_train_id)
df_y_test = load_data_from_drive(service, y_test_id)

print("✅ Data loaded successfully!")

display(df_X_train.head())
display(df_X_test.head())
display(df_y_train.head())
display(df_y_test.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path 'data-split-model-performance-analysis' already exists and is not an empty directory.
/content/data-split-model-performance-analysis/scripts
/content/data-split-model-performance-analysis/scripts
data_preprocessing.py  readme.md	    train_test_split.py  visualization.py
modeling.py	       scaling_analysis.py  utils.py
✅ Google Drive API authenticated successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!
✅ Data loaded successfully!


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,630.0,0.0,0.0,1,515.0,0.0,1,0,115.0,0.0,...,False,False,False,False,False,False,False,True,True,False
1,845.0,0.0,0.0,3,0.0,0.0,0,0,0.0,0.0,...,False,False,False,False,False,False,False,True,True,False
2,728.0,728.0,0.0,3,0.0,0.0,0,0,728.0,0.0,...,False,False,False,False,False,False,False,True,True,False
3,561.0,668.0,0.0,2,285.0,0.0,0,0,276.0,0.0,...,False,False,False,False,False,False,False,True,True,False
4,1601.0,0.0,0.0,3,1358.0,0.0,1,0,223.0,0.0,...,False,False,False,False,False,False,False,True,True,False


Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,1518.0,0.0,0.0,1,1218.0,0.0,0,0,300.0,0.0,...,False,False,False,False,False,False,False,True,True,False
1,925.0,0.0,0.0,2,338.0,466.0,0,1,121.0,0.0,...,False,False,False,False,False,False,False,True,True,False
2,1095.0,679.0,0.0,4,0.0,0.0,1,0,1095.0,90.0,...,False,False,False,False,False,False,False,True,True,False
3,888.0,868.0,0.0,3,742.0,0.0,1,0,130.0,0.0,...,False,False,False,False,False,False,False,True,True,False
4,1337.0,0.0,0.0,3,699.0,0.0,1,0,638.0,0.0,...,False,False,False,False,False,False,False,True,True,False


Unnamed: 0,SalePrice
0,86000.0
1,84000.0
2,176000.0
3,124000.0
4,272000.0


Unnamed: 0,SalePrice
0,274000.0
1,117500.0
2,87000.0
3,204000.0
4,185000.0


## Objective
- Fit a linear regression model on the training data
- Evaluate the model on train and test sets using MSE

In [3]:
from sklearn.linear_model import LinearRegression
from utils import calculate_mse

In [4]:
# Create an ordinary least-squares regressor with default settings.
model = LinearRegression()
# Fit on the training features and targets; the call is left as the cell's
# final expression so the notebook still displays the fitted estimator.
model.fit(X=df_X_train, y=df_y_train)

In [5]:
# Generate predictions for the training split first, then the test split.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (df_X_train, df_X_test)
)

In [6]:
# Score each split with the project's MSE helper: (actual, predicted) pairs,
# evaluated in train-then-test order.
train_mse, test_mse = (
    calculate_mse(actual, predicted)
    for actual, predicted in (
        (df_y_train, y_train_pred),
        (df_y_test, y_test_pred),
    )
)

In [7]:
# Report both errors, one per line (training first, then test).
print(f'Training MSE: {train_mse}', f'Test MSE: {test_mse}', sep='\n')

Training MSE: 344226561.92392045
Test MSE: 2845129573.53417
