<a href="https://colab.research.google.com/github/tharun-2001-talukolla/linear-regression/blob/main/linear_regression_internship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [None]:
# Load the training dataset into `df`.
# A missing file is re-raised with a clearer message instead of calling exit():
# in a notebook kernel exit() asks the kernel itself to shut down, which throws away
# the session state rather than reporting a normal, traceable error for this cell.
try:
    df = pd.read_csv('/content/train.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "train.csv not found. Please ensure the file is in the correct directory."
    ) from err

In [None]:
# Column the model learns to predict.
target = 'SalePrice'

# Predictor columns, in the order they are handed to the model.
features = [
    'GrLivArea',
    'BedroomAbvGr',
    'FullBath',
    'HalfBath',
]


In [None]:
# Fill gaps in the selected columns so the model never sees NaN.
# Numeric columns receive their own mean; any non-numeric column receives its most
# frequent value. This is a deliberately simple strategy, not a tuned imputation.
for column_name in features:
    series = df[column_name]
    if not series.isnull().any():
        continue  # nothing missing in this column
    if pd.api.types.is_numeric_dtype(series):
        replacement = series.mean()
    else:
        replacement = series.mode()[0]
    df[column_name] = series.fillna(replacement)

# The target column gets the same mean-fill treatment when it has gaps.
target_series = df[target]
if target_series.isnull().any():
    df[target] = target_series.fillna(target_series.mean())

In [None]:
# Split the frame into the model inputs (X: the feature columns) and the
# value to predict (y: the target column).
X, y = df[features], df[target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Initialize the Linear Regression model.
# No arguments are passed, so the estimator uses scikit-learn's defaults; it holds no
# learned coefficients until model.fit() is called in the training cell.
model = LinearRegression()

In [None]:
# Train the model on the training split only; X_test / y_test stay unseen so they
# can be used for evaluation afterwards.
model.fit(X_train, y_train)


In [None]:
# Make predictions on the held-out test split; y_pred is compared against y_test
# in the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
# Score the hold-out predictions.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

# Report header.
print("Linear Regression Model Results:")
print("--------------------------------")

# One line per fitted weight, paired with its feature name in training-column order.
print("\nModel Coefficients:")
coefficient_pairs = list(zip(features, model.coef_))
for feature, coef in coefficient_pairs:
    print(f"- {feature}: {coef:.2f}")
print(f"- Intercept: {model.intercept_:.2f}")

# Error metrics computed above.
print("\nModel Evaluation:")
print(f"- Mean Squared Error (MSE): {mse:.2f}")
print(f"- Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"- R-squared (R2): {r2:.2f}")

Linear Regression Model Results:
--------------------------------

Model Coefficients:
- GrLivArea: 101.73
- BedroomAbvGr: -26704.01
- FullBath: 30848.03
- HalfBath: 4611.65
- Intercept: 52830.65

Model Evaluation:
- Mean Squared Error (MSE): 2810942965.22
- Root Mean Squared Error (RMSE): 53018.33
- R-squared (R2): 0.63


In [None]:
# Example prediction: score one hand-written house description with the fitted model.
print("\nExample Prediction:")
example_values = {
    'GrLivArea': 1800,
    'BedroomAbvGr': 3,
    'FullBath': 2,
    'HalfBath': 1,
}
# columns=features lays the single row out in the same column order as the feature list.
example_house = pd.DataFrame([example_values], columns=features)
predicted_price = model.predict(example_house)
print(f"Predicted price for an 1800 sq ft house with 3 bedrooms, 2 full baths, and 1 half bath: ${predicted_price[0]:.2f}")


Example Prediction:
Predicted price for an 1800 sq ft house with 3 bedrooms, 2 full baths, and 1 half bath: $222144.94


In [None]:
# Load the test dataset into `test_df`.
# A missing file is re-raised with a clearer message instead of calling exit():
# in a notebook kernel exit() asks the kernel itself to shut down, which throws away
# the session state (including the fitted model) rather than reporting a normal error.
try:
    test_df = pd.read_csv('/content/test.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "test.csv not found. Please ensure the file is in the correct directory."
    ) from err

In [None]:
# Keep the 'Id' column so every prediction can be matched to its row in the submission file.
test_ids = test_df['Id']

# Feature columns: the same four names, in the same order, as used for training.
features = [
    'GrLivArea',
    'BedroomAbvGr',
    'FullBath',
    'HalfBath',
]

In [None]:
# Select features from the test dataset.
# .copy() makes X_test_new an independent frame: the imputation cell assigns filled
# columns back into X_test_new, and doing that on a bare column selection of test_df
# is chained assignment (pandas emits SettingWithCopyWarning for it).
X_test_new = test_df[features].copy()

In [None]:
# Prepare the test features and make sure a fitted model is available.
#
# 1. Reuse the fitted `model` left by the training cells when there is one; otherwise
#    re-train from the same training file, with the same preprocessing and split.
# 2. Fill missing test values with statistics taken from the TRAINING features, so that
#    nothing is estimated from the rows being predicted.

# This cell runs at notebook top level, so globals() is the namespace the earlier cells
# wrote their variables into; .get() returns None instead of raising when a name is absent.
existing_model = globals().get('model')
train_reference = globals().get('X_train')

# A LinearRegression only gains `coef_` once fit() has run, so this also rejects an
# estimator that was constructed but never trained.
model_is_fitted = isinstance(existing_model, LinearRegression) and hasattr(existing_model, 'coef_')

if not model_is_fitted:
    print("Model not found or not trained. Please run the training script first.")
    # Read the same file the training cell loads. (The 'MultipleFiles' folder is only
    # created further down, by the export step, so it cannot be the source of train.csv.)
    df_train = pd.read_csv('/content/train.csv')
    features_train = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath']
    target_train = 'SalePrice'
    # Same fill rules as the training cells: mean for numeric columns, mode otherwise.
    for col in features_train:
        if df_train[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df_train[col]):
                df_train[col] = df_train[col].fillna(df_train[col].mean())
            else:
                df_train[col] = df_train[col].fillna(df_train[col].mode()[0])
    df_train[target_train] = df_train[target_train].fillna(df_train[target_train].mean())
    # Same test_size and random_state as the original split.
    X_train_re, _, y_train_re, _ = train_test_split(df_train[features_train], df_train[target_train], test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train_re, y_train_re)
    train_reference = X_train_re
    print(" (Model re-trained for demonstration purposes.)")

if not isinstance(train_reference, pd.DataFrame):
    # A fitted model exists but the training frame is no longer in the session:
    # fall back to the test data's own statistics rather than failing.
    train_reference = X_test_new

# Collect one fill value per column that actually has gaps.
fill_values = {}
for col in features:
    if X_test_new[col].isnull().any():
        if pd.api.types.is_numeric_dtype(X_test_new[col]):
            fill_values[col] = train_reference[col].mean()
        else:
            # For non-numeric columns, if any, use the most frequent value.
            fill_values[col] = train_reference[col].mode()[0]

if fill_values:
    # fillna with a {column: value} mapping returns a new frame, so nothing is written
    # into a slice of test_df and re-running this cell gives the same result.
    X_test_new = X_test_new.fillna(value=fill_values)

In [None]:
# Make predictions on the new test data (X_test_new: the feature columns selected
# from the test file); one predicted value is produced per row.
predictions = model.predict(X_test_new)

In [None]:
# A house price cannot be negative: replace every prediction below zero with 0
# and leave all other values untouched.
predictions = np.where(predictions < 0, 0.0, predictions)

In [None]:
# Pair each test-row Id with its predicted price in a two-column submission table.
submission_columns = {
    'Id': test_ids,
    'SalePrice': predictions,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Show a short preview of the submission table as a sanity check.
first_rows = submission_df.head()
print("\nPredictions for the Test Dataset (first 5 rows):")
print(first_rows)


Predictions for the Test Dataset (first 5 rows):
     Id      SalePrice
0  1461  121423.030985
1  1462  143380.870622
2  1463  204748.668874
3  1464  202205.354725
4  1465  191336.364775


In [None]:
import os

# Folder that will hold the exported predictions.
output_dir = 'MultipleFiles'

# Create (and announce) the folder only when nothing exists at that path yet.
directory_missing = not os.path.exists(output_dir)
if directory_missing:
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

Created directory: MultipleFiles


In [None]:
# Write the submission table to disk; index=False keeps the DataFrame's row index
# out of the file so it contains only the table's own columns.
output_filename = 'MultipleFiles/test_predictions.csv'
submission_df.to_csv(path_or_buf=output_filename, index=False)

In [None]:
print(f"\nPredictions saved to {output_filename}")


Predictions saved to MultipleFiles/test_predictions.csv
