In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset
housing = (as_frame=True)
data = housing.frame

# Display dataset information
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
print(data.info())

# Define features (X) and target (y)
X = data.drop(['MedHouseVal'], axis=1)  # Features
y = data['MedHouseVal']  # Target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Example usage: Predicting house price for new data
example_data = pd.DataFrame({
    'MedInc': [8.0],        # Median Income
    'HouseAge': [30.0],     # Median House Age
    'AveRooms': [6.0],      # Average Rooms
    'AveBedrms': [1.2],     # Average Bedrooms
    'Population': [1000.0], # Population
    'AveOccup': [3.0],      # Average Occupants
    'Latitude': [34.0],     # Latitude
    'Longitude': [-118.0]   # Longitude
})

predicted_price = model.predict(example_data)
print(f"\nPredicted House Price for example data: ${predicted_price[0]*100000:.2f}")


Dataset Head:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-

In [43]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
print(os.getcwd())

# Load the dataset
#data_path = '/mnt/data/House Price Prediction Dataset.csv'  # Update the file path as needed
data = pd.read_csv(r'C:\Users\varik\Downloads\House Price Prediction Dataset.csv')

# Display basic information about the dataset
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
print(data.info())

# Handle missing values if any
data = data.dropna()

# Define features (X) and target (y)
# Assuming the dataset has 'Price' as the target variable, and the rest are features
X = data.drop(['Price'], axis=1)  # Replace 'Price' with the actual column name if different
y = data['Price']

# Convert categorical columns to numerical using one-hot encoding (if applicable)
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)st, y_pre

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Example usage: Predicting house price for new data
# Replace with your own input data structure if needed
example_data = pd.DataFrame({
    'Size': [2000],  # Example value for Size
    'Location': ['Hyderabad '],  # Example value for Location
    # Add all features present in your dataset here
})
# Ensure that example_data columns match the trained model's feature set
example_data = pd.get_dummies(example_data, drop_first=True)
example_data = example_data.reindex(columns=X.columns, fill_value=0)

predicted_price = model.predict(example_data)
print(f"\nPredicted House Price for example data: ${predicted_price[0]:.2f}")


C:\Users\varik
Dataset Head:
   Id  Area  Bedrooms  Bathrooms  Floors  YearBuilt  Location  Condition  \
0   1  1360         5          4       3       1970  Downtown  Excellent   
1   2  4272         5          4       3       1958  Downtown  Excellent   
2   3  3592         2          2       3       1938  Downtown       Good   
3   4   966         4          2       2       1902  Suburban       Fair   
4   5  4926         1          4       2       1975  Downtown       Fair   

  Garage   Price  
0     No  149919  
1     No  424998  
2     No  266746  
3    Yes  244020  
4    Yes  636056  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         2000 non-null   int64 
 1   Area       2000 non-null   int64 
 2   Bedrooms   2000 non-null   int64 
 3   Bathrooms  2000 non-null   int64 
 4   Floors     2000 non-null   int64 
 5   

In [3]:
#we first import libraries 
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
#get current file path
print(os.getcwd())

C:\Users\varik


In [9]:
# Load the dataset
#data_path = '/mnt/data/House Price Prediction Dataset.csv'  # Update the file path as needed
data = pd.read_csv(r'C:\Users\varik\Downloads\House Price Prediction Dataset.csv')
data

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056
...,...,...,...,...,...,...,...,...,...,...
1995,1996,4994,5,4,3,1923,Suburban,Poor,No,295620
1996,1997,3046,5,2,1,2019,Suburban,Poor,Yes,580929
1997,1998,1062,5,1,2,1903,Rural,Poor,No,476925
1998,1999,4062,3,1,2,1936,Urban,Excellent,Yes,161119


In [11]:
# Display basic information about the dataset
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
print(data.info())

Dataset Head:
   Id  Area  Bedrooms  Bathrooms  Floors  YearBuilt  Location  Condition  \
0   1  1360         5          4       3       1970  Downtown  Excellent   
1   2  4272         5          4       3       1958  Downtown  Excellent   
2   3  3592         2          2       3       1938  Downtown       Good   
3   4   966         4          2       2       1902  Suburban       Fair   
4   5  4926         1          4       2       1975  Downtown       Fair   

  Garage   Price  
0     No  149919  
1     No  424998  
2     No  266746  
3    Yes  244020  
4    Yes  636056  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         2000 non-null   int64 
 1   Area       2000 non-null   int64 
 2   Bedrooms   2000 non-null   int64 
 3   Bathrooms  2000 non-null   int64 
 4   Floors     2000 non-null   int64 
 5   YearBuilt  2000

In [13]:
# Handle missing values if any
data = data.dropna()

In [15]:
# Define features (X) and target (y)
# Assuming the dataset has 'Price' as the target variable, and the rest are features
X = data.drop(['Price'], axis=1)
y = data['Price']

In [17]:
# Convert categorical columns to numerical using one-hot encoding
X= pd.get_dummies(X, drop_first=True)

In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [23]:
# Make predictions on the test set
y_pred = model.predict(X_test)


In [27]:
# Evaluate the model
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")



Model Performance:
Mean Squared Error: 78279764120.86
R-squared: -0.01


In [46]:
example_data = pd.DataFrame({
    'Size': [2000],  # Example value for Size
    'Location': ['Hyderabad '],  # Example value for Location
    # Add all features present in your dataset here
})
# Ensure that example_data columns match the trained model's feature set
example_data = pd.get_dummies(example_data, drop_first=True)
example_data = example_data.reindex(columns=X.columns, fill_value=0)

predicted_price = model.predict(example_data)
print(f"\nPredicted House Price for example data: ${predicted_price[0]:.2f}")



Predicted House Price for example data: $292657.06
