In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('TkAgg')  

In [3]:
df = pd.read_csv("train.csv")

In [4]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
df.shape

(1460, 81)

In [6]:
print("Data types of columns:")
print(df.dtypes)

Data types of columns:
Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


In [7]:
plt.figure(figsize=(10, 6))
plt.scatter(df['YearBuilt'], df['OverallQual'], alpha=0.5)
plt.title('Year Built vs Overall Quality')
plt.xlabel('Year Built')
plt.ylabel('Overall Quality')
plt.grid(True)
plt.tight_layout()

In [8]:
plt.savefig('year_built_vs_overall_quality.png')
print("Plot saved as 'year_built_vs_overall_quality.png'")

Plot saved as 'year_built_vs_overall_quality.png'


In [9]:
plt.show()

In [10]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [11]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
print(df.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [13]:
print("Shape of data:", df.shape)

Shape of data: (1460, 81)


In [14]:
X = df

In [15]:
print("Shape of features (X):", X.shape)

Shape of features (X): (1460, 81)


In [16]:
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [17]:
print("Shape of training set (X_train):", X_train.shape)
print("Shape of testing set (X_test):", X_test.shape)

Shape of training set (X_train): (1168, 81)
Shape of testing set (X_test): (292, 81)


In [None]:
from sklearn.linear_model import LinearRegression

In [34]:
# Step 1: Define target variable and predictor variables
target_variable = 'OverallQual'  # Assuming 'OverallQual' is your target variable
predictor_variables = ['YearBuilt']  # Assuming 'YearBuilt' is your predictor variable

# Step 2: Split the data into training and testing sets
X_train = X_train[predictor_variables]
y_train = X_train[target_variable]
X_test = X_test[predictor_variables]
y_test = X_test[target_variable]

# Step 3: Train the linear regression model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = regr.predict(X_test)

# Print coefficients
print('Coefficients:', regr.coef_)
print('Intercept:', regr.intercept_)

# Print mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

# Print coefficient of determination (R^2 score)
print('Coefficient of determination (R^2 score):', r2_score(y_test, y_pred))

# Plot outputs
plt.scatter(X_test, y_test,  color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.xlabel('Year Built')
plt.ylabel('Overall Quality')
plt.title('Linear Regression: Year Built vs Overall Quality')
plt.show()


KeyError: 'OverallQual'

In [35]:
print(df.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [45]:
target_variable = 'SalePrice'  
predictor_variables = ['YearBuilt']  

In [46]:
X_train = df[predictor_variables]
y_train = df[target_variable]

In [47]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [48]:
y_pred = regr.predict(X_train)

In [49]:
print('Coefficients:', regr.coef_)
print('Intercept:', regr.intercept_)

Coefficients: [1375.37346794]
Intercept: -2530308.2457323517


In [55]:
print('Mean squared error:', mean_squared_error(y_train, y_pred))

Mean squared error: 4582376228.725916


In [51]:
print('Coefficient of determination (R^2 score):', r2_score(y_train, y_pred))

Coefficient of determination (R^2 score): 0.27342162073249154


In [52]:
plt.scatter(X_train, y_train,  color='black')
plt.plot(X_train, y_pred, color='blue', linewidth=3)
plt.xlabel('Year Built')
plt.ylabel('Sale Price')
plt.title('Linear Regression: Year Built vs Sale Price')

Text(0.5, 1.0, 'Linear Regression: Year Built vs Sale Price')

In [54]:
plt.show()

In [57]:
from math import sqrt

target_variable = 'SalePrice'  
predictor_variables = ['YearBuilt'] 

X_train = df[predictor_variables]
y_train = df[target_variable]

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_train)

rmse = sqrt(mean_squared_error(y_train, y_pred))

print('Coefficients:', regr.coef_)
print('Intercept:', regr.intercept_)

print('Mean squared error:', mean_squared_error(y_train, y_pred))

print('Root mean squared error (RMSE):', rmse)

print('Coefficient of determination (R^2 score):', r2_score(y_train, y_pred))

plt.scatter(X_train, y_train,  color='black')
plt.plot(X_train, y_pred, color='blue', linewidth=3)
plt.xlabel('Year Built')
plt.ylabel('Sale Price')


Coefficients: [1375.37346794]
Intercept: -2530308.2457323517
Mean squared error: 4582376228.725916
Root mean squared error (RMSE): 67693.25098357971
Coefficient of determination (R^2 score): 0.27342162073249154


Text(0, 0.5, 'Sale Price')

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv("train.csv")

features = ['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageArea']
target_variable = 'SalePrice'

X = df[features]
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("Root mean squared error (RMSE):", rmse)
print("Coefficient of determination (R^2 score):", r2)

plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.title('Random Forest Regression: Predicted vs Actual Sale Prices')
plt.show()


Mean squared error: 988675577.864515
Root mean squared error (RMSE): 31443.211952097306
Coefficient of determination (R^2 score): 0.8711038421106364


In [59]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

model = SVR()

try:

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

except Exception as e:
    print("An error occurred during model training:", e)


Mean Squared Error: 7846787244.336239
