In [7]:
# Import necessary libraries
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.compose import *
from sklearn.model_selection import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.neighbors import *
from sklearn.naive_bayes import *
from sklearn.pipeline import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.linear_model import *
import os

# Silence every warning for the rest of the session so that library warnings
# do not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so it also hides warnings that may be
# useful while debugging — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [8]:

# Folder that holds the housing CSV files. It defaults to the original
# location; set the HOUSING_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get("HOUSING_DATA_DIR", r"D:\Datasets")

# Later cells open 'tst_Housing.csv' by its bare file name, so the working
# directory still has to point at the dataset folder.
os.chdir(data_dir)

# Read the training data; the 'price' column is used as the target further down.
df = pd.read_csv("Housing.csv")

In [13]:
# Columns to one-hot encode, and columns to hand through unchanged.
encode_cols = ['driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'prefarea']
keep_cols = ['price', 'lotsize', 'bedrooms', 'bathrms', 'stories', 'garagepl']

# Dense (non-sparse) one-hot encoder that returns a pandas DataFrame.
# No `drop` is given, so every category gets its own column.
ohc = OneHotEncoder(sparse_output=False)
ohc = ohc.set_output(transform='pandas')

# The ColumnTransformer applies a different treatment to each column group:
#   * `encode_cols` go through the one-hot encoder,
#   * `keep_cols` are passed through as they are.
# verbose_feature_names_out=False keeps the output column names free of a
# transformer-name prefix, and set_output makes the result a DataFrame.
ct = make_column_transformer(
    (ohc, encode_cols),
    ('passthrough', keep_cols),
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Approach 1: learn the encoding from df and apply it in one step.
dum_df_1 = ct.fit_transform(df)
print(dum_df_1.columns)

# Approach 2: let pandas build the dummy columns, dropping the first level of
# each categorical column.
dum_df_2 = pd.get_dummies(df, drop_first=True)
print(dum_df_2.columns)

Index(['driveway_no', 'driveway_yes', 'recroom_no', 'recroom_yes',
       'fullbase_no', 'fullbase_yes', 'gashw_no', 'gashw_yes', 'airco_no',
       'airco_yes', 'prefarea_no', 'prefarea_yes', 'price', 'lotsize',
       'bedrooms', 'bathrms', 'stories', 'garagepl'],
      dtype='object')
Index(['price', 'lotsize', 'bedrooms', 'bathrms', 'stories', 'garagepl',
       'driveway_yes', 'recroom_yes', 'fullbase_yes', 'gashw_yes', 'airco_yes',
       'prefarea_yes'],
      dtype='object')


In [14]:
# Features are every dummy-encoded column except the target; 'price' is the target.
x = dum_df_2.drop(columns='price')
y = dum_df_2['price']

# 'price' is a continuous amount, so this is a regression task. A
# LogisticRegression would treat every distinct price as a separate class,
# hence LinearRegression is used here (the same estimator as in the
# ColumnTransformer-based solution further down).
lr = LinearRegression()
lr.fit(x, y)


#### unlabeled data

# tst_Housing.csv comes from the same dataset but was deliberately supplied
# with mismatched rows/columns so that the problem below can be demonstrated.
tst = pd.read_csv('tst_Housing.csv')

# Keep the test dummies under their own name: rebinding dum_df_2 here would
# make this cell fail on a second run, because the test frame has no 'price'.
tst_dummies = pd.get_dummies(tst, drop_first=True)
print(tst_dummies.columns)

# get_dummies only knows the columns/levels present in the frame it is given,
# so the test frame can end up with fewer dummy columns than the model was
# fitted on. The resulting ValueError is the point of this cell; it is caught
# and shown so that "Restart & Run All" can continue to the solution cells.
try:
    y_pred = lr.predict(tst_dummies)
except ValueError as err:
    print(f"ValueError: {err}")

Index(['lotsize', 'bedrooms', 'bathrms', 'stories', 'garagepl', 'recroom_yes',
       'fullbase_yes'],
      dtype='object')


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- airco_yes
- driveway_yes
- gashw_yes
- prefarea_yes


# solving this error with OneHotEncoder and ColumnTransformer

In [4]:
# Split the training frame into the target ('price') and the feature columns.
y = df['price']
x = df.drop(columns='price')

# One-hot encoder for the categorical columns:
#   * sparse_output=False -> dense output, easier to inspect,
#   * drop='first'        -> the first category of each column is left out,
#   * set_output          -> the result is a pandas DataFrame.
ohc = OneHotEncoder(drop='first', sparse_output=False).set_output(transform='pandas')

# Column selection is done by dtype instead of by name:
#   * object-dtype columns are one-hot encoded,
#   * all remaining columns are passed through unchanged.
# verbose_feature_names_out=False keeps the output column names free of a
# transformer-name prefix (e.g. 'driveway_yes').
ct = make_column_transformer(
    (ohc, make_column_selector(dtype_include=object)),
    ('passthrough', make_column_selector(dtype_exclude=object)),
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Fitting on the training features is where the transformer learns the columns
# and their categories.
x_trans = ct.fit_transform(x)
print(x_trans.columns)

tst = pd.read_csv('tst_Housing.csv')

# transform() reuses what was learned during fit, so the test frame comes out
# with the same columns as x_trans even where get_dummies produced none.
tst_trans = ct.transform(tst)
print(tst_trans.columns)

# Train the regression on the encoded training data (fit returns the
# estimator itself), then predict 'price' for every row of the encoded
# unlabeled data.
lr = LinearRegression().fit(x_trans, y)
y_pred = lr.predict(tst_trans)
y_pred

Index(['driveway_yes', 'recroom_yes', 'fullbase_yes', 'gashw_yes', 'airco_yes',
       'prefarea_yes', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'garagepl'],
      dtype='object')
Index(['driveway_yes', 'recroom_yes', 'fullbase_yes', 'gashw_yes', 'airco_yes',
       'prefarea_yes', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'garagepl'],
      dtype='object')


array([65151.39992985, 42650.08901117, 41137.92877603, 76249.38318098])

# Conclusion: both sets (x_trans and tst_trans) have the same column schema.

# Solving this problem using a Pipeline

In [5]:
from sklearn.pipeline import Pipeline

# Chain the column transformer and the regressor into one estimator, so the
# raw (untransformed) frames can be passed straight to fit and predict.
pipe = Pipeline(steps=[('TRNS', ct), ('LR', lr)])

# fit returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(x, y).predict(tst)
y_pred

array([65151.39992985, 42650.08901117, 41137.92877603, 76249.38318098])