In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression ,Ridge

In [None]:
# Location of the cleaned car dataset inside the Colab runtime.
DATA_PATH = "/content/Cleaned CAR_Dataset.csv"

# Load the dataset into a DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,HyundaiSantroXing,Hyundai,2007,80000,45000,Petrol
1,1,MahindraJeepCL550,Mahindra,2006,425000,40,Diesel
2,2,HyundaiGrandi10,Hyundai,2014,325000,28000,Petrol
3,3,FordEcoSportTitanium,Ford,2014,575000,36000,Diesel
4,4,FordFigo,Ford,2012,175000,41000,Diesel


In [None]:
# Remove the "Unnamed: ..." columns shown in the preview (presumably row
# indices that were written into the CSV as data -- confirm against the file).
# errors="ignore" keeps this cell safe to re-run: a second execution no longer
# raises KeyError when the columns are already gone.
df = df.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

In [None]:
# Re-check the first five rows after dropping the unnamed column.
df.head(n=5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,HyundaiSantroXing,Hyundai,2007,80000,45000,Petrol
1,MahindraJeepCL550,Mahindra,2006,425000,40,Diesel
2,HyundaiGrandi10,Hyundai,2014,325000,28000,Petrol
3,FordEcoSportTitanium,Ford,2014,575000,36000,Diesel
4,FordFigo,Ford,2012,175000,41000,Diesel


In [None]:
# Target: the price column. Features: every remaining column.
TARGET_COLUMN = 'Price'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes this split (and every number computed from it)
# identical after "Restart & Run All"; without it each run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Column groups handed to the ColumnTransformer.
# Pass column *names* (strings) here, not DataFrame slices such as df['Price'].
numeric_columns = [
    'year',
    'kms_driven',
]
categorical_columns = [
    'company',
    'name',
    'fuel_type',
]

In [None]:
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present during fit instead of raising an error.
numeric_step = ("numeric:", StandardScaler(), numeric_columns)
categorical_step = (
    "Catagorical :",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_columns,
)

# remainder="drop": any column not listed in either group is discarded.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

In [None]:

# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline_steps = [
    ("Transformer", preprocessor),
    ("Linear Regression", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the preprocessing steps and the linear model on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Show only a short preview; dumping the full array floods the notebook output.
y_pred[:10]

array([ 267951.88588935,  633983.54577061,  602870.0091738 ,
         41812.54001954,  198342.76996163,  555864.62212241,
        428977.22768366,  365839.14591809,  263992.07196722,
        310113.43398044,  273737.27931245,  365607.1971362 ,
        416388.26863731,  517858.5304479 ,  424024.10430501,
        548077.67353692,  350147.69474572,   38373.55554069,
        692210.55877023,  341248.42639248,  728301.68419655,
        413862.75669065,  333541.04234742,  181996.43884342,
        259764.56322455,  407077.85965015,  292372.2933645 ,
        204012.63166899,  303911.28985615,  506823.08672379,
        298873.8335245 ,  125506.38203122,  501002.65263936,
        282201.81925442,  700539.62147476,  503293.30249124,
        310443.19245151,  216999.19814831,  258845.31007012,
        713586.816244  ,  370337.82545168,  310279.62036621,
        382473.81995276, 1443441.26613049,  350122.85305445,
        170500.70336843,  229998.28982919,  216599.45913205,
        213437.79789198,

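**THE CODE BELOW TRIES 100 DIFFERENT `random_state` VALUES FOR THE TRAIN/TEST SPLIT AND FINDS THE ONE THAT GIVES THE BEST R2 SCORE.** Note: because the split is chosen by looking at the test score, the best value is an optimistic estimate of how the model will perform on new data.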



In [None]:
# Measure how much the test R2 varies with the train/test split.
# score[i] is the R2 obtained with random_state=i, so the list index doubles
# as the seed. Every seed is fixed, so re-running this cell reproduces the
# same scores.
def _r2_for_seed(seed):
    """Return the test-set R2 of a pipeline fitted on the split made with `seed`.

    The split, the pipeline and the predictions are local to this function, so
    the search does not overwrite the notebook-level X_train / X_test /
    y_train / y_test / model / y_pred variables.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    candidate = Pipeline(steps=[
        ("Transformer", preprocessor),
        ("Linear Regression", LinearRegression())])
    candidate.fit(X_tr, y_tr)
    return r2_score(y_te, candidate.predict(X_te))


score = [_r2_for_seed(seed) for seed in range(100)]

In [None]:
# np.argmax gives the position of the highest score in the list;
# indexing with it displays that highest R2 value.
best_index = np.argmax(score)
score[best_index]


0.8535977844219308

**MODEL EVALUATION**

In [None]:
# Rebuild the split whose seed produced the highest R2 in `score`
# (the index into `score` is the random_state that was used).
# NOTE(review): this seed was picked by looking at the test score, so the
# figure printed below is an optimistic estimate -- confirm this is intended.
best_seed = np.argmax(score)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=best_seed
)

# Fit a fresh pipeline on that split and predict the held-out rows.
model = Pipeline(
    steps=[
        ("Transformer", preprocessor),
        ("Linear Regression", LinearRegression()),
    ]
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report R2 scaled by 100 and rounded to two decimals.
r2score = r2_score(y_test, y_pred)
print("R2 Score:", np.round(r2score * 100, 2), "%")



R2 Score: 85.36 %


In [None]:
import pickle

# Persist the fitted pipeline (preprocessing + regressor) to disk.
# pickle.dump() returns None, so its result must not be assigned back to
# `model`: doing so would replace the trained pipeline with None for every
# later cell.
with open("LinearRegression_CarPrice.pkl", 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Colab helper module used to send files from the runtime to the browser.
from google.colab import files
# Disabled: uncomment to download the pickled model file.
#files.download("LinearRegression_CarPrice.pkl")

In [None]:
from google.colab import files

# Download the pickled pipeline. "LinearRegression_CarPrice.pkl" is the only
# file this notebook writes; no cell creates a ".ipynb" file, so requesting
# that name would presumably fail because the file does not exist in the runtime.
files.download("LinearRegression_CarPrice.pkl")