In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CO2-emissions table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="CO2_emission.csv")

In [None]:
# Size check on the freshly loaded frame: (number of rows, number of columns).
df.shape

(7385, 12)

In [None]:
# Preview the first five raw records to see the column names and typical values.
df.head(n=5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [None]:
# Column-by-column summary (non-null counts and dtypes) of the raw frame; used to
# spot missing values and to see which columns are text (object) vs numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: float64(4), int64(3), object(5)

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

1103

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebinding `df` (instead of `inplace=True`) avoids mutating the frame object that
# earlier cells displayed and keeps the step chainable; the resulting rows and the
# original (now gapped) index labels are the same as with the in-place call.
df = df.drop_duplicates()

In [None]:
# Discard the text descriptor columns that are not used as model features.
# - `columns=` replaces the positional list + `axis=1` for readability.
# - Rebinding `df` instead of `inplace=True` avoids hidden in-place mutation.
# - `errors='ignore'` makes the cell idempotent: re-running it after the columns
#   are already gone is a no-op instead of raising KeyError.
# NOTE(review): rows that differed only in these three columns become exact
# duplicates after this step; consider de-duplicating again before the
# train/test split if repeated rows across the split are not desired - confirm.
df = df.drop(columns=['Make', 'Model', 'Vehicle Class'], errors='ignore')

In [None]:
# Collect, in column order, the names of all columns stored with object dtype;
# these are the text-valued columns that still need encoding.
cat_col = []
for name, dtype in df.dtypes.items():
    if dtype == 'O':
        cat_col.append(name)
cat_col

['Transmission', 'Fuel Type']

In [None]:
# Show the distinct raw values of every categorical column before encoding.
for name in cat_col:
    distinct_values = df[name].unique()
    print(name, distinct_values)

Transmission ['AS5' 'M6' 'AV7' 'AS6' 'AM6' 'A6' 'AM7' 'AV8' 'AS8' 'A7' 'A8' 'M7' 'A4'
 'M5' 'AV' 'A5' 'AS7' 'A9' 'AS9' 'AV6' 'AS4' 'AM5' 'AM8' 'AM9' 'AS10'
 'A10' 'AV10']
Fuel Type ['Z' 'D' 'X' 'E' 'N']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# A separate encoder is fitted per column and kept in `encoders`, keyed by column
# name, so every mapping can later be inspected (`encoders[col].classes_`),
# inverted (`inverse_transform`) or applied to new data. Refitting one shared
# encoder would leave only the last column's mapping available.
# `le` is still bound after the loop (to the last column's encoder), as before.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target values;
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents - consider
# switching if the arbitrary integer ordering matters for the model.
encoders = {}
for col in cat_col:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Re-check the frame after cleaning and encoding: every remaining column should
# now report a numeric dtype and the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6282 entries, 0 to 7384
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Engine Size(L)                    6282 non-null   float64
 1   Cylinders                         6282 non-null   int64  
 2   Transmission                      6282 non-null   int64  
 3   Fuel Type                         6282 non-null   int64  
 4   Fuel Consumption City (L/100 km)  6282 non-null   float64
 5   Fuel Consumption Hwy (L/100 km)   6282 non-null   float64
 6   Fuel Consumption Comb (L/100 km)  6282 non-null   float64
 7   Fuel Consumption Comb (mpg)       6282 non-null   int64  
 8   CO2 Emissions(g/km)               6282 non-null   int64  
dtypes: float64(4), int64(5)
memory usage: 490.8 KB


In [None]:
# Preview the first five rows of the encoded, model-ready frame.
df.head(n=5)

Unnamed: 0,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,2.0,4,14,4,9.9,6.7,8.5,33,196
1,2.4,4,25,4,11.2,7.7,9.6,29,221
2,1.5,4,22,4,6.0,5.8,5.9,48,136
3,3.5,6,15,4,12.7,9.1,11.1,25,255
4,3.5,6,15,4,12.1,8.7,10.6,27,244


In [None]:
# Separate the regression target (CO2 emissions) from the predictor columns.
target = 'CO2 Emissions(g/km)'
y = df[target]
X = df.drop(columns=target)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)
X_train, X_test, y_train, y_test = splits

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Baseline gradient-boosting regressor with library-default hyper-parameters.
# `random_state` is fixed (same seed as the train/test split) so that repeated
# runs - and the clones GridSearchCV makes of this estimator - build the same
# trees; without it the feature permutation used when searching splits is
# unseeded and scores can drift between runs.
model = GradientBoostingRegressor(random_state=40)

In [None]:
# Train the baseline model on the training split (fit returns the estimator itself).
model = model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows with the baseline model.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the baseline model on the held-out test set.
baseline_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(baseline_r2)

0.9967535315086232


In [None]:
# Hyper-parameter grid for the search: 3 ensemble sizes x 4 learning rates
# (12 combinations). Key order and value order are kept as-is because they
# determine the order in which candidates are evaluated.
param_grid = dict(
    n_estimators=[100, 250, 400],
    learning_rate=[0.01, 0.1, 0.05, 0.3],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid`, scoring each candidate by R^2 with
# 5-fold cross-validation.
grid = GridSearchCV(
    model,
    param_grid,
    scoring='r2',
    cv=5,
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
grid = grid.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
# NOTE(review): in the recorded output both values sit on the upper edge of the
# grid (learning_rate=0.3, n_estimators=400) - consider widening the grid.
grid.best_params_

{'learning_rate': 0.3, 'n_estimators': 400}

In [None]:
# Mean cross-validated R^2 (5 folds, training data only) of the best combination.
grid.best_score_

0.9967408377144873

In [None]:
# Predict the held-out test rows with the best estimator found by the grid search.
y_pred_grid = grid.predict(X=X_test)

In [None]:
# R^2 of the tuned model on the held-out test set, for comparison with the baseline.
tuned_r2 = r2_score(y_true=y_test, y_pred=y_pred_grid)
print(tuned_r2)

0.9976957434286017
