# Practice Assignment

## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV


## Q1 and Q2

- Load the wine dataset from sklearn.
- Split the dataset into train and test sets in a 70:30 ratio with `random_state = 1`.
- Use AdaBoostClassifier with `random_state = 1` (all other parameters left at their defaults).
- Train the model and compute its score on the training data and on the test data.

In [None]:
# Load the wine data as a (features, target) pair; as_frame=True requests pandas objects.
X, y = load_wine(return_X_y=True, as_frame=True)

In [None]:
# Show the (rows, columns) dimensions of the feature table.
X.shape

(178, 13)

In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# AdaBoost classifier with every setting left at its default except the seed.
classifier = AdaBoostClassifier(random_state=1)

In [None]:
# Train on the training split; the fitted estimator is displayed as the cell output.
classifier.fit(X=X_train, y=y_train)

AdaBoostClassifier(random_state=1)

In [None]:
# Score of the fitted classifier on the training split.
classifier.score(X_train, y_train)

0.6774193548387096

In [None]:
# Score of the fitted classifier on the held-out test split.
classifier.score(X_test, y_test)

0.5370370370370371

## Q3-Q5

In [None]:
# Candidate hyper-parameters: 3 ensemble sizes x 3 learning rates (9 combinations).
param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.5, 1, 2],
)

# 4-fold cross-validated search over the grid, also recording training-fold scores.
ada_gridcv = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    cv=4,
    return_train_score=True,
)
ada_gridcv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=AdaBoostClassifier(random_state=1),
             param_grid={'learning_rate': [0.5, 1, 2],
                         'n_estimators': [100, 500, 1000]},
             return_train_score=True)

In [None]:
# Score of the fitted grid search on the held-out test split.
ada_gridcv.score(X_test, y_test)

0.9629629629629629

In [None]:
# Parameter combination selected by the cross-validated search.
ada_gridcv.best_params_

{'learning_rate': 0.5, 'n_estimators': 100}

## Q6-Q7

Write code to compute the score on the training set and on the test set using a Voting Classifier on the wine dataset. Base your code on the following key points:

- Split the dataset into train and test sets in a 70:30 ratio with `random_state = 1`.
- Import the model. Use BaggingClassifier, RandomForestClassifier and GradientBoostingClassifier as the estimators, each with `random_state = 1`.
- Train the model and compute its score on the training data and on the test data.

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Build the three base learners for the voting ensemble, each seeded with random_state=1.
clf1, clf2, clf3 = (
    estimator_cls(random_state=1)
    for estimator_cls in (
        BaggingClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Combine the three base learners into a single voting ensemble (other settings
# left at their defaults) and train it on the training split.
eclf = VotingClassifier(
    estimators=[
        ('bg', clf1),
        ('rf', clf2),
        ('gb', clf3),
    ],
)
eclf.fit(X_train, y_train)

VotingClassifier(estimators=[('bg', BaggingClassifier(random_state=1)),
                             ('rf', RandomForestClassifier(random_state=1)),
                             ('gb',
                              GradientBoostingClassifier(random_state=1))])

In [None]:
# Score of the voting ensemble on the training split.
eclf.score(X_train, y_train)

1.0

In [None]:
# Score of the voting ensemble on the held-out test split.
eclf.score(X_test, y_test)

0.9814814814814815

# Graded Assignment

## Q1-Q4

### Preprocessing

In [3]:
# Load the cars dataset into a DataFrame.
# NOTE(review): absolute path, presumably a Colab working directory; confirm the
# file location before running in another environment.
data = pd.read_csv('/content/cars_data.csv')

In [4]:
# Count missing values per column.
data.isna().sum()

Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      2
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64

In [5]:
# Dimensions of the raw table, before any cleaning.
data.shape

(428, 15)

In [6]:
# Column dtypes and non-null counts of the raw table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         428 non-null    object 
 1   Model        428 non-null    object 
 2   Type         428 non-null    object 
 3   Origin       428 non-null    object 
 4   DriveTrain   428 non-null    object 
 5   MSRP         428 non-null    object 
 6   Invoice      428 non-null    object 
 7   EngineSize   428 non-null    float64
 8   Cylinders    426 non-null    float64
 9   Horsepower   428 non-null    int64  
 10  MPG_City     428 non-null    int64  
 11  MPG_Highway  428 non-null    int64  
 12  Weight       428 non-null    int64  
 13  Wheelbase    428 non-null    int64  
 14  Length       428 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 50.3+ KB


In [7]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [8]:
# Dimensions after removing rows with missing values.
data.shape

(426, 15)

In [9]:
# Remove the Invoice column. Reassigning (instead of inplace=True) leaves the
# previously bound frame untouched, and errors='ignore' lets this cell be re-run
# without a KeyError once the column is already gone.
data = data.drop(columns='Invoice', errors='ignore')

In [10]:
# Dimensions after dropping the Invoice column.
data.shape

(426, 14)

In [11]:

# Convert MSRP from text to integers by removing the "$" and "," characters.
# regex=False forces literal matching: as a regular expression "$" is the
# end-of-string anchor, and the default value of `regex` has changed across
# pandas releases, so relying on it is version-dependent.
# astype(str) is a no-op on the original text column and keeps the cell
# re-runnable after MSRP has already been converted to int.
data['MSRP'] = (
    data['MSRP']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

  


In [12]:
# Re-check dtypes after cleaning; MSRP should now be an integer column.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 0 to 427
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         426 non-null    object 
 1   Model        426 non-null    object 
 2   Type         426 non-null    object 
 3   Origin       426 non-null    object 
 4   DriveTrain   426 non-null    object 
 5   MSRP         426 non-null    int64  
 6   EngineSize   426 non-null    float64
 7   Cylinders    426 non-null    float64
 8   Horsepower   426 non-null    int64  
 9   MPG_City     426 non-null    int64  
 10  MPG_Highway  426 non-null    int64  
 11  Weight       426 non-null    int64  
 12  Wheelbase    426 non-null    int64  
 13  Length       426 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 49.9+ KB


In [13]:
# Peek at the first five cleaned MSRP values.
data['MSRP'].head(5)

0    36945
1    23820
2    26990
3    33195
4    43755
Name: MSRP, dtype: int64

In [14]:
# One-hot encode the five categorical (object-typed) columns.
data = pd.get_dummies(
    data,
    columns=['Make', 'Model', 'Type', 'Origin', 'DriveTrain'],
)

In [15]:
# Dimensions after one-hot encoding the categorical columns.
data.shape

(426, 482)

In [16]:
# Separate the regression target (MSRP) from the remaining feature columns.
X = data.drop(columns='MSRP')
y = data['MSRP']

### Splitting

In [17]:
# Hold out 30% of the cars for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [18]:
from sklearn.ensemble import  BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [19]:
# Bagging regressor: construct, train on the training split, then display it.
clf1 = BaggingRegressor(random_state=1).fit(X_train, y_train)
clf1

BaggingRegressor(random_state=1)

In [20]:
# Score of the bagging regressor on the training split.
clf1.score(X_train, y_train)

0.9567015457407546

In [21]:
# Score of the bagging regressor on the held-out test split.
clf1.score(X_test,y_test)

0.7949163237522345

In [22]:
# Random-forest regressor: construct, train on the training split, then display it.
clf2 = RandomForestRegressor(random_state=1).fit(X_train, y_train)
clf2

RandomForestRegressor(random_state=1)

In [23]:
# Score of the random-forest regressor on the held-out test split.
clf2.score(X_test, y_test)

0.8369440882741959

In [24]:
# Gradient-boosting regressor: construct, train on the training split, then display it.
clf3 = GradientBoostingRegressor(random_state=1).fit(X_train, y_train)
clf3

GradientBoostingRegressor(random_state=1)

In [25]:
# Score of the gradient-boosting regressor on the held-out test split.
clf3.score(X_test, y_test)

0.8270485242007507

In [26]:
# AdaBoost regressor: construct, train on the training split, then display it.
clf4 = AdaBoostRegressor(random_state=1).fit(X_train, y_train)
clf4

AdaBoostRegressor(random_state=1)

In [27]:
# Score of the AdaBoost regressor on the held-out test split.
clf4.score(X_test, y_test)

0.711903287964231

## Q5-Q7

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Candidate hyper-parameters: 3 ensemble sizes x 3 learning rates (9 combinations).
param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.5, 1, 2],
)

# 4-fold cross-validated search over the grid for a seeded AdaBoost regressor.
ad_gridcv = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=1),
    param_grid=param_grid,
    cv=4,
)

ad_gridcv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=AdaBoostRegressor(random_state=1),
             param_grid={'learning_rate': [0.5, 1, 2],
                         'n_estimators': [100, 500, 1000]})

In [None]:
# Score of the fitted grid search on the training split.
# NOTE(review): the practice section scores its grid search on the test split
# (X_test, y_test); confirm that the training split is what this question asks for.
ad_gridcv.score(X_train, y_train)

0.9084901949117243

In [None]:
# Parameter combination selected by the cross-validated search.
ad_gridcv.best_params_

{'learning_rate': 1, 'n_estimators': 1000}