In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [6]:
train_data = pd.read_csv(r"D:\semester6\Datasets\train.csv")
test_data = pd.read_csv(r"D:\semester6\Datasets\test.csv")

In [7]:
test_data.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [9]:
# Split features and target variable for train and test data
X_train = train_data.drop(columns=['price_range'])
y_train = train_data['price_range']

In [10]:
feature_names = X_train.columns.tolist()
feature_names

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi']

In [11]:
# Initialize a base model
base_model = DecisionTreeClassifier()

# Initialize Sequential Feature Selector
sfs = SFS(base_model,
          k_features=10,
          forward=True,
          floating=False,
          scoring='accuracy')

# Perform SFS
sfs.fit(X_train, y_train)


In [12]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(13,)","[0.655, 0.6625, 0.695, 0.6775, 0.67]",0.672,"(ram,)",0.017646,0.01373,0.006865
2,"(0, 13)","[0.745, 0.7075, 0.7075, 0.7225, 0.75]",0.7265,"(battery_power, ram)",0.023224,0.018069,0.009035
3,"(0, 12, 13)","[0.8475, 0.8425, 0.86, 0.825, 0.85]",0.845,"(battery_power, px_width, ram)",0.014795,0.011511,0.005755
4,"(0, 11, 12, 13)","[0.84, 0.8575, 0.8625, 0.8625, 0.865]",0.8575,"(battery_power, px_height, px_width, ram)",0.011674,0.009083,0.004541
5,"(0, 8, 11, 12, 13)","[0.85, 0.8575, 0.855, 0.8775, 0.8625]",0.8605,"(battery_power, mobile_wt, px_height, px_width...",0.012091,0.009407,0.004704
6,"(0, 8, 11, 12, 13, 17)","[0.8625, 0.8575, 0.86, 0.8525, 0.8725]",0.861,"(battery_power, mobile_wt, px_height, px_width...",0.008526,0.006633,0.003317
7,"(0, 8, 11, 12, 13, 16, 17)","[0.85, 0.87, 0.855, 0.86, 0.8625]",0.8595,"(battery_power, mobile_wt, px_height, px_width...",0.008717,0.006782,0.003391
8,"(0, 8, 11, 12, 13, 16, 17, 18)","[0.8475, 0.88, 0.845, 0.86, 0.8525]",0.857,"(battery_power, mobile_wt, px_height, px_width...",0.016181,0.01259,0.006295
9,"(0, 8, 11, 12, 13, 16, 17, 18, 19)","[0.84, 0.8575, 0.8475, 0.8625, 0.86]",0.8535,"(battery_power, mobile_wt, px_height, px_width...",0.010868,0.008456,0.004228
10,"(0, 3, 8, 11, 12, 13, 16, 17, 18, 19)","[0.86, 0.87, 0.8425, 0.8525, 0.855]",0.856,"(battery_power, dual_sim, mobile_wt, px_height...",0.011603,0.009028,0.004514


In [13]:
# Get maximum average accuracy achieved
max_avg_accuracy = max(sfs.subsets_.values(), key=lambda x: x['avg_score'])

# Print maximum average accuracy
print("Max Average Accuracy:", max_avg_accuracy['avg_score'])

Max Average Accuracy: 0.861


In [14]:
# Get feature subset that gives the maximum average accuracy
max_avg_accuracy_features = list(sfs.subsets_[max(sfs.subsets_.keys(), key=lambda k: sfs.subsets_[k]['avg_score'])]['feature_names'])

# Print features that give the maximum average accuracy
print("Features giving Max Average Accuracy:", max_avg_accuracy_features)

Features giving Max Average Accuracy: ['battery_power', 'mobile_wt', 'px_height', 'px_width', 'ram', 'three_g']


In [15]:
rf = RandomForestClassifier(n_estimators=10, random_state=42)

In [16]:
sfs2 = SFS(rf,
          k_features=10,
          forward=False,
          floating=False,
          scoring='accuracy')

# Perform SFS
sfs2.fit(X_train, y_train)

In [17]:
pd.DataFrame.from_dict(sfs2.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
20,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.79, 0.76, 0.7625, 0.7825, 0.7625]",0.7715,"(battery_power, blue, clock_speed, dual_sim, f...",0.01582,0.012309,0.006154
19,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.8425, 0.805, 0.85, 0.825, 0.8325]",0.831,"(battery_power, blue, clock_speed, dual_sim, f...",0.019974,0.01554,0.00777
18,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.845, 0.8275, 0.845, 0.84, 0.8125]",0.834,"(battery_power, blue, clock_speed, dual_sim, f...",0.016079,0.01251,0.006255
17,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.82, 0.8525, 0.88, 0.8275, 0.8375]",0.8435,"(battery_power, blue, clock_speed, dual_sim, f...",0.027311,0.021249,0.010624
16,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.85, 0.88, 0.8775, 0.8425, 0.87]",0.864,"(battery_power, blue, clock_speed, dual_sim, f...",0.019344,0.01505,0.007525
15,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 17,...","[0.8175, 0.83, 0.845, 0.825, 0.805]",0.8245,"(battery_power, blue, clock_speed, dual_sim, f...",0.017051,0.013266,0.006633
14,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 17)","[0.8475, 0.825, 0.82, 0.86, 0.835]",0.8375,"(battery_power, blue, clock_speed, dual_sim, f...",0.018846,0.014663,0.007331
13,"(0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 17)","[0.8175, 0.845, 0.845, 0.845, 0.8425]",0.839,"(battery_power, blue, clock_speed, dual_sim, f...",0.013873,0.010794,0.005397
12,"(0, 1, 2, 3, 4, 6, 8, 9, 11, 12, 13, 17)","[0.84, 0.8375, 0.8675, 0.845, 0.8325]",0.8445,"(battery_power, blue, clock_speed, dual_sim, f...",0.015663,0.012186,0.006093
11,"(0, 1, 2, 3, 4, 8, 9, 11, 12, 13, 17)","[0.855, 0.8825, 0.8925, 0.855, 0.8325]",0.8635,"(battery_power, blue, clock_speed, dual_sim, f...",0.027611,0.021483,0.010741


In [18]:
# Get maximum average accuracy achieved
max_avg_accuracy = max(sfs2.subsets_.values(), key=lambda x: x['avg_score'])

# Print maximum average accuracy
print("Max Average Accuracy:", max_avg_accuracy['avg_score'])

Max Average Accuracy: 0.8710000000000001


In [20]:
# Get feature subset that gives the maximum average accuracy
max_avg_accuracy_features = list(sfs2.subsets_[max(sfs2.subsets_.keys(), key=lambda k: sfs2.subsets_[k]['avg_score'])]['feature_names'])

# Print features that give the maximum average accuracy
print("Features giving Max Average Accuracy:", max_avg_accuracy_features)

Features giving Max Average Accuracy: ['battery_power', 'blue', 'clock_speed', 'fc', 'mobile_wt', 'n_cores', 'px_height', 'px_width', 'ram', 'three_g']
