# Machine Learning - Homework 7

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import (
    VarianceThreshold,
    mutual_info_classif,
    SequentialFeatureSelector,
    RFE
)
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print a short overview of it and return the DataFrame.

  The overview consists of the DataFrame itself followed by, for every
  non-numeric column, each distinct value together with the number of rows
  that hold it. Missing values are counted as well.

  Args:
    path_to_csv: Anything accepted by `pd.read_csv` as its first argument.

  Returns:
    The DataFrame produced by `pd.read_csv(path_to_csv)`.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]

    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # NaN never compares equal to itself, so `column == NaN` would always
      # report zero rows; missing values need an explicit isna() mask.
      if pd.isna(unique_value):
        matching_rows = column.isna()
      else:
        matching_rows = column == unique_value

      print(f"{unique_value} ({int(matching_rows.sum())})")

  print()

  return dataframe

## For the customer satisfaction data (370 attributes) from https://www.kaggle.com/competitions/santander-customer-satisfaction/overview

### Apply 2 filter methods and 1 wrapper technique with the ML model of your choice

In [None]:
# 'train.csv' is the training file of the Kaggle "Santander Customer
# Satisfaction" competition. The helper prints an overview and returns the
# data as a DataFrame.
X = read_csv_and_analyze_dataset(path_to_csv='train.csv')

# pop() removes the label column from X in place and returns it as a Series.
# NOTE(review): the 'ID' column stays in X and is therefore treated as a
# feature by everything below - presumably it is only a row identifier;
# confirm and consider dropping it.
y = X.pop(item='TARGET')

The dataset:
           ID  var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0           1     2     23                 0.0                      0.0   
1           3     2     34                 0.0                      0.0   
2           4     2     23                 0.0                      0.0   
3           8     2     37                 0.0                    195.0   
4          10     2     39                 0.0                      0.0   
...       ...   ...    ...                 ...                      ...   
76015  151829     2     48                 0.0                      0.0   
76016  151830     2     39                 0.0                      0.0   
76017  151835     2     23                 0.0                      0.0   
76018  151836     2     25                 0.0                      0.0   
76019  151838     2     46                 0.0                      0.0   

       imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  \
0                          

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so that a fresh
# "Restart & Run All" reproduces the same numbers, and stratified on the label
# so that both parts keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Filter method 1: with its default threshold (0.0) VarianceThreshold only
# discards features that have the same value in every training row.
feature_selection = VarianceThreshold()

feature_selection.fit(X_train)

# Boolean mask over the columns of X_train; False marks a discarded feature.
feature_selection.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [None]:
# Filter method 2: estimated mutual information between every feature and the
# label (one non-negative score per column of X_train). The estimator adds a
# small amount of random noise to continuous features, so it is seeded to keep
# the scores identical between runs.
feature_importance = mutual_info_classif(X_train, y_train, random_state=42)

feature_importance

array([7.65932808e-04, 1.38884367e-04, 1.24364455e-02, 0.00000000e+00,
       0.00000000e+00, 5.23994249e-05, 8.10997084e-04, 6.90339175e-04,
       4.92467399e-04, 7.56318234e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.42100516e-04, 0.00000000e+00, 2.27605183e-04,
       1.36331266e-03, 0.00000000e+00, 3.35908417e-04, 0.00000000e+00,
       7.29939179e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.53435457e-02, 1.97413064e-02, 0.00000000e+00, 3.71676720e-04,
       1.31807477e-03, 9.83198696e-04, 4.92127075e-04, 1.43254176e-03,
       0.00000000e+00, 3.06589158e-03, 1.05610679e-03, 1.18760080e-04,
       5.73011529e-04, 0.00000000e+00, 8.50647686e-04, 1.22617101e-03,
       6.71222565e-05, 1.72536779e-03, 0.00000000e+00, 1.03098900e-03,
       5.21671768e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.75442221e-03, 1.90644996e-04, 8.42026763e-04,
       1.40134721e-03, 5.40769890e-04, 0.00000000e+00, 3.14730178e-04,
      

In [None]:
# Baseline: a decision tree trained on every feature. The tree is seeded
# because scikit-learn permutes the candidate features at each split, which
# otherwise makes the fitted tree (and the accuracy) vary between runs.
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train, y_train)

y_predict = decision_tree.predict(X_test)

# Fraction of test rows whose label was predicted correctly.
accuracy_score(y_test, y_predict)

0.9287687450670876

In [None]:
# Wrapper method: greedy forward selection. Starting from no features, each
# step adds the feature that gives the best 5-fold cross-validated score for
# the decision tree, until 3 features are selected.
# Every step refits the tree once per remaining feature and per fold, so the
# candidate evaluations are spread over all CPU cores (n_jobs=-1); this does
# not change which features are selected.
forward_feature_selection = SequentialFeatureSelector(
    decision_tree,
    cv=5,
    n_features_to_select=3,
    direction='forward',
    n_jobs=-1
)

# NOTE(review): X_train is replaced by its 3-feature version here and the
# following cells rely on that. Re-running this cell without re-running the
# train/test split first would therefore start from the reduced matrix.
X_train = forward_feature_selection.fit_transform(X_train, y_train)

# Boolean mask over the original columns; True marks a selected feature.
features = forward_feature_selection.support_

features

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [None]:
# Reduce the test set to the features chosen by the forward selection. The
# result gets its own name instead of overwriting X_test, so this cell can be
# re-run: transforming an already reduced X_test a second time would fail
# because it no longer has the columns the selector was fitted on.
X_test_selected = forward_feature_selection.transform(X_test)

# X_train already holds only the selected features at this point.
decision_tree.fit(X_train, y_train)

y_predict = decision_tree.predict(X_test_selected)

# NOTE(review): if TARGET is heavily imbalanced, accuracy alone says little
# about how well the minority class is recognised - confirm with the class
# counts before drawing conclusions from this number.
accuracy_score(y_test, y_predict)

0.9604709287029729

### For the chosen model, apply a grid search or Latin hypercube sampling (LHS) for the parameter tuning

In [None]:
# Hyper-parameter grid: 2 split criteria x 4 depth limits = 8 candidates.
tree_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 50, 150]
}

# Exhaustive search over the grid, scoring every candidate with 5-fold
# cross-validation. The tree is seeded so that the scores of the candidates
# are reproducible and differ only because of their hyper-parameters.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_params, cv=5)

# X_train holds only the features kept by the forward selection.
clf.fit(X_train, y_train)

# Per-candidate timings, parameters and cross-validation scores.
clf.cv_results_

{'mean_fit_time': array([0.02195859, 0.02482691, 0.0184979 , 0.01529317, 0.01236629,
        0.01490951, 0.01567621, 0.01709371]),
 'std_fit_time': array([0.00305442, 0.0011223 , 0.00485811, 0.00137542, 0.00068196,
        0.00034276, 0.00141082, 0.00235884]),
 'mean_score_time': array([0.00370255, 0.0040195 , 0.00313764, 0.00283575, 0.00235448,
        0.00262089, 0.00269313, 0.00305605]),
 'std_score_time': array([6.94741430e-05, 1.41124063e-04, 6.54499662e-04, 3.84582988e-04,
        6.56105404e-05, 2.91165218e-05, 7.44783007e-05, 6.93634791e-04]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy', 'entropy'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 50, 150, 5, 10, 50, 150],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
    

## Study the method Recursive Feature Elimination and implement it on the Iris data set in R or Python

In [None]:
def transform_ordinal_columns(dataframe, column_names):
  """Replace the given columns with their OrdinalEncoder codes.

  A new OrdinalEncoder is fitted on the selected columns and its output is
  written back into those columns, so the DataFrame passed in is modified.

  Args:
    dataframe: DataFrame that contains the columns to encode.
    column_names: List of the names of the columns to encode.

  Returns:
    The same DataFrame object that was passed in.
  """
  encoder = OrdinalEncoder()
  encoded_values = encoder.fit_transform(dataframe[column_names])

  dataframe[column_names] = encoded_values

  return dataframe

In [None]:
# Load the Iris data and turn the 'variety' strings into numeric codes.
# 'variety' has no real order; the ordinal encoding is only used to obtain a
# numeric 'y' for the multiclass classification problem below.
iris_X = transform_ordinal_columns(
    read_csv_and_analyze_dataset('iris.csv'),
    ['variety']
)

# pop() removes the label column from iris_X in place and returns it.
iris_y = iris_X.pop('variety')

The dataset:
     sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]

All the unique values for all columns that contain discrete values:

variety:
Setosa (50)
Versicolor (50)
Virginica (50)



In [None]:
# Hold out 20% of the Iris rows for testing. The split is seeded for
# reproducibility and stratified so every variety is represented in the same
# proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.2,
    random_state=42,
    stratify=iris_y
)

# Seeded so that the feature importances RFE relies on are reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)

# Recursive Feature Elimination: fit the tree, drop the least important
# feature, and repeat until only 3 features are left.
rfe = RFE(decision_tree, n_features_to_select=3)

# Training data reduced to the selected features.
X_rfe = rfe.fit_transform(X_train, y_train)

# support_ is True for kept features; ranking_ is 1 for kept features and
# grows for features that were eliminated earlier.
print("Selected features:", rfe.support_)
print("Ranking of features:", rfe.ranking_)

# The tree that RFE fitted on the selected features only.
decision_tree = rfe.estimator_

Selected features: [ True False  True  True]
Ranking of features: [1 2 1 1]


## Study Lasso regularization and implement it on the Iris data set in R or Python

In [None]:
# Lasso (L1) regularisation: the L1 penalty can shrink coefficients to exactly
# zero. 'liblinear' is one of the solvers that supports this penalty.
# The explicit multi_class='auto' argument was dropped: it matched the default
# behaviour and the parameter is deprecated in recent scikit-learn releases.
# liblinear shuffles the data internally, so it is seeded for reproducibility.
logistic_regression = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=42
)

# Trained on all four Iris features of the current train/test split.
logistic_regression.fit(X_train, y_train)

y_predict = logistic_regression.predict(X_test)

accuracy_score(y_test, y_predict)

0.9333333333333333