# Weka machine learning toolkit

* [Download Weka](https://www.cs.waikato.ac.nz/~ml/weka/)
* [Data mining with Weka video series](https://www.youtube.com/user/WekaMOOC)

# Exercise 6

For this exercise you can use either Python with sklearn or Weka.

* Using the UCI mushroom dataset from the last exercise, perform a feature selection using a classifier evaluator. Which features are most discriminative?
* Use principal components analysis to construct a reduced space. Which combination of features explains the most variance in the dataset?
* Do you see any overlap between the PCA features and those obtained from feature selection?


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import decomposition
import numpy as np

# Load the mushroom data; every column after the first is treated as a feature.
df = pd.read_csv('agaricus-lepiota.csv')
labels = df.columns[1:]

# One-hot encode the categorical features and the 'edibility' target.
x = pd.get_dummies(df[labels])
y = pd.get_dummies(df['edibility'])

print(x.shape)

# Keep the 5 one-hot columns with the highest chi-squared score against the target.
skb = SelectKBest(chi2, k=5).fit(x, y)
x_new = skb.transform(x)

print(x_new.shape)

# Translate the selected column positions back into feature names.
selected_positions = skb.get_support(indices=True)
all_feature_names = np.array(x.columns)
new_features = all_feature_names[selected_positions]
print(new_features)


(8124, 117)
(8124, 5)
['odor_f' 'odor_n' 'gill-color_b' 'stalk-surface-above-ring_k'
 'stalk-surface-below-ring_k']


In [2]:
# Project the one-hot encoded features onto their leading principal components.
N_COMPONENTS = 5   # size of the reduced space
PCA_SEED = 0       # any fixed integer works; it only matters for randomized solvers

print("Original space:",x.shape)
# random_state is pinned because, depending on the scikit-learn version, the
# default svd_solver='auto' may choose the randomized solver for a matrix of
# this size, in which case an unseeded fit can differ slightly between runs.
pca = decomposition.PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
pca.fit(x)
xpca = pca.transform(x)

print("PCA space:",xpca.shape)

Original space: (8124, 117)
PCA space: (8124, 5)


In [3]:
# Tabulate the PCA loadings: one row per principal component, one column per
# one-hot feature. pca.components_ holds the principal axes in feature space,
# so each entry is the weight of a feature in that component (a loading), not
# a correlation coefficient.
# Row labels are derived from the fitted model so the table keeps working if
# the number of components used for the PCA is changed.
pc_names = ['PC-{}'.format(k) for k in range(1, pca.components_.shape[0] + 1)]
comp_correlation = pd.DataFrame(pca.components_, columns=x.columns, index=pc_names)
comp_correlation


Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
PC-1,-0.029239,-4e-06,-0.005119,0.079135,-0.001267,-0.043506,-0.070359,-0.00024,0.040217,0.030381,...,-0.06982,0.173841,-0.062865,-0.075733,-0.052964,0.078021,-0.027595,0.101057,-0.019234,-0.003553
PC-2,-0.007405,-0.000154,0.042847,-0.066132,-0.001192,0.032036,0.105681,-0.000247,-0.146942,0.041508,...,-0.035995,-0.067724,0.145663,0.024108,0.02891,-0.065976,-0.004061,0.036701,-0.008433,-0.011249
PC-3,0.050045,0.000311,-0.064406,-0.005018,0.002253,0.016816,0.00751,0.000251,0.14758,-0.155342,...,0.173549,-0.211641,-0.105853,-0.312478,0.29945,-0.017015,0.021143,-0.014923,0.016183,0.007639
PC-4,-0.094813,-0.000455,0.15531,-0.040607,-0.001064,-0.018371,0.253728,-0.000552,-0.068041,-0.185135,...,-0.022415,-0.023204,0.060855,0.131758,0.105828,-0.039563,-0.083419,-0.048152,-0.032845,-0.033606
PC-5,0.054887,0.0008,0.127355,0.108533,-0.000244,-0.291329,0.041174,0.000537,0.000585,-0.042296,...,-0.107781,-0.142244,0.061506,-0.040967,-0.034713,0.112639,-0.024657,-0.035432,-0.073084,0.096215


In [4]:
# For each principal component, pick the feature whose loading has the largest
# absolute value, i.e. the feature that contributes most to that component.
# The sign is ignored because a strongly negative loading is just as
# influential as a strongly positive one.
dummy_labels = x.columns

# Convert once and take the row-wise argmax instead of re-converting the frame
# on every loop iteration; argmax returns the first position on ties.
loadings = comp_correlation.to_numpy()
top_positions = np.abs(loadings).argmax(axis=1)
relevant_features = dummy_labels[top_positions].tolist()

print(relevant_features)


['ring-type_p', 'spore-print-color_h', 'habitat_d', 'stalk-shape_t', 'odor_n']


In [5]:
# Features found by both dimension reductions: the names in relevant_features
# (from the PCA loadings) that also appear in new_features (from SelectKBest).
# sorted() makes the displayed order stable; iterating a set of strings can
# yield a different order from one kernel session to the next.
sorted(set(relevant_features) & set(new_features))

['odor_n']