In [1]:
%matplotlib inline

#### Several methods scikit-learn provides for ranking features

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression, mutual_info_regression

#### Load the data

In [2]:
# Directory holding the CSV. Defaults to the original local folder, but can be
# overridden through the WINE_DATA_DIR environment variable so the notebook
# can run on another machine without editing this cell.
data_loc = os.environ.get("WINE_DATA_DIR", "D:\\Data\\Python\\")
filenm = "winequality-red.csv"

# os.path.join copes with data_loc given with or without a trailing separator.
# The file is parsed with ';' as the field delimiter.
df = pd.read_csv(os.path.join(data_loc, filenm), delimiter=';')
cols = df.columns.tolist()      # column names, in file order

#### X holds the features; Y the response

In [3]:
array = df.values

# Split relative to the last column instead of a hard-coded column count:
# every column but the last is a feature, the last column is the response.
# This agrees with the later cells, which index features with range(len(cols)-1).
X = array[:, :-1]
Y = array[:, -1]

#### Run the F-test and mutual information

In [5]:
# Univariate F statistic (and its p-value) of each feature against the response.
F, pval = f_regression(X,Y)
# The mutual information estimate involves random number generation, so pin
# random_state to get the same scores on every Restart & Run All.
mi = mutual_info_regression(X, Y, random_state=0)
# save the results
# zip() stops at the shorter input, so when cols carries one more name than
# there are scores (the response column), that trailing name is simply dropped.
results_F = sorted(zip(F, cols), reverse=True)      # 1st entry has highest F statistic
results_MI = sorted(zip(mi, cols), reverse=True)    # 1st entry has highest mutual information

list

#### Display each feature against quality, with its F statistic in the title

In [6]:
# Start from a clean slate so figures from earlier runs do not pile up.
plt.close('all')

fig = plt.figure(figsize=(10, 10))

# One scatter panel per feature (columns 0..10 of X) laid out on a 4x3 grid;
# the 12th grid slot is left empty.
for idx in range(11):
    ax = fig.add_subplot(4, 3, idx + 1)
    ax.scatter(X[:, idx], Y)
    ax.set_title('{}  F: {:.2f}'.format(cols[idx], F[idx]), fontsize=10)
    ax.set_ylabel('Quality', fontsize=10)

fig.tight_layout()
# Write the figure to disk before showing it.
fig.savefig('tom.png')
plt.show()

#### Now try Chi-squared

In [7]:
# Keep the 4 features with the highest chi-squared score against Y.
# NOTE(review): chi2 is documented for non-negative features (e.g. counts) and
# class labels, and its score depends on each feature's scale -- confirm it is
# an appropriate ranking for these continuous measurements.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)

In [8]:
mask = fit.get_support()                # What fields did it select?
# Positions of the selected features; only the first len(cols)-1 names are
# considered, i.e. the last column name is skipped.
selected = [i for i in range(len(cols) - 1) if mask[i]]

fields = [cols[i] for i in selected]
ChiSq = [fit.scores_[i] for i in selected]
# NOTE: this rebinds pval, which the F-test cell also assigns.
pval = [fit.pvalues_[i] for i in selected]

In [9]:
# Pair each score with its field name and order by score, largest first,
# so the 1st entry has the highest chi-squared statistic.
results_X = sorted(zip(ChiSq, fields), reverse=True)

In [10]:
# Header row, then one row per selected feature (name first, score rounded
# to a whole number).
print("{:<23}{:<20}".format('Attribute', 'Chi-squared'))
for score, name in results_X:
    print('{:<27}{:<15.0f}'.format(name, score))

Attribute              Chi-squared         
total sulfur dioxide       2756           
free sulfur dioxide        162            
alcohol                    46             
volatile acidity           16             


#### Now print the top F-statistic and chi-squared rankings side by side

In [25]:
print("{:<19}{:<23}{:<23}{:<20}".format('Attribute','F-Statistic', 'Attribute', 'Chi-squared'))
# Walk both rankings in step; the table has as many rows as results_X,
# which holds only the chi-squared-selected features.
row_fmt = '{:<22}{:<20.0f}{:<25}{:<15.0f}'
for (f_score, f_name), (x_score, x_name) in zip(results_F, results_X):
    print(row_fmt.format(f_name, f_score, x_name, x_score))

Attribute          F-Statistic            Attribute              Chi-squared         
alcohol               468                 total sulfur dioxide     2756           
volatile acidity      287                 free sulfur dioxide      162            
sulphates             108                 alcohol                  46             
citric acid           86                  volatile acidity         16             
