# FILTER METHODS

In [3]:
import pandas
from pandas import read_csv

filename = 'pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

In [None]:
##Syntax: DataFrame.corr(self, method=’pearson’, min_periods=1)

#Parameters:
#method :
#pearson : standard correlation coefficient
#kendall : Kendall Tau correlation coefficient
#spearman : Spearman rank correlation
#min_periods : Minimum number of observations required per pair of columns to have a valid result. Currently only available for pearson and spearman correlation

#Returns: count :y : DataFrame

In [5]:
# To find the correlation among
# the columns using pearson method
dataframe.corr(method ='pearson')

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
preg,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
plas,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
pres,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
skin,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
test,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
mass,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
pedi,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
class,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [8]:
# To find the correlation among
# the columns using kendall method
dataframe.corr(method ='kendall')

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
preg,1.0,0.091323,0.13544,-0.064401,-0.096417,0.004183,-0.029959,0.458272,0.17037
plas,0.091323,1.0,0.159961,0.039046,0.163645,0.155862,0.061871,0.19651,0.390565
pres,0.13544,0.159961,1.0,0.094868,-0.003682,0.205222,0.019448,0.246056,0.119206
skin,-0.064401,0.039046,0.094868,1.0,0.420066,0.331532,0.126457,-0.044754,0.076297
test,-0.096417,0.163645,-0.003682,0.420066,1.0,0.141587,0.161652,-0.080176,0.058531
mass,0.004183,0.155862,0.205222,0.331532,0.141587,1.0,0.094644,0.088678,0.253676
pedi,-0.029959,0.061871,0.019448,0.126457,0.161652,0.094644,1.0,0.028042,0.143359
age,0.458272,0.19651,0.246056,-0.044754,-0.080176,0.088678,0.028042,1.0,0.257363
class,0.17037,0.390565,0.119206,0.076297,0.058531,0.253676,0.143359,0.257363,1.0


In [10]:
# To find the correlation among
# the columns using kendall method
dataframe.corr(method ='spearman')

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
preg,1.0,0.130734,0.185127,-0.085222,-0.126723,0.000132,-0.043242,0.607216,0.198689
plas,0.130734,1.0,0.235191,0.060022,0.213206,0.231141,0.091293,0.285045,0.475776
pres,0.185127,0.235191,1.0,0.126486,-0.006771,0.29287,0.030046,0.350895,0.142921
skin,-0.085222,0.060022,0.126486,1.0,0.541,0.443615,0.18039,-0.066795,0.089728
test,-0.126723,0.213206,-0.006771,0.541,1.0,0.192726,0.22115,-0.114213,0.066472
mass,0.000132,0.231141,0.29287,0.443615,0.192726,1.0,0.141192,0.131186,0.309707
pedi,-0.043242,0.091293,0.030046,0.18039,0.22115,0.141192,1.0,0.042909,0.175353
age,0.607216,0.285045,0.350895,-0.066795,-0.114213,0.131186,0.042909,1.0,0.30904
class,0.198689,0.475776,0.142921,0.089728,0.066472,0.309707,0.175353,0.30904,1.0


# LDA(Linear discriminant analysis) and PCA (Principal component analysis )

In [11]:
import numpy as np
import pandas as pd

filename = 'pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

In [16]:
X = dataframe.iloc[:, 0:8].values
y = dataframe.iloc[:, 8].values

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### In the script below the LinearDiscriminantAnalysis class is imported as LDA. Like PCA, we have to pass the value for the n_components parameter of the LDA, which refers to the number of linear discriminates that we want to retrieve. In this case we set the n_components to 1, since we first want to check the performance of our classifier with a single linear discriminant. Finally we execute the fit and transform methods to actually retrieve the linear discriminants.

In [23]:
# Perform LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

In [24]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=2, random_state=0)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [34]:
# Evaluating the results
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))

[[100   7]
 [ 20  27]]
Accuracy 0.8246753246753247


In [35]:
import numpy as np
import pandas as pd

filename = 'pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

In [36]:
X = dataframe.iloc[:, 0:8].values
y = dataframe.iloc[:, 8].values

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [38]:
## Let's implement PCA

from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [42]:
explained_variance = pca.explained_variance_ratio_
explained_variance

array([8.91422428e-01, 5.93570017e-02, 2.54509913e-02, 1.31722602e-02,
       7.16860726e-03, 2.90131295e-03, 5.20782357e-04, 6.61594705e-06])

In [40]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [41]:
## Evaluate PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))

[[105   2]
 [ 35  12]]
Accuracy 0.7597402597402597
